diff --git a/opal/mca/crs/base/crs_base_fns.c b/opal/mca/crs/base/crs_base_fns.c index f33e31bcca..67e5ecc2c7 100644 --- a/opal/mca/crs/base/crs_base_fns.c +++ b/opal/mca/crs/base/crs_base_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -194,6 +194,7 @@ int opal_crs_base_metadata_write_token(char *snapshot_loc, char * token, char *v int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid) { + int exit_status = OPAL_SUCCESS; char **pid_argv = NULL; char **name_argv = NULL; @@ -202,6 +203,8 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone *prev_pid = atoi(pid_argv[0]); } else { opal_output(0, "Error: expected_component: PID information unavailable!"); + exit_status = OPAL_ERROR; + goto cleanup; } opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_COMP, &name_argv); @@ -209,8 +212,11 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone *component_name = strdup(name_argv[0]); } else { opal_output(0, "Error: expected_component: Component Name information unavailable!"); + exit_status = OPAL_ERROR; + goto cleanup; } + cleanup: if( NULL != pid_argv ) { opal_argv_free(pid_argv); pid_argv = NULL; @@ -221,7 +227,7 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone name_argv = NULL; } - return OPAL_SUCCESS; + return exit_status; } char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name) diff --git a/opal/mca/crs/blcr/crs_blcr_module.c b/opal/mca/crs/blcr/crs_blcr_module.c index bd0495b293..d62a4f2d48 100644 --- a/opal/mca/crs/blcr/crs_blcr_module.c +++ b/opal/mca/crs/blcr/crs_blcr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -873,6 +873,9 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) { */ if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location, &component_name, &prev_pid) ) ) { + opal_output(mca_crs_blcr_component.super.output_handle, + "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.", + snapshot->super.local_location, ret); exit_status = ret; goto cleanup; } diff --git a/opal/mca/crs/self/crs_self_module.c b/opal/mca/crs/self/crs_self_module.c index b9093ffdc1..2838724724 100644 --- a/opal/mca/crs/self/crs_self_module.c +++ b/opal/mca/crs/self/crs_self_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -682,6 +682,9 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) { */ if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location, &component_name, &prev_pid) ) ) { + opal_output(mca_crs_self_component.super.output_handle, + "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.", + snapshot->super.local_location, ret); exit_status = ret; goto cleanup; } diff --git a/opal/tools/opal-restart/help-opal-restart.txt b/opal/tools/opal-restart/help-opal-restart.txt index 1c9482d81d..19efc2ff19 100644 --- a/opal/tools/opal-restart/help-opal-restart.txt +++ b/opal/tools/opal-restart/help-opal-restart.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. # Copyright (c) 2004-2005 The University of Tennessee and The University @@ -31,6 +31,12 @@ Error: The filename (%s) is invalid because either you have not provided a filen or provided an invalid filename. Please see --help for usage. +[invalid_metadata] +Error: The local checkpoint contains invalid or incomplete metadata. + This usually indicates that the original checkpoint was invalid. + Check the metadata file (%s) in the following directory: + %s + [restart_cmd_failure] Error: Unable to obtain the proper restart command to restart from the checkpoint file (%s). Returned %d. diff --git a/opal/tools/opal-restart/opal-restart.c b/opal/tools/opal-restart/opal-restart.c index 1e2809f125..36a00b6214 100644 --- a/opal/tools/opal-restart/opal-restart.c +++ b/opal/tools/opal-restart/opal-restart.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2007 The University of Tennessee and The University @@ -173,7 +173,14 @@ main(int argc, char *argv[]) char * base = NULL; base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename); - opal_crs_base_extract_expected_component(base, &expected_crs_comp, &prev_pid); + if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(base, + &expected_crs_comp, + &prev_pid)) ) { + opal_show_help("help-opal-restart.txt", "invalid_metadata", true, + opal_crs_base_metadata_filename, base); + exit_status = ret; + goto cleanup; + } free(base); } diff --git a/orte/mca/snapc/base/help-orte-snapc-base.txt b/orte/mca/snapc/base/help-orte-snapc-base.txt index fa3e3c4f8e..b186d3b5b4 100644 --- a/orte/mca/snapc/base/help-orte-snapc-base.txt +++ b/orte/mca/snapc/base/help-orte-snapc-base.txt @@ -1,6 +1,6 @@ -*- text -*- # -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. # Copyright (c) 2004-2005 The University of Tennessee and The University @@ -18,3 +18,8 @@ # # This is the US/English general help file for ORTE SNAPC framework. # +[invalid_metadata] +Error: The local checkpoint contains invalid or incomplete metadata for Process %s. + This usually indicates that the local checkpoint is invalid. + Check the metadata file (%s) in the following directory: + %s diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 8184dbd370..febeb4a334 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -44,6 +44,7 @@ #include "opal/mca/base/mca_base_param.h" #include "opal/util/os_dirpath.h" #include "opal/util/output.h" +#include "opal/util/show_help.h" #include "opal/util/basename.h" #include "opal/util/argv.h" #include "opal/mca/crs/crs.h" @@ -951,6 +952,7 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, char * meta_data_fname = NULL; char * crs_comp = NULL; char * proc_name = NULL; + char * local_snapshot = NULL; int prev_pid = 0; if( NULL == snapshot_location ) { @@ -979,9 +981,11 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, /* Extract the checkpointer */ if( NULL == crs_agent ) { - if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) { + asprintf(&local_snapshot, "%s/%s", snapshot_location, snapshot_ref); + if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(local_snapshot, &crs_comp, &prev_pid)) ) { + opal_show_help("help-orte-snapc-base.txt", "invalid_metadata", true, + proc_name, opal_crs_base_metadata_filename, local_snapshot); exit_status = ret; - ORTE_ERROR_LOG(ret); goto cleanup; } } else { @@ -995,10 +999,18 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location); cleanup: - if( NULL != meta_data ) + if( NULL != meta_data ) { fclose(meta_data); - if( NULL != meta_data_fname) + meta_data = NULL; + } + if( NULL != meta_data_fname) { free(meta_data_fname); + meta_data_fname = NULL; + } + if( NULL != local_snapshot ) { + free( local_snapshot ); + local_snapshot = NULL; + } return exit_status; }