From b749ecbab87c477b469dac46550f4ceaa096ca60 Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Mon, 25 Jan 2010 20:28:38 +0000 Subject: [PATCH] This commit fixes trac:2190. Originally the patch was to improve the error message, but when digging into the code I found a subtle bug. If the daemon does not tell the HNP what CRS component it used, then the HNP tries to figure it out from the metadata (this is an uncommon case). The path the HNP used was not complete, so it was unable to find the metadata information. This patch fixes this by adding the 'snapshot_reference' to the 'snapshot_location' which completes the path for this search. cmr:v1.4 (needs a custom patch) cmr:v1.5 This commit was SVN r22479. The following Trac tickets were found above: Ticket 2190 --> https://svn.open-mpi.org/trac/ompi/ticket/2190 --- opal/mca/crs/base/crs_base_fns.c | 10 +++++++-- opal/mca/crs/blcr/crs_blcr_module.c | 5 ++++- opal/mca/crs/self/crs_self_module.c | 5 ++++- opal/tools/opal-restart/help-opal-restart.txt | 8 ++++++- opal/tools/opal-restart/opal-restart.c | 11 ++++++++-- orte/mca/snapc/base/help-orte-snapc-base.txt | 7 +++++- orte/mca/snapc/base/snapc_base_fns.c | 22 ++++++++++++++----- 7 files changed, 55 insertions(+), 13 deletions(-) diff --git a/opal/mca/crs/base/crs_base_fns.c b/opal/mca/crs/base/crs_base_fns.c index f33e31bcca..67e5ecc2c7 100644 --- a/opal/mca/crs/base/crs_base_fns.c +++ b/opal/mca/crs/base/crs_base_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -194,6 +194,7 @@ int opal_crs_base_metadata_write_token(char *snapshot_loc, char * token, char *v int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid) { + int exit_status = OPAL_SUCCESS; char **pid_argv = NULL; char **name_argv = NULL; @@ -202,6 +203,8 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone *prev_pid = atoi(pid_argv[0]); } else { opal_output(0, "Error: expected_component: PID information unavailable!"); + exit_status = OPAL_ERROR; + goto cleanup; } opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_COMP, &name_argv); @@ -209,8 +212,11 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone *component_name = strdup(name_argv[0]); } else { opal_output(0, "Error: expected_component: Component Name information unavailable!"); + exit_status = OPAL_ERROR; + goto cleanup; } + cleanup: if( NULL != pid_argv ) { opal_argv_free(pid_argv); pid_argv = NULL; @@ -221,7 +227,7 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone name_argv = NULL; } - return OPAL_SUCCESS; + return exit_status; } char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name) diff --git a/opal/mca/crs/blcr/crs_blcr_module.c b/opal/mca/crs/blcr/crs_blcr_module.c index bd0495b293..d62a4f2d48 100644 --- a/opal/mca/crs/blcr/crs_blcr_module.c +++ b/opal/mca/crs/blcr/crs_blcr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -873,6 +873,9 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) { */ if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location, &component_name, &prev_pid) ) ) { + opal_output(mca_crs_blcr_component.super.output_handle, + "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.", + snapshot->super.local_location, ret); exit_status = ret; goto cleanup; } diff --git a/opal/mca/crs/self/crs_self_module.c b/opal/mca/crs/self/crs_self_module.c index b9093ffdc1..2838724724 100644 --- a/opal/mca/crs/self/crs_self_module.c +++ b/opal/mca/crs/self/crs_self_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -682,6 +682,9 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) { */ if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location, &component_name, &prev_pid) ) ) { + opal_output(mca_crs_self_component.super.output_handle, + "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.", + snapshot->super.local_location, ret); exit_status = ret; goto cleanup; } diff --git a/opal/tools/opal-restart/help-opal-restart.txt b/opal/tools/opal-restart/help-opal-restart.txt index 1c9482d81d..19efc2ff19 100644 --- a/opal/tools/opal-restart/help-opal-restart.txt +++ b/opal/tools/opal-restart/help-opal-restart.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. # Copyright (c) 2004-2005 The University of Tennessee and The University @@ -31,6 +31,12 @@ Error: The filename (%s) is invalid because either you have not provided a filen or provided an invalid filename. Please see --help for usage. +[invalid_metadata] +Error: The local checkpoint contains invalid or incomplete metadata. + This usually indicates that the original checkpoint was invalid. + Check the metadata file (%s) in the following directory: + %s + [restart_cmd_failure] Error: Unable to obtain the proper restart command to restart from the checkpoint file (%s). Returned %d. diff --git a/opal/tools/opal-restart/opal-restart.c b/opal/tools/opal-restart/opal-restart.c index 1e2809f125..36a00b6214 100644 --- a/opal/tools/opal-restart/opal-restart.c +++ b/opal/tools/opal-restart/opal-restart.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2007 The University of Tennessee and The University @@ -173,7 +173,14 @@ main(int argc, char *argv[]) char * base = NULL; base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename); - opal_crs_base_extract_expected_component(base, &expected_crs_comp, &prev_pid); + if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(base, + &expected_crs_comp, + &prev_pid)) ) { + opal_show_help("help-opal-restart.txt", "invalid_metadata", true, + opal_crs_base_metadata_filename, base); + exit_status = ret; + goto cleanup; + } free(base); } diff --git a/orte/mca/snapc/base/help-orte-snapc-base.txt b/orte/mca/snapc/base/help-orte-snapc-base.txt index fa3e3c4f8e..b186d3b5b4 100644 --- a/orte/mca/snapc/base/help-orte-snapc-base.txt +++ b/orte/mca/snapc/base/help-orte-snapc-base.txt @@ -1,6 +1,6 @@ -*- text -*- # -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. # Copyright (c) 2004-2005 The University of Tennessee and The University @@ -18,3 +18,8 @@ # # This is the US/English general help file for ORTE SNAPC framework. # +[invalid_metadata] +Error: The local checkpoint contains invalid or incomplete metadata for Process %s. + This usually indicates that the local checkpoint is invalid. + Check the metadata file (%s) in the following directory: + %s diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 8184dbd370..febeb4a334 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2009 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. @@ -44,6 +44,7 @@ #include "opal/mca/base/mca_base_param.h" #include "opal/util/os_dirpath.h" #include "opal/util/output.h" +#include "opal/util/show_help.h" #include "opal/util/basename.h" #include "opal/util/argv.h" #include "opal/mca/crs/crs.h" @@ -951,6 +952,7 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, char * meta_data_fname = NULL; char * crs_comp = NULL; char * proc_name = NULL; + char * local_snapshot = NULL; int prev_pid = 0; if( NULL == snapshot_location ) { @@ -979,9 +981,11 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, /* Extract the checkpointer */ if( NULL == crs_agent ) { - if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) { + asprintf(&local_snapshot, "%s/%s", snapshot_location, snapshot_ref); + if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(local_snapshot, &crs_comp, &prev_pid)) ) { + opal_show_help("help-orte-snapc-base.txt", "invalid_metadata", true, + proc_name, opal_crs_base_metadata_filename, local_snapshot); exit_status = ret; - ORTE_ERROR_LOG(ret); goto cleanup; } } else { @@ -995,10 +999,18 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location); cleanup: - if( NULL != meta_data ) + if( NULL != meta_data ) { fclose(meta_data); - if( NULL != meta_data_fname) + meta_data = NULL; + } + if( NULL != meta_data_fname) { free(meta_data_fname); + meta_data_fname = NULL; + } + if( NULL != local_snapshot ) { + free( local_snapshot ); + local_snapshot = NULL; + } return exit_status; }