This commit fixes trac:2190.
Originally this patch was only meant to improve the error message, but while digging into the code I found a subtle bug. If the daemon does not tell the HNP which CRS component it used, the HNP tries to determine it from the metadata (this is an uncommon case). The path the HNP searched was incomplete, so it was unable to find the metadata information. This patch fixes that by appending the 'snapshot_reference' to the 'snapshot_location', which completes the path for this search. cmr:v1.4 (needs a custom patch), cmr:v1.5. This commit was SVN r22479. The following Trac tickets were found above: Ticket 2190 --> https://svn.open-mpi.org/trac/ompi/ticket/2190
Этот коммит содержится в:
родитель
44497a0567
Коммит
b749ecbab8
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -194,6 +194,7 @@ int opal_crs_base_metadata_write_token(char *snapshot_loc, char * token, char *v
|
|||||||
|
|
||||||
int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid)
|
int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid)
|
||||||
{
|
{
|
||||||
|
int exit_status = OPAL_SUCCESS;
|
||||||
char **pid_argv = NULL;
|
char **pid_argv = NULL;
|
||||||
char **name_argv = NULL;
|
char **name_argv = NULL;
|
||||||
|
|
||||||
@ -202,6 +203,8 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone
|
|||||||
*prev_pid = atoi(pid_argv[0]);
|
*prev_pid = atoi(pid_argv[0]);
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "Error: expected_component: PID information unavailable!");
|
opal_output(0, "Error: expected_component: PID information unavailable!");
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_COMP, &name_argv);
|
opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_COMP, &name_argv);
|
||||||
@ -209,8 +212,11 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone
|
|||||||
*component_name = strdup(name_argv[0]);
|
*component_name = strdup(name_argv[0]);
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "Error: expected_component: Component Name information unavailable!");
|
opal_output(0, "Error: expected_component: Component Name information unavailable!");
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
if( NULL != pid_argv ) {
|
if( NULL != pid_argv ) {
|
||||||
opal_argv_free(pid_argv);
|
opal_argv_free(pid_argv);
|
||||||
pid_argv = NULL;
|
pid_argv = NULL;
|
||||||
@ -221,7 +227,7 @@ int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** compone
|
|||||||
name_argv = NULL;
|
name_argv = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -873,6 +873,9 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
|||||||
*/
|
*/
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
||||||
&component_name, &prev_pid) ) ) {
|
&component_name, &prev_pid) ) ) {
|
||||||
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
|
"crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
|
||||||
|
snapshot->super.local_location, ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -682,6 +682,9 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
|||||||
*/
|
*/
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
||||||
&component_name, &prev_pid) ) ) {
|
&component_name, &prev_pid) ) ) {
|
||||||
|
opal_output(mca_crs_self_component.super.output_handle,
|
||||||
|
"crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
|
||||||
|
snapshot->super.local_location, ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# -*- text -*-
|
# -*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
# University Research and Technology
|
# University Research and Technology
|
||||||
# Corporation. All rights reserved.
|
# Corporation. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -31,6 +31,12 @@ Error: The filename (%s) is invalid because either you have not provided a filen
|
|||||||
or provided an invalid filename.
|
or provided an invalid filename.
|
||||||
Please see --help for usage.
|
Please see --help for usage.
|
||||||
|
|
||||||
|
[invalid_metadata]
|
||||||
|
Error: The local checkpoint contains invalid or incomplete metadata.
|
||||||
|
This usually indicates that the original checkpoint was invalid.
|
||||||
|
Check the metadata file (%s) in the following directory:
|
||||||
|
%s
|
||||||
|
|
||||||
[restart_cmd_failure]
|
[restart_cmd_failure]
|
||||||
Error: Unable to obtain the proper restart command to restart from the
|
Error: Unable to obtain the proper restart command to restart from the
|
||||||
checkpoint file (%s). Returned %d.
|
checkpoint file (%s). Returned %d.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||||
@ -173,7 +173,14 @@ main(int argc, char *argv[])
|
|||||||
char * base = NULL;
|
char * base = NULL;
|
||||||
|
|
||||||
base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename);
|
base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename);
|
||||||
opal_crs_base_extract_expected_component(base, &expected_crs_comp, &prev_pid);
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(base,
|
||||||
|
&expected_crs_comp,
|
||||||
|
&prev_pid)) ) {
|
||||||
|
opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
|
||||||
|
opal_crs_base_metadata_filename, base);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
free(base);
|
free(base);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
-*- text -*-
|
-*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
# University Research and Technology
|
# University Research and Technology
|
||||||
# Corporation. All rights reserved.
|
# Corporation. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -18,3 +18,8 @@
|
|||||||
#
|
#
|
||||||
# This is the US/English general help file for ORTE SNAPC framework.
|
# This is the US/English general help file for ORTE SNAPC framework.
|
||||||
#
|
#
|
||||||
|
[invalid_metadata]
|
||||||
|
Error: The local checkpoint contains invalid or incomplete metadata for Process %s.
|
||||||
|
This usually indicates that the local checkpoint is invalid.
|
||||||
|
Check the metadata file (%s) in the following directory:
|
||||||
|
%s
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -44,6 +44,7 @@
|
|||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
#include "opal/util/os_dirpath.h"
|
#include "opal/util/os_dirpath.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
#include "opal/util/basename.h"
|
#include "opal/util/basename.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/mca/crs/crs.h"
|
#include "opal/mca/crs/crs.h"
|
||||||
@ -951,6 +952,7 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
char * meta_data_fname = NULL;
|
char * meta_data_fname = NULL;
|
||||||
char * crs_comp = NULL;
|
char * crs_comp = NULL;
|
||||||
char * proc_name = NULL;
|
char * proc_name = NULL;
|
||||||
|
char * local_snapshot = NULL;
|
||||||
int prev_pid = 0;
|
int prev_pid = 0;
|
||||||
|
|
||||||
if( NULL == snapshot_location ) {
|
if( NULL == snapshot_location ) {
|
||||||
@ -979,9 +981,11 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
|
|
||||||
/* Extract the checkpointer */
|
/* Extract the checkpointer */
|
||||||
if( NULL == crs_agent ) {
|
if( NULL == crs_agent ) {
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
asprintf(&local_snapshot, "%s/%s", snapshot_location, snapshot_ref);
|
||||||
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(local_snapshot, &crs_comp, &prev_pid)) ) {
|
||||||
|
opal_show_help("help-orte-snapc-base.txt", "invalid_metadata", true,
|
||||||
|
proc_name, opal_crs_base_metadata_filename, local_snapshot);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -995,10 +999,18 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
|
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if( NULL != meta_data )
|
if( NULL != meta_data ) {
|
||||||
fclose(meta_data);
|
fclose(meta_data);
|
||||||
if( NULL != meta_data_fname)
|
meta_data = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != meta_data_fname) {
|
||||||
free(meta_data_fname);
|
free(meta_data_fname);
|
||||||
|
meta_data_fname = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != local_snapshot ) {
|
||||||
|
free( local_snapshot );
|
||||||
|
local_snapshot = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user