A bunch of improvements focused on Snapshot Coordination (SnapC) and File Management (FileM).
* Improved timing in SnapC Full Global Coordinator
* Improved scalability of the SnapC Full protocol
* Minor improvements to the error reporting mechanisms in SnapC and FileM
* Improved the memory usage of the metadata routines - now the owner of the data is more explicit.
* Added a FileM hint to indicate when files stored locally can be moved to/from a globally mounted file system using just the 'cp' command instead of the 'rcp/scp' command. Slightly improves performance, but not too drastically. Can be set using the following SnapC MCA parameter:
  {{{snapc_base_global_shared=1}}}
* Implement the ability to throttle the number of outgoing connections in FileM. At larger scales this type of explicit throttling helps prevent overwhelming the HNP machine. Default: 10, set via MCA parameter:
  {{{filem_rsh_max_outgoing}}}
* Add a few diagnostic/debugging features to SnapC and FileM.

This commit was SVN r21131.
Этот коммит содержится в:
родитель
38aca518bd
Коммит
0deb009225
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -71,7 +71,11 @@ ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
|
|||||||
|
|
||||||
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
|
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
|
||||||
req->local_target = NULL;
|
req->local_target = NULL;
|
||||||
|
req->local_hint = ORTE_FILEM_HINT_NONE;
|
||||||
|
|
||||||
req->remote_target = NULL;
|
req->remote_target = NULL;
|
||||||
|
req->remote_hint = ORTE_FILEM_HINT_NONE;
|
||||||
|
|
||||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -81,11 +85,13 @@ ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t
|
|||||||
free(req->local_target);
|
free(req->local_target);
|
||||||
req->local_target = NULL;
|
req->local_target = NULL;
|
||||||
}
|
}
|
||||||
|
req->local_hint = ORTE_FILEM_HINT_NONE;
|
||||||
|
|
||||||
if( NULL != req->remote_target ) {
|
if( NULL != req->remote_target ) {
|
||||||
free(req->remote_target);
|
free(req->remote_target);
|
||||||
req->remote_target = NULL;
|
req->remote_target = NULL;
|
||||||
}
|
}
|
||||||
|
req->remote_hint = ORTE_FILEM_HINT_NONE;
|
||||||
|
|
||||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -77,6 +77,7 @@ int orte_filem_base_open(void)
|
|||||||
NULL, &str_value);
|
NULL, &str_value);
|
||||||
if( NULL != str_value ) {
|
if( NULL != str_value ) {
|
||||||
free(str_value);
|
free(str_value);
|
||||||
|
str_value = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Open up all available components */
|
/* Open up all available components */
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -54,6 +54,13 @@ extern "C" {
|
|||||||
#define ORTE_FILEM_MOVE_TYPE_RM 2
|
#define ORTE_FILEM_MOVE_TYPE_RM 2
|
||||||
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
|
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hints that describe the local or remote file target for
|
||||||
|
* optimization purposes.
|
||||||
|
*/
|
||||||
|
#define ORTE_FILEM_HINT_NONE 0
|
||||||
|
#define ORTE_FILEM_HINT_SHARED 1
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Define a Process Set
|
* Define a Process Set
|
||||||
*
|
*
|
||||||
@ -92,9 +99,15 @@ struct orte_filem_base_file_set_1_0_0_t {
|
|||||||
/* Local file reference */
|
/* Local file reference */
|
||||||
char * local_target;
|
char * local_target;
|
||||||
|
|
||||||
|
/* Local file reference hints */
|
||||||
|
int local_hint;
|
||||||
|
|
||||||
/* Remove file reference */
|
/* Remove file reference */
|
||||||
char * remote_target;
|
char * remote_target;
|
||||||
|
|
||||||
|
/* Remote file reference hints */
|
||||||
|
int remote_hint;
|
||||||
|
|
||||||
/* Type of file to move */
|
/* Type of file to move */
|
||||||
int target_flag;
|
int target_flag;
|
||||||
};
|
};
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -42,6 +42,7 @@
|
|||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
|
|
||||||
#include "opal/threads/mutex.h"
|
#include "opal/threads/mutex.h"
|
||||||
#include "opal/threads/condition.h"
|
#include "opal/threads/condition.h"
|
||||||
@ -632,41 +633,51 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
|||||||
}
|
}
|
||||||
/* Do not check a local get() operation, to help supress the warnings from the HNP */
|
/* Do not check a local get() operation, to help supress the warnings from the HNP */
|
||||||
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
|
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
|
||||||
|
char *base = NULL;
|
||||||
|
asprintf(&base, "%s/%s", f_set->local_target, opal_basename(f_set->remote_target));
|
||||||
/*
|
/*
|
||||||
* The file should not exist if we are getting a file with the
|
* The file should not exist if we are getting a file with the
|
||||||
* same name since we do not want to overwrite the filename
|
* same name since we do not want to overwrite the filename
|
||||||
* without the users consent.
|
* without the users consent.
|
||||||
*/
|
*/
|
||||||
if( 0 == access(f_set->local_target, R_OK) ) {
|
if( 0 == access(base, R_OK) ) {
|
||||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||||
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n",
|
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination (%s)\n",
|
||||||
ORTE_NAME_PRINT(&p_set->source),
|
ORTE_NAME_PRINT(&p_set->source),
|
||||||
ORTE_NAME_PRINT(&p_set->sink),
|
ORTE_NAME_PRINT(&p_set->sink),
|
||||||
f_set->remote_target,
|
f_set->remote_target,
|
||||||
f_set->local_target));
|
f_set->local_target, base));
|
||||||
orte_show_help("help-orte-filem-rsh.txt",
|
orte_show_help("help-orte-filem-rsh.txt",
|
||||||
"orte-filem-rsh:get-file-exists",
|
"orte-filem-rsh:get-file-exists",
|
||||||
true, f_set->local_target, orte_process_info.nodename);
|
true, f_set->local_target, orte_process_info.nodename);
|
||||||
|
free(base);
|
||||||
|
base = NULL;
|
||||||
request->is_done[cur_index] = true;
|
request->is_done[cur_index] = true;
|
||||||
request->is_active[cur_index] = true;
|
request->is_active[cur_index] = true;
|
||||||
request->exit_status[cur_index] = -1;
|
request->exit_status[cur_index] = -1;
|
||||||
goto continue_set;
|
goto continue_set;
|
||||||
}
|
}
|
||||||
|
free(base);
|
||||||
|
base = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
||||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||||
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
|
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
|
||||||
ORTE_NAME_PRINT(&p_set->source),
|
ORTE_NAME_PRINT(&p_set->source),
|
||||||
ORTE_NAME_PRINT(&p_set->sink),
|
ORTE_NAME_PRINT(&p_set->sink),
|
||||||
|
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||||
f_set->local_target,
|
f_set->local_target,
|
||||||
|
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||||
f_set->remote_target));
|
f_set->remote_target));
|
||||||
} else {
|
} else {
|
||||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||||
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
|
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
|
||||||
ORTE_NAME_PRINT(&p_set->source),
|
ORTE_NAME_PRINT(&p_set->source),
|
||||||
ORTE_NAME_PRINT(&p_set->sink),
|
ORTE_NAME_PRINT(&p_set->sink),
|
||||||
|
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||||
f_set->remote_target,
|
f_set->remote_target,
|
||||||
|
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||||
f_set->local_target));
|
f_set->local_target));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -736,12 +747,20 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
|||||||
* If this is the put() routine
|
* If this is the put() routine
|
||||||
*/
|
*/
|
||||||
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
||||||
|
/* Use a local 'cp' when able */
|
||||||
|
if(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ) {
|
||||||
|
asprintf(&command, "cp %s %s %s ",
|
||||||
|
dir_arg,
|
||||||
|
f_set->local_target,
|
||||||
|
remote_file);
|
||||||
|
} else {
|
||||||
asprintf(&command, "%s %s %s %s:%s ",
|
asprintf(&command, "%s %s %s %s:%s ",
|
||||||
mca_filem_rsh_component.cp_command,
|
mca_filem_rsh_component.cp_command,
|
||||||
dir_arg,
|
dir_arg,
|
||||||
f_set->local_target,
|
f_set->local_target,
|
||||||
remote_machine,
|
remote_machine,
|
||||||
remote_file);
|
remote_file);
|
||||||
|
}
|
||||||
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
||||||
"filem:rsh:put about to execute [%s]", command));
|
"filem:rsh:put about to execute [%s]", command));
|
||||||
|
|
||||||
@ -758,12 +777,22 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
|||||||
* ow it is the get() routine
|
* ow it is the get() routine
|
||||||
*/
|
*/
|
||||||
else {
|
else {
|
||||||
|
/* Use a local 'cp' when able */
|
||||||
|
if(f_set->local_hint == ORTE_FILEM_HINT_SHARED ) {
|
||||||
|
asprintf(&command, "%s %s cp %s %s %s ",
|
||||||
|
mca_filem_rsh_component.remote_sh_command,
|
||||||
|
remote_machine,
|
||||||
|
dir_arg,
|
||||||
|
remote_file,
|
||||||
|
f_set->local_target);
|
||||||
|
} else {
|
||||||
asprintf(&command, "%s %s %s:%s %s ",
|
asprintf(&command, "%s %s %s:%s %s ",
|
||||||
mca_filem_rsh_component.cp_command,
|
mca_filem_rsh_component.cp_command,
|
||||||
dir_arg,
|
dir_arg,
|
||||||
remote_machine,
|
remote_machine,
|
||||||
remote_file,
|
remote_file,
|
||||||
f_set->local_target);
|
f_set->local_target);
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
||||||
"filem:rsh:get about to execute [%s]", command));
|
"filem:rsh:get about to execute [%s]", command));
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -75,8 +75,8 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
|
|||||||
/**
|
/**
|
||||||
* Global Snapshot Object Maintenance functions
|
* Global Snapshot Object Maintenance functions
|
||||||
*/
|
*/
|
||||||
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *obj);
|
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *obj);
|
||||||
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *obj);
|
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *obj);
|
||||||
|
|
||||||
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj);
|
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj);
|
||||||
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj);
|
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj);
|
||||||
@ -132,24 +132,26 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
|
|||||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
||||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
|
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
|
||||||
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
|
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
|
||||||
|
ORTE_DECLSPEC extern bool orte_snapc_base_is_global_dir_shared;
|
||||||
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Some utility functions
|
* Some utility functions
|
||||||
*/
|
*/
|
||||||
ORTE_DECLSPEC char * orte_snapc_ckpt_state_str(size_t state);
|
ORTE_DECLSPEC int orte_snapc_ckpt_state_str(char ** state_str, int state);
|
||||||
|
|
||||||
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
|
ORTE_DECLSPEC int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid);
|
||||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
|
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name);
|
||||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
|
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_global_snapshot_name);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
||||||
bool empty_metadata);
|
bool empty_metadata);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref);
|
ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc,
|
ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc,
|
||||||
char * global_snapshot_ref,
|
char * global_snapshot_ref,
|
||||||
char *snapshot_ref,
|
char *snapshot_ref,
|
||||||
char *snapshot_location);
|
char *snapshot_location,
|
||||||
|
char *crs_agent);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
|
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);
|
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -41,6 +41,7 @@
|
|||||||
|
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
|
||||||
@ -68,27 +69,51 @@ size_t orte_snapc_base_snapshot_seq_number = 0;
|
|||||||
/******************
|
/******************
|
||||||
* Object stuff
|
* Object stuff
|
||||||
******************/
|
******************/
|
||||||
OBJ_CLASS_INSTANCE(orte_snapc_base_snapshot_t,
|
OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
|
||||||
opal_crs_base_snapshot_t,
|
opal_list_item_t,
|
||||||
orte_snapc_base_snapshot_construct,
|
orte_snapc_base_local_snapshot_construct,
|
||||||
orte_snapc_base_snapshot_destruct);
|
orte_snapc_base_local_snapshot_destruct);
|
||||||
|
|
||||||
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *snapshot)
|
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
snapshot->process_name.jobid = 0;
|
snapshot->process_name.jobid = 0;
|
||||||
snapshot->process_name.vpid = 0;
|
snapshot->process_name.vpid = 0;
|
||||||
snapshot->process_pid = 0;
|
|
||||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||||
snapshot->term = false;
|
|
||||||
|
snapshot->reference_name = NULL;
|
||||||
|
snapshot->local_location = NULL;
|
||||||
|
snapshot->remote_location = NULL;
|
||||||
|
|
||||||
|
snapshot->opal_crs = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *snapshot)
|
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
snapshot->process_name.jobid = 0;
|
snapshot->process_name.jobid = 0;
|
||||||
snapshot->process_name.vpid = 0;
|
snapshot->process_name.vpid = 0;
|
||||||
snapshot->process_pid = 0;
|
|
||||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||||
snapshot->term = false;
|
|
||||||
|
if( NULL != snapshot->reference_name ) {
|
||||||
|
free(snapshot->reference_name);
|
||||||
|
snapshot->reference_name = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != snapshot->local_location ) {
|
||||||
|
free(snapshot->local_location);
|
||||||
|
snapshot->local_location = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != snapshot->remote_location ) {
|
||||||
|
free(snapshot->remote_location);
|
||||||
|
snapshot->remote_location = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != snapshot->opal_crs ) {
|
||||||
|
free(snapshot->opal_crs);
|
||||||
|
snapshot->opal_crs = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/****/
|
/****/
|
||||||
@ -99,51 +124,38 @@ OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
|
|||||||
|
|
||||||
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
|
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
OBJ_CONSTRUCT(&(snapshot->snapshots), opal_list_t);
|
char *tmp_dir = NULL;
|
||||||
|
|
||||||
snapshot->component_name = NULL;
|
OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
|
||||||
snapshot->reference_name = orte_snapc_base_unique_global_snapshot_name(getpid());
|
|
||||||
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
|
orte_snapc_base_unique_global_snapshot_name(&(snapshot->reference_name), getpid());
|
||||||
|
|
||||||
|
orte_snapc_base_get_global_snapshot_directory(&tmp_dir, snapshot->reference_name);
|
||||||
|
snapshot->local_location = opal_dirname(tmp_dir);
|
||||||
|
free(tmp_dir);
|
||||||
|
|
||||||
snapshot->seq_num = 0;
|
snapshot->seq_num = 0;
|
||||||
snapshot->start_time = NULL;
|
|
||||||
snapshot->end_time = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
|
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
opal_list_item_t* item = NULL;
|
opal_list_item_t* item = NULL;
|
||||||
|
|
||||||
while (NULL != (item = opal_list_remove_first(&snapshot->snapshots))) {
|
while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&(snapshot->snapshots));
|
OBJ_DESTRUCT(&(snapshot->local_snapshots));
|
||||||
|
|
||||||
if(NULL != snapshot->reference_name) {
|
if(NULL != snapshot->reference_name) {
|
||||||
free(snapshot->reference_name);
|
free(snapshot->reference_name);
|
||||||
snapshot->reference_name = NULL;
|
snapshot->reference_name = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(NULL != snapshot->component_name) {
|
|
||||||
free(snapshot->component_name);
|
|
||||||
snapshot->component_name = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(NULL != snapshot->local_location) {
|
if(NULL != snapshot->local_location) {
|
||||||
free(snapshot->local_location);
|
free(snapshot->local_location);
|
||||||
snapshot->local_location = NULL;
|
snapshot->local_location = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(NULL != snapshot->start_time) {
|
|
||||||
free(snapshot->start_time);
|
|
||||||
snapshot->start_time = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(NULL != snapshot->end_time) {
|
|
||||||
free(snapshot->end_time);
|
|
||||||
snapshot->end_time = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
snapshot->seq_num = 0;
|
snapshot->seq_num = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,6 +210,7 @@ int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
|
|||||||
ORTE_RML_PERSISTENT,
|
ORTE_RML_PERSISTENT,
|
||||||
snapc_none_global_cmdline_request,
|
snapc_none_global_cmdline_request,
|
||||||
NULL))) {
|
NULL))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
exit_status = rc;
|
exit_status = rc;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -238,6 +251,7 @@ static void snapc_none_global_cmdline_request(int status,
|
|||||||
|
|
||||||
n = 1;
|
n = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -251,6 +265,7 @@ static void snapc_none_global_cmdline_request(int status,
|
|||||||
* Do the basic handshake with the orte_checkpoint command
|
* Do the basic handshake with the orte_checkpoint command
|
||||||
*/
|
*/
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) {
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -259,6 +274,7 @@ static void snapc_none_global_cmdline_request(int status,
|
|||||||
* Respond with an invalid response
|
* Respond with an invalid response
|
||||||
*/
|
*/
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -312,6 +328,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -322,6 +339,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -373,6 +391,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -382,6 +401,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -393,6 +413,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -401,6 +422,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -411,6 +433,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
ret, __LINE__);
|
ret, __LINE__);
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -433,42 +456,36 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|||||||
/*****************************
|
/*****************************
|
||||||
* Snapshot metadata functions
|
* Snapshot metadata functions
|
||||||
*****************************/
|
*****************************/
|
||||||
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
|
int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid)
|
||||||
{
|
{
|
||||||
char * uniq_name;
|
|
||||||
|
|
||||||
if( NULL == orte_snapc_base_global_snapshot_ref ) {
|
if( NULL == orte_snapc_base_global_snapshot_ref ) {
|
||||||
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
asprintf(name_str, "ompi_global_snapshot_%d.ckpt", pid);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
|
*name_str = strdup(orte_snapc_base_global_snapshot_ref);
|
||||||
}
|
}
|
||||||
|
|
||||||
return uniq_name;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name)
|
int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name)
|
||||||
{
|
{
|
||||||
char * path = NULL;
|
asprintf(file_name, "%s/%s/%s",
|
||||||
|
|
||||||
asprintf(&path, "%s/%s/%s",
|
|
||||||
orte_snapc_base_global_snapshot_dir,
|
orte_snapc_base_global_snapshot_dir,
|
||||||
uniq_snapshot_name,
|
uniq_snapshot_name,
|
||||||
orte_snapc_base_metadata_filename);
|
orte_snapc_base_metadata_filename);
|
||||||
|
|
||||||
return path;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
|
int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_snapshot_name)
|
||||||
{
|
{
|
||||||
char * dir_name = NULL;
|
asprintf(dir_name, "%s/%s/%d",
|
||||||
|
|
||||||
asprintf(&dir_name, "%s/%s/%d",
|
|
||||||
orte_snapc_base_global_snapshot_dir,
|
orte_snapc_base_global_snapshot_dir,
|
||||||
uniq_snapshot_name,
|
uniq_snapshot_name,
|
||||||
(int)orte_snapc_base_snapshot_seq_number);
|
(int)orte_snapc_base_snapshot_seq_number);
|
||||||
|
|
||||||
return dir_name;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
|
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
|
||||||
@ -482,8 +499,9 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
|||||||
/*
|
/*
|
||||||
* Make the snapshot directory from the uniq_global_snapshot_name
|
* Make the snapshot directory from the uniq_global_snapshot_name
|
||||||
*/
|
*/
|
||||||
dir_name = orte_snapc_base_get_global_snapshot_directory(uniq_global_snapshot_name);
|
orte_snapc_base_get_global_snapshot_directory(&dir_name, uniq_global_snapshot_name);
|
||||||
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) {
|
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -491,13 +509,14 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
|||||||
/*
|
/*
|
||||||
* Initialize the metadata file at the top of that directory.
|
* Initialize the metadata file at the top of that directory.
|
||||||
*/
|
*/
|
||||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(uniq_global_snapshot_name);
|
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, uniq_global_snapshot_name);
|
||||||
|
|
||||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||||
opal_output(orte_snapc_base_output,
|
opal_output(orte_snapc_base_output,
|
||||||
"%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n",
|
"%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
meta_data_fname);
|
meta_data_fname);
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -529,7 +548,7 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
|||||||
if(NULL != meta_data_fname)
|
if(NULL != meta_data_fname)
|
||||||
free(meta_data_fname);
|
free(meta_data_fname);
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -575,13 +594,14 @@ int orte_snapc_base_add_timestamp(char * global_snapshot_ref)
|
|||||||
char * meta_data_fname = NULL;
|
char * meta_data_fname = NULL;
|
||||||
time_t timestamp;
|
time_t timestamp;
|
||||||
|
|
||||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||||
|
|
||||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||||
opal_output(orte_snapc_base_output,
|
opal_output(orte_snapc_base_output,
|
||||||
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
meta_data_fname);
|
meta_data_fname);
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -607,13 +627,14 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
|
|||||||
/* Add the final timestamp */
|
/* Add the final timestamp */
|
||||||
orte_snapc_base_add_timestamp(global_snapshot_ref);
|
orte_snapc_base_add_timestamp(global_snapshot_ref);
|
||||||
|
|
||||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||||
|
|
||||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||||
opal_output(orte_snapc_base_output,
|
opal_output(orte_snapc_base_output,
|
||||||
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
meta_data_fname);
|
meta_data_fname);
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -633,23 +654,28 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
|
|||||||
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
||||||
char * global_snapshot_ref,
|
char * global_snapshot_ref,
|
||||||
char *snapshot_ref,
|
char *snapshot_ref,
|
||||||
char *snapshot_location)
|
char *snapshot_location,
|
||||||
|
char *crs_agent)
|
||||||
{
|
{
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
FILE * meta_data = NULL;
|
FILE * meta_data = NULL;
|
||||||
char * meta_data_fname = NULL;
|
char * meta_data_fname = NULL;
|
||||||
char * crs_comp = NULL;
|
char * crs_comp = NULL;
|
||||||
char * local_dir = NULL;
|
|
||||||
char * proc_name = NULL;
|
char * proc_name = NULL;
|
||||||
int prev_pid = 0;
|
int prev_pid = 0;
|
||||||
|
|
||||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
if( NULL == snapshot_location ) {
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||||
|
|
||||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||||
opal_output(orte_snapc_base_output,
|
opal_output(orte_snapc_base_output,
|
||||||
"%s) base:add_metadata: Error: Unable to open the file (%s)\n",
|
"%s) base:add_metadata: Error: Unable to open the file (%s)\n",
|
||||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||||
meta_data_fname);
|
meta_data_fname);
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -663,20 +689,21 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
orte_util_convert_process_name_to_string(&proc_name, proc);
|
orte_util_convert_process_name_to_string(&proc_name, proc);
|
||||||
|
|
||||||
/* Extract the checkpointer */
|
/* Extract the checkpointer */
|
||||||
|
if( NULL == crs_agent ) {
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ret;
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
/* get the base of the location */
|
crs_comp = strdup(crs_agent);
|
||||||
local_dir = strdup(snapshot_location);
|
}
|
||||||
local_dir = opal_dirname(local_dir);
|
|
||||||
|
|
||||||
/* Write the string */
|
/* Write the string */
|
||||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name);
|
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name);
|
||||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp);
|
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp);
|
||||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref);
|
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref);
|
||||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, local_dir);
|
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if( NULL != meta_data )
|
if( NULL != meta_data )
|
||||||
@ -684,9 +711,6 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
if( NULL != meta_data_fname)
|
if( NULL != meta_data_fname)
|
||||||
free(meta_data_fname);
|
free(meta_data_fname);
|
||||||
|
|
||||||
if( NULL != local_dir)
|
|
||||||
free(local_dir);
|
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -698,13 +722,14 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
|||||||
int next_seq_int;
|
int next_seq_int;
|
||||||
char * token = NULL;
|
char * token = NULL;
|
||||||
char * value = NULL;
|
char * value = NULL;
|
||||||
orte_snapc_base_snapshot_t *vpid_snapshot = NULL;
|
orte_snapc_base_local_snapshot_t *vpid_snapshot = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Open the metadata file
|
* Open the metadata file
|
||||||
*/
|
*/
|
||||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot->reference_name);
|
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot->reference_name);
|
||||||
if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) {
|
if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -742,12 +767,7 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
|
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
|
||||||
if( NULL == global_snapshot->start_time) {
|
;
|
||||||
global_snapshot->start_time = strdup(value);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
global_snapshot->end_time = strdup(value);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
|
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
|
||||||
orte_process_name_t proc;
|
orte_process_name_t proc;
|
||||||
@ -756,29 +776,29 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
|||||||
|
|
||||||
/* Not the first process, so append it to the list */
|
/* Not the first process, so append it to the list */
|
||||||
if( NULL != vpid_snapshot) {
|
if( NULL != vpid_snapshot) {
|
||||||
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
|
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
|
||||||
}
|
}
|
||||||
|
|
||||||
vpid_snapshot = OBJ_NEW(orte_snapc_base_snapshot_t);
|
vpid_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t);
|
||||||
|
|
||||||
vpid_snapshot->process_name.jobid = proc.jobid;
|
vpid_snapshot->process_name.jobid = proc.jobid;
|
||||||
vpid_snapshot->process_name.vpid = proc.vpid;
|
vpid_snapshot->process_name.vpid = proc.vpid;
|
||||||
}
|
}
|
||||||
else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) {
|
else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) {
|
||||||
vpid_snapshot->crs_snapshot_super.component_name = strdup(value);
|
vpid_snapshot->opal_crs = strdup(value);
|
||||||
}
|
}
|
||||||
else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) {
|
else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) {
|
||||||
vpid_snapshot->crs_snapshot_super.reference_name = strdup(value);
|
vpid_snapshot->reference_name = strdup(value);
|
||||||
}
|
}
|
||||||
else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) {
|
else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) {
|
||||||
vpid_snapshot->crs_snapshot_super.local_location = strdup(value);
|
vpid_snapshot->local_location = strdup(value);
|
||||||
vpid_snapshot->crs_snapshot_super.remote_location = strdup(value);
|
vpid_snapshot->remote_location = strdup(value);
|
||||||
}
|
}
|
||||||
} while(0 == feof(meta_data) );
|
} while(0 == feof(meta_data) );
|
||||||
|
|
||||||
/* Append the last item */
|
/* Append the last item */
|
||||||
if( NULL != vpid_snapshot) {
|
if( NULL != vpid_snapshot) {
|
||||||
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
|
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
@ -960,34 +980,40 @@ static int metadata_extract_next_token(FILE *file, char **token, char **value)
|
|||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
char * orte_snapc_ckpt_state_str(size_t state)
|
int orte_snapc_ckpt_state_str(char ** state_str, int state)
|
||||||
{
|
{
|
||||||
switch(state) {
|
switch(state) {
|
||||||
case ORTE_SNAPC_CKPT_STATE_NONE:
|
case ORTE_SNAPC_CKPT_STATE_NONE:
|
||||||
return strdup(" -- ");
|
*state_str = strdup(" -- ");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_REQUEST:
|
case ORTE_SNAPC_CKPT_STATE_REQUEST:
|
||||||
return strdup("Requested");
|
*state_str = strdup("Requested");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_PENDING_TERM:
|
case ORTE_SNAPC_CKPT_STATE_PENDING_TERM:
|
||||||
return strdup("Pending (Termination)");
|
*state_str = strdup("Pending (Termination)");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_PENDING:
|
case ORTE_SNAPC_CKPT_STATE_PENDING:
|
||||||
return strdup("Pending");
|
*state_str = strdup("Pending");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
||||||
return strdup("Running");
|
*state_str = strdup("Running");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
|
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
|
||||||
return strdup("File Transfer");
|
*state_str = strdup("File Transfer");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_FINISHED:
|
case ORTE_SNAPC_CKPT_STATE_FINISHED:
|
||||||
return strdup("Finished");
|
*state_str = strdup("Finished");
|
||||||
|
break;
|
||||||
|
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
|
||||||
|
*state_str = strdup("Locally Finished");
|
||||||
break;
|
break;
|
||||||
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
||||||
return strdup("Error");
|
*state_str = strdup("Error");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
return strdup("Unknown");
|
asprintf(state_str, "Unknown %d", state);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2008 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2008 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -74,6 +74,7 @@ char * orte_snapc_base_global_snapshot_ref = NULL;
|
|||||||
bool orte_snapc_base_store_in_place = true;
|
bool orte_snapc_base_store_in_place = true;
|
||||||
bool orte_snapc_base_store_only_one_seq = false;
|
bool orte_snapc_base_store_only_one_seq = false;
|
||||||
bool orte_snapc_base_establish_global_snapshot_dir = false;
|
bool orte_snapc_base_establish_global_snapshot_dir = false;
|
||||||
|
bool orte_snapc_base_is_global_dir_shared = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function for finding and opening either all MCA components,
|
* Function for finding and opening either all MCA components,
|
||||||
@ -97,9 +98,20 @@ int orte_snapc_base_open(void)
|
|||||||
opal_home_directory(),
|
opal_home_directory(),
|
||||||
&orte_snapc_base_global_snapshot_dir);
|
&orte_snapc_base_global_snapshot_dir);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("snapc",
|
||||||
|
"base_global_shared",
|
||||||
|
"If the global_snapshot_dir is on a shared file system all nodes can access, "
|
||||||
|
"then the checkpoint files can be copied more efficiently when FileM is used."
|
||||||
|
" [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
0,
|
||||||
|
&value);
|
||||||
|
orte_snapc_base_is_global_dir_shared = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
|
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
|
||||||
"snapc:base: open: base_global_snapshot_dir = %s",
|
"snapc:base: open: base_global_snapshot_dir = %s (%s)",
|
||||||
orte_snapc_base_global_snapshot_dir));
|
orte_snapc_base_global_snapshot_dir,
|
||||||
|
(orte_snapc_base_is_global_dir_shared ? "Shared" : "Local") ));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Store the checkpoint files in their final location.
|
* Store the checkpoint files in their final location.
|
||||||
@ -173,8 +185,8 @@ int orte_snapc_base_open(void)
|
|||||||
if( NULL == orte_snapc_base_global_snapshot_loc ) {
|
if( NULL == orte_snapc_base_global_snapshot_loc ) {
|
||||||
char *t1 = NULL;
|
char *t1 = NULL;
|
||||||
char *t2 = NULL;
|
char *t2 = NULL;
|
||||||
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
orte_snapc_base_unique_global_snapshot_name(&t1, getpid() );
|
||||||
t2 = orte_snapc_base_get_global_snapshot_directory( t1 );
|
orte_snapc_base_get_global_snapshot_directory(&t2, t1 );
|
||||||
orte_snapc_base_global_snapshot_loc = strdup(t2);
|
orte_snapc_base_global_snapshot_loc = strdup(t2);
|
||||||
free(t1);
|
free(t1);
|
||||||
free(t2);
|
free(t2);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -30,6 +30,7 @@
|
|||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
#include "opal/event/event.h"
|
#include "opal/event/event.h"
|
||||||
|
|
||||||
|
#include "orte/mca/filem/filem.h"
|
||||||
#include "orte/mca/snapc/snapc.h"
|
#include "orte/mca/snapc/snapc.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
@ -40,34 +41,53 @@ BEGIN_C_DECLS
|
|||||||
typedef uint8_t orte_snapc_full_cmd_flag_t;
|
typedef uint8_t orte_snapc_full_cmd_flag_t;
|
||||||
#define ORTE_SNAPC_FULL_CMD OPAL_UINT8
|
#define ORTE_SNAPC_FULL_CMD OPAL_UINT8
|
||||||
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
|
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
|
||||||
#define ORTE_SNAPC_FULL_UPDATE_PROC_STATE_CMD 2
|
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD 2
|
||||||
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 3
|
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_CMD 3
|
||||||
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 4
|
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_QUICK_CMD 4
|
||||||
|
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 5
|
||||||
|
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 6
|
||||||
|
#define ORTE_SNAPC_FULL_MAX 7
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local Component structures
|
* Local Component structures
|
||||||
*/
|
*/
|
||||||
struct orte_snapc_full_component_t {
|
struct orte_snapc_full_component_t {
|
||||||
orte_snapc_base_component_t super; /** Base SNAPC component */
|
orte_snapc_base_component_t super; /** Base SNAPC component */
|
||||||
|
|
||||||
};
|
};
|
||||||
typedef struct orte_snapc_full_component_t orte_snapc_full_component_t;
|
typedef struct orte_snapc_full_component_t orte_snapc_full_component_t;
|
||||||
OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component;
|
OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component;
|
||||||
|
|
||||||
struct orte_snapc_full_global_snapshot_t {
|
/*
|
||||||
|
* Global Coordinator per orted metadata
|
||||||
|
*/
|
||||||
|
struct orte_snapc_full_orted_snapshot_t {
|
||||||
/** Base SNAPC Global snapshot type */
|
/** Base SNAPC Global snapshot type */
|
||||||
orte_snapc_base_snapshot_t super;
|
orte_snapc_base_global_snapshot_t super;
|
||||||
|
|
||||||
/** Local coordinator associated with this vpid */
|
/** ORTE Process name */
|
||||||
orte_process_name_t local_coord;
|
orte_process_name_t process_name;
|
||||||
|
|
||||||
|
/** State of the checkpoint */
|
||||||
|
int state;
|
||||||
|
|
||||||
|
/** OPAL CRS Component */
|
||||||
|
char * opal_crs;
|
||||||
|
|
||||||
|
/** Term flag */
|
||||||
|
bool term;
|
||||||
|
|
||||||
|
/** FileM request */
|
||||||
|
orte_filem_base_request_t *filem_request;
|
||||||
};
|
};
|
||||||
typedef struct orte_snapc_full_global_snapshot_t orte_snapc_full_global_snapshot_t;
|
typedef struct orte_snapc_full_orted_snapshot_t orte_snapc_full_orted_snapshot_t;
|
||||||
|
OBJ_CLASS_DECLARATION(orte_snapc_full_orted_snapshot_t);
|
||||||
|
|
||||||
OBJ_CLASS_DECLARATION(orte_snapc_full_global_snapshot_t);
|
/*
|
||||||
|
* Local Coordinator per app metadata
|
||||||
struct orte_snapc_full_local_snapshot_t {
|
*/
|
||||||
|
struct orte_snapc_full_app_snapshot_t {
|
||||||
/** Base SNAPC Global snapshot type */
|
/** Base SNAPC Global snapshot type */
|
||||||
orte_snapc_base_snapshot_t super;
|
orte_snapc_base_local_snapshot_t super;
|
||||||
|
|
||||||
/** Named Pipe Read and Write */
|
/** Named Pipe Read and Write */
|
||||||
char * comm_pipe_r;
|
char * comm_pipe_r;
|
||||||
@ -79,14 +99,18 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
|||||||
struct opal_event comm_pipe_r_eh;
|
struct opal_event comm_pipe_r_eh;
|
||||||
bool is_eh_active;
|
bool is_eh_active;
|
||||||
|
|
||||||
/** State of the process wrt checkpointing */
|
/** Process pid */
|
||||||
int ckpt_state;
|
pid_t process_pid;
|
||||||
};
|
|
||||||
typedef struct orte_snapc_full_local_snapshot_t orte_snapc_full_local_snapshot_t;
|
|
||||||
|
|
||||||
OBJ_CLASS_DECLARATION(orte_snapc_full_local_snapshot_t);
|
/** Term */
|
||||||
|
bool term;
|
||||||
|
};
|
||||||
|
typedef struct orte_snapc_full_app_snapshot_t orte_snapc_full_app_snapshot_t;
|
||||||
|
OBJ_CLASS_DECLARATION(orte_snapc_full_app_snapshot_t);
|
||||||
|
|
||||||
extern bool orte_snapc_full_skip_filem;
|
extern bool orte_snapc_full_skip_filem;
|
||||||
|
extern bool orte_snapc_full_skip_app;
|
||||||
|
extern bool orte_snapc_full_timing_enabled;
|
||||||
|
|
||||||
int orte_snapc_full_component_query(mca_base_module_t **module, int *priority);
|
int orte_snapc_full_component_query(mca_base_module_t **module, int *priority);
|
||||||
|
|
||||||
@ -108,12 +132,11 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
|||||||
int global_coord_finalize(void);
|
int global_coord_finalize(void);
|
||||||
int global_coord_setup_job(orte_jobid_t jobid);
|
int global_coord_setup_job(orte_jobid_t jobid);
|
||||||
int global_coord_release_job(orte_jobid_t jobid);
|
int global_coord_release_job(orte_jobid_t jobid);
|
||||||
int global_coord_vpid_assoc_update(orte_process_name_t local_coord,
|
int global_coord_orted_state_update(orte_process_name_t proc_name,
|
||||||
orte_process_name_t proc_name);
|
int proc_ckpt_state,
|
||||||
int global_coord_vpid_state_update(orte_process_name_t proc_name,
|
|
||||||
size_t proc_ckpt_state,
|
|
||||||
char **proc_ckpt_ref,
|
char **proc_ckpt_ref,
|
||||||
char **proc_ckpt_loc);
|
char **proc_ckpt_loc,
|
||||||
|
char **agent_ckpt);
|
||||||
/*
|
/*
|
||||||
* Local Coordinator Functionality
|
* Local Coordinator Functionality
|
||||||
*/
|
*/
|
||||||
@ -122,7 +145,7 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
|||||||
int local_coord_setup_job(orte_jobid_t jobid);
|
int local_coord_setup_job(orte_jobid_t jobid);
|
||||||
int local_coord_release_job(orte_jobid_t jobid);
|
int local_coord_release_job(orte_jobid_t jobid);
|
||||||
int local_coord_job_state_update(orte_jobid_t jobid,
|
int local_coord_job_state_update(orte_jobid_t jobid,
|
||||||
size_t job_ckpt_state,
|
int job_ckpt_state,
|
||||||
char **job_ckpt_ref,
|
char **job_ckpt_ref,
|
||||||
char **job_ckpt_loc);
|
char **job_ckpt_loc);
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -219,7 +219,12 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
|||||||
opal_cr_currently_stalled = false;
|
opal_cr_currently_stalled = false;
|
||||||
|
|
||||||
app_pid = getpid();
|
app_pid = getpid();
|
||||||
|
if( orte_snapc_full_skip_app ) {
|
||||||
|
ret = ORTE_SUCCESS;
|
||||||
|
cr_state = OPAL_CRS_CONTINUE;
|
||||||
|
} else {
|
||||||
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
|
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
|
||||||
|
}
|
||||||
if( OPAL_EXISTS == ret ) {
|
if( OPAL_EXISTS == ret ) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
||||||
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",
|
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -35,6 +35,8 @@ static int snapc_full_open(void);
|
|||||||
static int snapc_full_close(void);
|
static int snapc_full_close(void);
|
||||||
|
|
||||||
bool orte_snapc_full_skip_filem = false;
|
bool orte_snapc_full_skip_filem = false;
|
||||||
|
bool orte_snapc_full_skip_app = false;
|
||||||
|
bool orte_snapc_full_timing_enabled = false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Instantiate the public struct with all of our public information
|
* Instantiate the public struct with all of our public information
|
||||||
@ -113,6 +115,22 @@ static int snapc_full_open(void)
|
|||||||
&value);
|
&value);
|
||||||
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
|
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
||||||
|
"skip_app",
|
||||||
|
"Not for general use! For debugging only! Shortcut app level coord. [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
0,
|
||||||
|
&value);
|
||||||
|
orte_snapc_full_skip_app = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
||||||
|
"enable_timing",
|
||||||
|
"Enable timing information. [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
0,
|
||||||
|
&value);
|
||||||
|
orte_snapc_full_timing_enabled = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Debug Output
|
* Debug Output
|
||||||
*/
|
*/
|
||||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -50,24 +50,24 @@ static orte_snapc_base_module_t loc_module = {
|
|||||||
/*
|
/*
|
||||||
* Global Snapshot structure
|
* Global Snapshot structure
|
||||||
*/
|
*/
|
||||||
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *obj);
|
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *obj);
|
||||||
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *obj);
|
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *obj);
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_snapc_full_global_snapshot_t,
|
OBJ_CLASS_INSTANCE(orte_snapc_full_orted_snapshot_t,
|
||||||
orte_snapc_base_snapshot_t,
|
orte_snapc_base_global_snapshot_t,
|
||||||
orte_snapc_full_global_construct,
|
orte_snapc_full_orted_construct,
|
||||||
orte_snapc_full_global_destruct);
|
orte_snapc_full_orted_destruct);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local Snapshot structure
|
* Local Snapshot structure
|
||||||
*/
|
*/
|
||||||
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj);
|
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj);
|
||||||
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj);
|
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj);
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
|
OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
|
||||||
orte_snapc_base_snapshot_t,
|
orte_snapc_base_local_snapshot_t,
|
||||||
orte_snapc_full_local_construct,
|
orte_snapc_full_app_construct,
|
||||||
orte_snapc_full_local_destruct);
|
orte_snapc_full_app_destruct);
|
||||||
|
|
||||||
/************************************
|
/************************************
|
||||||
* Locally Global vars & functions :)
|
* Locally Global vars & functions :)
|
||||||
@ -77,29 +77,53 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
|
|||||||
/************************
|
/************************
|
||||||
* Function Definitions
|
* Function Definitions
|
||||||
************************/
|
************************/
|
||||||
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *snapshot) {
|
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||||
snapshot->local_coord.vpid = 0;
|
snapshot->process_name.jobid = 0;
|
||||||
snapshot->local_coord.jobid = 0;
|
snapshot->process_name.vpid = 0;
|
||||||
|
|
||||||
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||||
|
|
||||||
|
snapshot->opal_crs = NULL;
|
||||||
|
|
||||||
|
snapshot->term = false;
|
||||||
|
|
||||||
|
snapshot->filem_request = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *snapshot) {
|
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||||
snapshot->local_coord.vpid = 0;
|
snapshot->process_name.jobid = 0;
|
||||||
snapshot->local_coord.jobid = 0;
|
snapshot->process_name.vpid = 0;
|
||||||
|
|
||||||
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||||
|
|
||||||
|
if( NULL != snapshot->opal_crs ) {
|
||||||
|
free( snapshot->opal_crs );
|
||||||
|
snapshot->opal_crs = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot->term = false;
|
||||||
|
|
||||||
|
if( NULL != snapshot->filem_request ) {
|
||||||
|
OBJ_RELEASE(snapshot->filem_request);
|
||||||
|
snapshot->filem_request = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj) {
|
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj) {
|
||||||
obj->comm_pipe_r = NULL;
|
obj->comm_pipe_r = NULL;
|
||||||
obj->comm_pipe_w = NULL;
|
obj->comm_pipe_w = NULL;
|
||||||
|
|
||||||
obj->comm_pipe_r_fd = -1;
|
obj->comm_pipe_r_fd = -1;
|
||||||
obj->comm_pipe_w_fd = -1;
|
obj->comm_pipe_w_fd = -1;
|
||||||
|
|
||||||
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
||||||
|
|
||||||
obj->is_eh_active = false;
|
obj->is_eh_active = false;
|
||||||
|
|
||||||
|
obj->process_pid = 0;
|
||||||
|
|
||||||
|
obj->term = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
|
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj) {
|
||||||
if( NULL != obj->comm_pipe_r ) {
|
if( NULL != obj->comm_pipe_r ) {
|
||||||
free(obj->comm_pipe_r);
|
free(obj->comm_pipe_r);
|
||||||
obj->comm_pipe_r = NULL;
|
obj->comm_pipe_r = NULL;
|
||||||
@ -113,9 +137,11 @@ void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
|
|||||||
obj->comm_pipe_r_fd = -1;
|
obj->comm_pipe_r_fd = -1;
|
||||||
obj->comm_pipe_w_fd = -1;
|
obj->comm_pipe_w_fd = -1;
|
||||||
|
|
||||||
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
||||||
|
|
||||||
obj->is_eh_active = false;
|
obj->is_eh_active = false;
|
||||||
|
|
||||||
|
obj->process_pid = 0;
|
||||||
|
|
||||||
|
obj->term = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -89,45 +89,61 @@ BEGIN_C_DECLS
|
|||||||
/**
|
/**
|
||||||
* States that a process can be in while checkpointing
|
* States that a process can be in while checkpointing
|
||||||
*/
|
*/
|
||||||
/* Doing no checkpoint -- Quiet state */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_NONE 0
|
|
||||||
/* There has been a request for a checkpoint from one of the applications */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_REQUEST 1
|
|
||||||
/* There is a Pending checkpoint for this process */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_PENDING 2
|
|
||||||
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3
|
|
||||||
/* Running the checkpoint */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
|
|
||||||
/* Finished the checkpoint */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
|
|
||||||
/* Finished the checkpoint */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
|
|
||||||
/* Unable to checkpoint this job */
|
|
||||||
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
|
|
||||||
/* Reached an error */
|
/* Reached an error */
|
||||||
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
|
#define ORTE_SNAPC_CKPT_STATE_ERROR 0
|
||||||
|
|
||||||
|
/* Doing no checkpoint -- Quiet state */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_NONE 1
|
||||||
|
/* There has been a request for a checkpoint from one of the applications */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_REQUEST 2
|
||||||
|
/* There is a Pending checkpoint for this process */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_PENDING 3
|
||||||
|
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 4
|
||||||
|
/* Running the checkpoint */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_RUNNING 5
|
||||||
|
/* Finished the checkpoint locally */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL 6
|
||||||
|
/* File Transfer in progress */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 8
|
||||||
|
/* Finished the checkpoint */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_FINISHED 9
|
||||||
|
/* Unable to checkpoint this job */
|
||||||
|
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 10
|
||||||
|
#define ORTE_SNAPC_CKPT_MAX 11
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Definition of an ORTE local snapshot.
|
* Definition of an ORTE local snapshot.
|
||||||
* Similar to the opal_crs_base_snapshot_t except that it
|
* Similar to the opal_crs_base_snapshot_t except that it
|
||||||
* contains process contact information.
|
* contains process contact information.
|
||||||
*/
|
*/
|
||||||
struct orte_snapc_base_snapshot_1_0_0_t {
|
struct orte_snapc_base_local_snapshot_1_0_0_t {
|
||||||
opal_crs_base_snapshot_t crs_snapshot_super;
|
/** List super object */
|
||||||
|
opal_list_item_t super;
|
||||||
|
|
||||||
/** ORTE Process name */
|
/** ORTE Process name */
|
||||||
orte_process_name_t process_name;
|
orte_process_name_t process_name;
|
||||||
/** PID of the application process that generated this snapshot */
|
|
||||||
pid_t process_pid;
|
|
||||||
/** State of the checkpoint */
|
|
||||||
size_t state;
|
|
||||||
/** Terminate this process after a checkpoint */
|
|
||||||
bool term;
|
|
||||||
};
|
|
||||||
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_1_0_0_t;
|
|
||||||
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_t;
|
|
||||||
|
|
||||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_snapshot_t);
|
/** State of the checkpoint */
|
||||||
|
int state;
|
||||||
|
|
||||||
|
/** Unique name of the local snapshot */
|
||||||
|
char * reference_name;
|
||||||
|
|
||||||
|
/** Local location of the local snapshot Absolute path */
|
||||||
|
char * local_location;
|
||||||
|
|
||||||
|
/** Remote location of the local snapshot Absolute path */
|
||||||
|
char * remote_location;
|
||||||
|
|
||||||
|
/** CRS agent */
|
||||||
|
char * opal_crs;
|
||||||
|
|
||||||
|
};
|
||||||
|
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
|
||||||
|
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;
|
||||||
|
|
||||||
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Definition of the global snapshot.
|
* Definition of the global snapshot.
|
||||||
@ -138,11 +154,8 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
|
|||||||
/** This is an object, so must have super */
|
/** This is an object, so must have super */
|
||||||
opal_list_item_t super;
|
opal_list_item_t super;
|
||||||
|
|
||||||
/** A list of orte_snapc_base_snapshot_ts */
|
/** A list of orte_snapc_base_local_snapshot_t's */
|
||||||
opal_list_t snapshots;
|
opal_list_t local_snapshots;
|
||||||
|
|
||||||
/* ORTE SnapC Component used to generate the global snapshot */
|
|
||||||
char * component_name;
|
|
||||||
|
|
||||||
/** Unique name of the global snapshot */
|
/** Unique name of the global snapshot */
|
||||||
char * reference_name;
|
char * reference_name;
|
||||||
@ -152,12 +165,6 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
|
|||||||
|
|
||||||
/** Sequence Number */
|
/** Sequence Number */
|
||||||
int seq_num;
|
int seq_num;
|
||||||
|
|
||||||
/** Beginning timestamp */
|
|
||||||
char * start_time;
|
|
||||||
|
|
||||||
/** Ending timestamp */
|
|
||||||
char * end_time;
|
|
||||||
};
|
};
|
||||||
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
|
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
|
||||||
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;
|
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -76,6 +76,8 @@
|
|||||||
#include "orte/mca/snapc/snapc.h"
|
#include "orte/mca/snapc/snapc.h"
|
||||||
#include "orte/mca/snapc/base/base.h"
|
#include "orte/mca/snapc/base/base.h"
|
||||||
|
|
||||||
|
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* Local Functions
|
* Local Functions
|
||||||
******************/
|
******************/
|
||||||
@ -108,11 +110,16 @@ static int global_sequence_num = 0;
|
|||||||
*****************************************/
|
*****************************************/
|
||||||
static bool listener_started = false;
|
static bool listener_started = false;
|
||||||
|
|
||||||
|
static double timer_start = 0;
|
||||||
|
static double timer_last = 0;
|
||||||
|
static double get_time(void);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
bool help;
|
bool help;
|
||||||
int pid;
|
int pid;
|
||||||
bool term;
|
bool term;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
|
int verbose_level;
|
||||||
orte_jobid_t req_hnp; /**< User Requested HNP */
|
orte_jobid_t req_hnp; /**< User Requested HNP */
|
||||||
bool nowait; /* Do not wait for checkpoint to complete before returning */
|
bool nowait; /* Do not wait for checkpoint to complete before returning */
|
||||||
bool status; /* Display status messages while checkpoint is progressing */
|
bool status; /* Display status messages while checkpoint is progressing */
|
||||||
@ -135,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
|||||||
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Be Verbose" },
|
"Be Verbose" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL,
|
||||||
|
'V', NULL, NULL,
|
||||||
|
1,
|
||||||
|
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Set the verbosity level (For additional debugging information)" },
|
||||||
|
|
||||||
{ NULL, NULL, NULL,
|
{ NULL, NULL, NULL,
|
||||||
'\0', NULL, "term",
|
'\0', NULL, "term",
|
||||||
0,
|
0,
|
||||||
@ -279,6 +292,7 @@ static int parse_args(int argc, char *argv[]) {
|
|||||||
orte_checkpoint_globals.pid = -1;
|
orte_checkpoint_globals.pid = -1;
|
||||||
orte_checkpoint_globals.term = false;
|
orte_checkpoint_globals.term = false;
|
||||||
orte_checkpoint_globals.verbose = false;
|
orte_checkpoint_globals.verbose = false;
|
||||||
|
orte_checkpoint_globals.verbose_level = 0;
|
||||||
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
|
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
|
||||||
orte_checkpoint_globals.nowait = false;
|
orte_checkpoint_globals.nowait = false;
|
||||||
orte_checkpoint_globals.status = false;
|
orte_checkpoint_globals.status = false;
|
||||||
@ -344,6 +358,14 @@ static int parse_args(int argc, char *argv[]) {
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(orte_checkpoint_globals.verbose_level < 0 ) {
|
||||||
|
orte_checkpoint_globals.verbose_level = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(orte_checkpoint_globals.verbose_level > 0) {
|
||||||
|
orte_checkpoint_globals.verbose = true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the user did not supply an hnp jobid, then they must
|
* If the user did not supply an hnp jobid, then they must
|
||||||
* supply the PID of MPIRUN
|
* supply the PID of MPIRUN
|
||||||
@ -474,7 +496,7 @@ static int ckpt_init(int argc, char *argv[]) {
|
|||||||
*/
|
*/
|
||||||
if( orte_checkpoint_globals.verbose ) {
|
if( orte_checkpoint_globals.verbose ) {
|
||||||
orte_checkpoint_globals.output = opal_output_open(NULL);
|
orte_checkpoint_globals.output = opal_output_open(NULL);
|
||||||
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
|
opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level);
|
||||||
} else {
|
} else {
|
||||||
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
||||||
}
|
}
|
||||||
@ -661,6 +683,8 @@ notify_process_for_checkpoint(int term)
|
|||||||
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
|
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
|
||||||
orte_checkpoint_globals.pid);
|
orte_checkpoint_globals.pid);
|
||||||
|
|
||||||
|
timer_start = get_time();
|
||||||
|
|
||||||
/***********************************
|
/***********************************
|
||||||
* Notify HNP of checkpoint request
|
* Notify HNP of checkpoint request
|
||||||
* Send:
|
* Send:
|
||||||
@ -709,18 +733,51 @@ notify_process_for_checkpoint(int term)
|
|||||||
/***************
|
/***************
|
||||||
* Pretty Print
|
* Pretty Print
|
||||||
***************/
|
***************/
|
||||||
|
static double get_time(void) {
|
||||||
|
double wtime;
|
||||||
|
|
||||||
|
#if OPAL_TIMER_USEC_NATIVE
|
||||||
|
wtime = (double)opal_timer_base_get_usec() / 1000000.0;
|
||||||
|
#else
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
wtime = tv.tv_sec;
|
||||||
|
wtime += (double)tv.tv_usec / 1000000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return wtime;
|
||||||
|
}
|
||||||
|
|
||||||
static int pretty_print_status(void) {
|
static int pretty_print_status(void) {
|
||||||
char * state_str = NULL;
|
char * state_str = NULL;
|
||||||
|
double cur_time;
|
||||||
|
|
||||||
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status);
|
cur_time = get_time();
|
||||||
|
|
||||||
|
if( timer_last == 0 ) {
|
||||||
|
timer_last = cur_time;
|
||||||
|
}
|
||||||
|
|
||||||
|
orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);
|
||||||
|
|
||||||
|
if( NULL != global_snapshot_handle ) {
|
||||||
opal_output(0,
|
opal_output(0,
|
||||||
"%*s - Global Snapshot Reference: %s\n",
|
"[%6.2f / %6.2f] %*s - %s\n",
|
||||||
|
(cur_time - timer_last), (cur_time - timer_start),
|
||||||
25, state_str, global_snapshot_handle);
|
25, state_str, global_snapshot_handle);
|
||||||
|
} else {
|
||||||
|
opal_output(0,
|
||||||
|
"[%6.2f / %6.2f] %*s - ...\n",
|
||||||
|
(cur_time - timer_last), (cur_time - timer_start),
|
||||||
|
25, state_str);
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != state_str) {
|
if( NULL != state_str) {
|
||||||
free(state_str);
|
free(state_str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
timer_last = cur_time;
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -491,6 +491,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
|||||||
orte_std_cntr_t i;
|
orte_std_cntr_t i;
|
||||||
char *jobstr;
|
char *jobstr;
|
||||||
orte_jobid_t mask=0x0000ffff;
|
orte_jobid_t mask=0x0000ffff;
|
||||||
|
char * state_str = NULL;
|
||||||
|
|
||||||
for(i=0; i < num_jobs; i++) {
|
for(i=0; i < num_jobs; i++) {
|
||||||
job = jobs[i];
|
job = jobs[i];
|
||||||
@ -513,9 +514,10 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
|||||||
len_slots = 6;
|
len_slots = 6;
|
||||||
len_vpid_r = (int) strlen("Num Procs");
|
len_vpid_r = (int) strlen("Num Procs");
|
||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
len_ckpt_s = (int) (strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) < strlen("Ckpt State") ?
|
orte_snapc_ckpt_state_str(&state_str, job->ckpt_state);
|
||||||
|
len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ?
|
||||||
strlen("Ckpt State") :
|
strlen("Ckpt State") :
|
||||||
strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) );
|
strlen(state_str) );
|
||||||
len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
|
len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
|
||||||
(strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
|
(strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
|
||||||
strlen("Ckpt Ref") :
|
strlen("Ckpt Ref") :
|
||||||
@ -525,6 +527,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
|||||||
strlen("Ckpt Loc") :
|
strlen("Ckpt Loc") :
|
||||||
strlen(job->ckpt_snapshot_loc) ) );
|
strlen(job->ckpt_snapshot_loc) ) );
|
||||||
#else
|
#else
|
||||||
|
state_str = NULL;
|
||||||
len_ckpt_s = -3;
|
len_ckpt_s = -3;
|
||||||
len_ckpt_r = -3;
|
len_ckpt_r = -3;
|
||||||
len_ckpt_l = -3;
|
len_ckpt_l = -3;
|
||||||
@ -564,7 +567,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
|||||||
printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
|
printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
|
||||||
printf("%*d | ", len_vpid_r, job->num_procs);
|
printf("%*d | ", len_vpid_r, job->num_procs);
|
||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(job->ckpt_state));
|
printf("%*s | ", len_ckpt_s, state_str);
|
||||||
printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
|
printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
|
||||||
"" :
|
"" :
|
||||||
job->ckpt_snapshot_ref) );
|
job->ckpt_snapshot_ref) );
|
||||||
@ -597,6 +600,7 @@ static int pretty_print_vpids(orte_job_t *job) {
|
|||||||
orte_proc_t *vpid;
|
orte_proc_t *vpid;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
char *o_proc_name;
|
char *o_proc_name;
|
||||||
|
char *state_str = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Calculate segment lengths
|
* Calculate segment lengths
|
||||||
@ -663,8 +667,9 @@ static int pretty_print_vpids(orte_job_t *job) {
|
|||||||
len_state = strlen(pretty_vpid_state(vpid->state));
|
len_state = strlen(pretty_vpid_state(vpid->state));
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
if( (int)strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)) > len_ckpt_s)
|
orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
|
||||||
len_ckpt_s = strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state));
|
if( (int)strlen(state_str) > len_ckpt_s)
|
||||||
|
len_ckpt_s = strlen(state_str);
|
||||||
|
|
||||||
if( NULL != vpid->ckpt_snapshot_ref &&
|
if( NULL != vpid->ckpt_snapshot_ref &&
|
||||||
(int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
|
(int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
|
||||||
@ -673,6 +678,8 @@ static int pretty_print_vpids(orte_job_t *job) {
|
|||||||
if( NULL != vpid->ckpt_snapshot_loc &&
|
if( NULL != vpid->ckpt_snapshot_loc &&
|
||||||
(int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
|
(int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
|
||||||
len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
|
len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
|
||||||
|
#else
|
||||||
|
state_str = NULL;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -739,7 +746,7 @@ static int pretty_print_vpids(orte_job_t *job) {
|
|||||||
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
|
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(vpid->ckpt_state));
|
printf("%*s | ", len_ckpt_s, state_str);
|
||||||
printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
|
printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
|
||||||
"" :
|
"" :
|
||||||
vpid->ckpt_snapshot_ref));
|
vpid->ckpt_snapshot_ref));
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -151,8 +151,9 @@ int
|
|||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
pid_t child_pid;
|
pid_t child_pid = 0;
|
||||||
orte_snapc_base_global_snapshot_t *snapshot = NULL;
|
orte_snapc_base_global_snapshot_t *snapshot = NULL;
|
||||||
|
char *tmp_str = NULL;
|
||||||
|
|
||||||
/***************
|
/***************
|
||||||
* Initialize
|
* Initialize
|
||||||
@ -164,7 +165,10 @@ main(int argc, char *argv[])
|
|||||||
|
|
||||||
snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t);
|
snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t);
|
||||||
snapshot->reference_name = strdup(orte_restart_globals.filename);
|
snapshot->reference_name = strdup(orte_restart_globals.filename);
|
||||||
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
|
orte_snapc_base_get_global_snapshot_directory(&tmp_str, snapshot->reference_name);
|
||||||
|
snapshot->local_location = opal_dirname(tmp_str);
|
||||||
|
free(tmp_str);
|
||||||
|
tmp_str = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for existence of the file
|
* Check for existence of the file
|
||||||
@ -453,11 +457,11 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(item = opal_list_get_first(&snapshot->snapshots);
|
for(item = opal_list_get_first(&snapshot->local_snapshots);
|
||||||
item != opal_list_get_end(&snapshot->snapshots);
|
item != opal_list_get_end(&snapshot->local_snapshots);
|
||||||
item = opal_list_get_next(item) ) {
|
item = opal_list_get_next(item) ) {
|
||||||
orte_snapc_base_snapshot_t *vpid_snapshot;
|
orte_snapc_base_local_snapshot_t *vpid_snapshot;
|
||||||
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
|
vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
|
||||||
|
|
||||||
fprintf(appfile, "#\n");
|
fprintf(appfile, "#\n");
|
||||||
fprintf(appfile, "# Old Process Name: %u.%u\n",
|
fprintf(appfile, "# Old Process Name: %u.%u\n",
|
||||||
@ -467,13 +471,15 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
|||||||
fprintf(appfile, "-np 1 ");
|
fprintf(appfile, "-np 1 ");
|
||||||
if(orte_restart_globals.preload) {
|
if(orte_restart_globals.preload) {
|
||||||
fprintf(appfile, "--preload-files %s/%s ",
|
fprintf(appfile, "--preload-files %s/%s ",
|
||||||
vpid_snapshot->crs_snapshot_super.local_location,
|
vpid_snapshot->local_location,
|
||||||
vpid_snapshot->crs_snapshot_super.reference_name);
|
vpid_snapshot->reference_name);
|
||||||
fprintf(appfile, "--preload-files-dest-dir . ");
|
fprintf(appfile, "--preload-files-dest-dir . ");
|
||||||
}
|
}
|
||||||
/* JJH: Make this match what the user originally specified on the command line */
|
/* JJH: Make this match what the user originally specified on the command line */
|
||||||
fprintf(appfile, "-am ft-enable-cr ");
|
fprintf(appfile, "-am ft-enable-cr ");
|
||||||
|
|
||||||
fprintf(appfile, " opal-restart ");
|
fprintf(appfile, " opal-restart ");
|
||||||
|
|
||||||
/* JJH: Make sure this changes if ever the default location of the local file is changed,
|
/* JJH: Make sure this changes if ever the default location of the local file is changed,
|
||||||
* currently it is safe to assume that it is in the current working directory.
|
* currently it is safe to assume that it is in the current working directory.
|
||||||
*
|
*
|
||||||
@ -486,9 +492,9 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
|||||||
else {
|
else {
|
||||||
/* If we are *not* preloading the files, then point to the original checkpoint
|
/* If we are *not* preloading the files, then point to the original checkpoint
|
||||||
* directory to access the checkpoint files. */
|
* directory to access the checkpoint files. */
|
||||||
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->crs_snapshot_super.local_location);
|
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->local_location);
|
||||||
}
|
}
|
||||||
fprintf(appfile, "%s\n", vpid_snapshot->crs_snapshot_super.reference_name);
|
fprintf(appfile, "%s\n", vpid_snapshot->reference_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user