A bunch of improvements focused on Snapshot Coordination (SnapC) and File Management (FileM).
* Improved timing in the SnapC Full Global Coordinator.
* Improved scalability of the SnapC Full protocol.
* Minor improvements to the error reporting mechanisms in SnapC and FileM.
* Improved the memory usage of the metadata routines - now the owner of the data is more explicit.
* Added a FileM hint to indicate when files stored locally can be moved to/from a globally mounted file system using just the 'cp' command instead of the 'rcp/scp' command. This slightly improves performance, but not too drastically. It can be set using the following SnapC MCA parameter: {{{snapc_base_global_shared=1}}}
* Implemented the ability to throttle the number of outgoing connections in FileM. At larger scales this type of explicit throttling helps prevent overwhelming the HNP machine. Default: 10, set via the MCA parameter: {{{filem_rsh_max_outgoing}}}
* Added a few diagnostic/debugging features to SnapC and FileM.

This commit was SVN r21131.
Этот коммит содержится в:
родитель
38aca518bd
Коммит
0deb009225
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -71,7 +71,11 @@ ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
|
||||
|
||||
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
|
||||
req->local_target = NULL;
|
||||
req->local_hint = ORTE_FILEM_HINT_NONE;
|
||||
|
||||
req->remote_target = NULL;
|
||||
req->remote_hint = ORTE_FILEM_HINT_NONE;
|
||||
|
||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
|
||||
}
|
||||
@ -81,11 +85,13 @@ ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t
|
||||
free(req->local_target);
|
||||
req->local_target = NULL;
|
||||
}
|
||||
req->local_hint = ORTE_FILEM_HINT_NONE;
|
||||
|
||||
if( NULL != req->remote_target ) {
|
||||
free(req->remote_target);
|
||||
req->remote_target = NULL;
|
||||
}
|
||||
req->remote_hint = ORTE_FILEM_HINT_NONE;
|
||||
|
||||
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -77,6 +77,7 @@ int orte_filem_base_open(void)
|
||||
NULL, &str_value);
|
||||
if( NULL != str_value ) {
|
||||
free(str_value);
|
||||
str_value = NULL;
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -54,6 +54,13 @@ extern "C" {
|
||||
#define ORTE_FILEM_MOVE_TYPE_RM 2
|
||||
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
|
||||
|
||||
/**
|
||||
* Hints that describe the local or remote file target for
|
||||
* optimization purposes.
|
||||
*/
|
||||
#define ORTE_FILEM_HINT_NONE 0
|
||||
#define ORTE_FILEM_HINT_SHARED 1
|
||||
|
||||
/**
|
||||
* Define a Process Set
|
||||
*
|
||||
@ -92,9 +99,15 @@ struct orte_filem_base_file_set_1_0_0_t {
|
||||
/* Local file reference */
|
||||
char * local_target;
|
||||
|
||||
/* Local file reference hints */
|
||||
int local_hint;
|
||||
|
||||
/* Remote file reference */
|
||||
char * remote_target;
|
||||
|
||||
/* Remote file reference hints */
|
||||
int remote_hint;
|
||||
|
||||
/* Type of file to move */
|
||||
int target_flag;
|
||||
};
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -37,7 +37,7 @@ static int filem_rsh_open(void);
|
||||
static int filem_rsh_close(void);
|
||||
|
||||
int orte_filem_rsh_max_incomming = 10;
|
||||
int orte_filem_rsh_max_outgoing = 10;
|
||||
int orte_filem_rsh_max_outgoing = 10;
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -42,6 +42,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/threads/condition.h"
|
||||
@ -632,41 +633,51 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
||||
}
|
||||
/* Do not check a local get() operation, to help suppress the warnings from the HNP */
|
||||
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
|
||||
char *base = NULL;
|
||||
asprintf(&base, "%s/%s", f_set->local_target, opal_basename(f_set->remote_target));
|
||||
/*
|
||||
* The file should not exist if we are getting a file with the
|
||||
* same name since we do not want to overwrite the filename
|
||||
* without the user's consent.
|
||||
*/
|
||||
if( 0 == access(f_set->local_target, R_OK) ) {
|
||||
if( 0 == access(base, R_OK) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n",
|
||||
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination (%s)\n",
|
||||
ORTE_NAME_PRINT(&p_set->source),
|
||||
ORTE_NAME_PRINT(&p_set->sink),
|
||||
f_set->remote_target,
|
||||
f_set->local_target));
|
||||
f_set->local_target, base));
|
||||
orte_show_help("help-orte-filem-rsh.txt",
|
||||
"orte-filem-rsh:get-file-exists",
|
||||
true, f_set->local_target, orte_process_info.nodename);
|
||||
free(base);
|
||||
base = NULL;
|
||||
request->is_done[cur_index] = true;
|
||||
request->is_active[cur_index] = true;
|
||||
request->exit_status[cur_index] = -1;
|
||||
goto continue_set;
|
||||
}
|
||||
free(base);
|
||||
base = NULL;
|
||||
}
|
||||
|
||||
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
|
||||
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
|
||||
ORTE_NAME_PRINT(&p_set->source),
|
||||
ORTE_NAME_PRINT(&p_set->sink),
|
||||
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||
f_set->local_target,
|
||||
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||
f_set->remote_target));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
|
||||
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
|
||||
ORTE_NAME_PRINT(&p_set->source),
|
||||
ORTE_NAME_PRINT(&p_set->sink),
|
||||
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||
f_set->remote_target,
|
||||
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
|
||||
f_set->local_target));
|
||||
}
|
||||
|
||||
@ -736,12 +747,20 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
||||
* If this is the put() routine
|
||||
*/
|
||||
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
|
||||
asprintf(&command, "%s %s %s %s:%s ",
|
||||
mca_filem_rsh_component.cp_command,
|
||||
dir_arg,
|
||||
f_set->local_target,
|
||||
remote_machine,
|
||||
remote_file);
|
||||
/* Use a local 'cp' when able */
|
||||
if(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ) {
|
||||
asprintf(&command, "cp %s %s %s ",
|
||||
dir_arg,
|
||||
f_set->local_target,
|
||||
remote_file);
|
||||
} else {
|
||||
asprintf(&command, "%s %s %s %s:%s ",
|
||||
mca_filem_rsh_component.cp_command,
|
||||
dir_arg,
|
||||
f_set->local_target,
|
||||
remote_machine,
|
||||
remote_file);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh:put about to execute [%s]", command));
|
||||
|
||||
@ -758,13 +777,23 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
|
||||
* Otherwise it is the get() routine
|
||||
*/
|
||||
else {
|
||||
asprintf(&command, "%s %s %s:%s %s ",
|
||||
mca_filem_rsh_component.cp_command,
|
||||
dir_arg,
|
||||
remote_machine,
|
||||
remote_file,
|
||||
f_set->local_target);
|
||||
|
||||
/* Use a local 'cp' when able */
|
||||
if(f_set->local_hint == ORTE_FILEM_HINT_SHARED ) {
|
||||
asprintf(&command, "%s %s cp %s %s %s ",
|
||||
mca_filem_rsh_component.remote_sh_command,
|
||||
remote_machine,
|
||||
dir_arg,
|
||||
remote_file,
|
||||
f_set->local_target);
|
||||
} else {
|
||||
asprintf(&command, "%s %s %s:%s %s ",
|
||||
mca_filem_rsh_component.cp_command,
|
||||
dir_arg,
|
||||
remote_machine,
|
||||
remote_file,
|
||||
f_set->local_target);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh:get about to execute [%s]", command));
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -75,8 +75,8 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
|
||||
/**
|
||||
* Global Snapshot Object Maintenance functions
|
||||
*/
|
||||
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *obj);
|
||||
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *obj);
|
||||
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *obj);
|
||||
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *obj);
|
||||
|
||||
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj);
|
||||
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj);
|
||||
@ -132,24 +132,26 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_is_global_dir_shared;
|
||||
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
||||
|
||||
|
||||
/**
|
||||
* Some utility functions
|
||||
*/
|
||||
ORTE_DECLSPEC char * orte_snapc_ckpt_state_str(size_t state);
|
||||
ORTE_DECLSPEC int orte_snapc_ckpt_state_str(char ** state_str, int state);
|
||||
|
||||
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
|
||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
|
||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
|
||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
||||
ORTE_DECLSPEC int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid);
|
||||
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name);
|
||||
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_global_snapshot_name);
|
||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
||||
bool empty_metadata);
|
||||
ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref);
|
||||
ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc,
|
||||
char * global_snapshot_ref,
|
||||
char *snapshot_ref,
|
||||
char *snapshot_location);
|
||||
char *snapshot_location,
|
||||
char *crs_agent);
|
||||
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
|
||||
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -41,6 +41,7 @@
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
@ -68,27 +69,51 @@ size_t orte_snapc_base_snapshot_seq_number = 0;
|
||||
/******************
|
||||
* Object stuff
|
||||
******************/
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_base_snapshot_t,
|
||||
opal_crs_base_snapshot_t,
|
||||
orte_snapc_base_snapshot_construct,
|
||||
orte_snapc_base_snapshot_destruct);
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
|
||||
opal_list_item_t,
|
||||
orte_snapc_base_local_snapshot_construct,
|
||||
orte_snapc_base_local_snapshot_destruct);
|
||||
|
||||
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *snapshot)
|
||||
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_pid = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
snapshot->term = false;
|
||||
|
||||
snapshot->reference_name = NULL;
|
||||
snapshot->local_location = NULL;
|
||||
snapshot->remote_location = NULL;
|
||||
|
||||
snapshot->opal_crs = NULL;
|
||||
}
|
||||
|
||||
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *snapshot)
|
||||
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_pid = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
snapshot->term = false;
|
||||
|
||||
if( NULL != snapshot->reference_name ) {
|
||||
free(snapshot->reference_name);
|
||||
snapshot->reference_name = NULL;
|
||||
}
|
||||
|
||||
if( NULL != snapshot->local_location ) {
|
||||
free(snapshot->local_location);
|
||||
snapshot->local_location = NULL;
|
||||
}
|
||||
|
||||
if( NULL != snapshot->remote_location ) {
|
||||
free(snapshot->remote_location);
|
||||
snapshot->remote_location = NULL;
|
||||
}
|
||||
|
||||
if( NULL != snapshot->opal_crs ) {
|
||||
free(snapshot->opal_crs);
|
||||
snapshot->opal_crs = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/****/
|
||||
@ -99,51 +124,38 @@ OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
|
||||
|
||||
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
|
||||
{
|
||||
OBJ_CONSTRUCT(&(snapshot->snapshots), opal_list_t);
|
||||
char *tmp_dir = NULL;
|
||||
|
||||
snapshot->component_name = NULL;
|
||||
snapshot->reference_name = orte_snapc_base_unique_global_snapshot_name(getpid());
|
||||
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
|
||||
OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
|
||||
|
||||
orte_snapc_base_unique_global_snapshot_name(&(snapshot->reference_name), getpid());
|
||||
|
||||
orte_snapc_base_get_global_snapshot_directory(&tmp_dir, snapshot->reference_name);
|
||||
snapshot->local_location = opal_dirname(tmp_dir);
|
||||
free(tmp_dir);
|
||||
|
||||
snapshot->seq_num = 0;
|
||||
snapshot->start_time = NULL;
|
||||
snapshot->end_time = NULL;
|
||||
}
|
||||
|
||||
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
|
||||
{
|
||||
opal_list_item_t* item = NULL;
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&snapshot->snapshots))) {
|
||||
while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&(snapshot->snapshots));
|
||||
OBJ_DESTRUCT(&(snapshot->local_snapshots));
|
||||
|
||||
if(NULL != snapshot->reference_name) {
|
||||
free(snapshot->reference_name);
|
||||
snapshot->reference_name = NULL;
|
||||
}
|
||||
|
||||
if(NULL != snapshot->component_name) {
|
||||
free(snapshot->component_name);
|
||||
snapshot->component_name = NULL;
|
||||
}
|
||||
|
||||
if(NULL != snapshot->local_location) {
|
||||
free(snapshot->local_location);
|
||||
snapshot->local_location = NULL;
|
||||
}
|
||||
|
||||
if(NULL != snapshot->start_time) {
|
||||
free(snapshot->start_time);
|
||||
snapshot->start_time = NULL;
|
||||
}
|
||||
|
||||
if(NULL != snapshot->end_time) {
|
||||
free(snapshot->end_time);
|
||||
snapshot->end_time = NULL;
|
||||
}
|
||||
|
||||
snapshot->seq_num = 0;
|
||||
}
|
||||
|
||||
@ -198,6 +210,7 @@ int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
|
||||
ORTE_RML_PERSISTENT,
|
||||
snapc_none_global_cmdline_request,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
exit_status = rc;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -238,6 +251,7 @@ static void snapc_none_global_cmdline_request(int status,
|
||||
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -251,6 +265,7 @@ static void snapc_none_global_cmdline_request(int status,
|
||||
* Do the basic handshake with the orte_checkpoint command
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -259,6 +274,7 @@ static void snapc_none_global_cmdline_request(int status,
|
||||
* Respond with an invalid response
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -312,6 +328,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -322,6 +339,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -373,6 +391,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -382,6 +401,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -393,6 +413,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -401,6 +422,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -411,6 +433,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
ret, __LINE__);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -433,42 +456,36 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
/*****************************
|
||||
* Snapshot metadata functions
|
||||
*****************************/
|
||||
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
|
||||
int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid)
|
||||
{
|
||||
char * uniq_name;
|
||||
|
||||
if( NULL == orte_snapc_base_global_snapshot_ref ) {
|
||||
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
||||
asprintf(name_str, "ompi_global_snapshot_%d.ckpt", pid);
|
||||
}
|
||||
else {
|
||||
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
|
||||
*name_str = strdup(orte_snapc_base_global_snapshot_ref);
|
||||
}
|
||||
|
||||
return uniq_name;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name)
|
||||
int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name)
|
||||
{
|
||||
char * path = NULL;
|
||||
|
||||
asprintf(&path, "%s/%s/%s",
|
||||
asprintf(file_name, "%s/%s/%s",
|
||||
orte_snapc_base_global_snapshot_dir,
|
||||
uniq_snapshot_name,
|
||||
orte_snapc_base_metadata_filename);
|
||||
|
||||
return path;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
|
||||
int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_snapshot_name)
|
||||
{
|
||||
char * dir_name = NULL;
|
||||
|
||||
asprintf(&dir_name, "%s/%s/%d",
|
||||
asprintf(dir_name, "%s/%s/%d",
|
||||
orte_snapc_base_global_snapshot_dir,
|
||||
uniq_snapshot_name,
|
||||
(int)orte_snapc_base_snapshot_seq_number);
|
||||
|
||||
return dir_name;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
|
||||
@ -482,8 +499,9 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
||||
/*
|
||||
* Make the snapshot directory from the uniq_global_snapshot_name
|
||||
*/
|
||||
dir_name = orte_snapc_base_get_global_snapshot_directory(uniq_global_snapshot_name);
|
||||
orte_snapc_base_get_global_snapshot_directory(&dir_name, uniq_global_snapshot_name);
|
||||
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -491,13 +509,14 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
||||
/*
|
||||
* Initialize the metadata file at the top of that directory.
|
||||
*/
|
||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(uniq_global_snapshot_name);
|
||||
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, uniq_global_snapshot_name);
|
||||
|
||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
meta_data_fname);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -529,7 +548,7 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
||||
if(NULL != meta_data_fname)
|
||||
free(meta_data_fname);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -575,13 +594,14 @@ int orte_snapc_base_add_timestamp(char * global_snapshot_ref)
|
||||
char * meta_data_fname = NULL;
|
||||
time_t timestamp;
|
||||
|
||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
||||
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||
|
||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
meta_data_fname);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -607,13 +627,14 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
|
||||
/* Add the final timestamp */
|
||||
orte_snapc_base_add_timestamp(global_snapshot_ref);
|
||||
|
||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
||||
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||
|
||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
meta_data_fname);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -633,23 +654,28 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
|
||||
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
||||
char * global_snapshot_ref,
|
||||
char *snapshot_ref,
|
||||
char *snapshot_location)
|
||||
char *snapshot_location,
|
||||
char *crs_agent)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
FILE * meta_data = NULL;
|
||||
char * meta_data_fname = NULL;
|
||||
char * crs_comp = NULL;
|
||||
char * local_dir = NULL;
|
||||
char * proc_name = NULL;
|
||||
int prev_pid = 0;
|
||||
|
||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
|
||||
if( NULL == snapshot_location ) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
|
||||
|
||||
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:add_metadata: Error: Unable to open the file (%s)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
||||
meta_data_fname);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -663,20 +689,21 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
||||
orte_util_convert_process_name_to_string(&proc_name, proc);
|
||||
|
||||
/* Extract the checkpointer */
|
||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
if( NULL == crs_agent ) {
|
||||
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
||||
exit_status = ret;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
crs_comp = strdup(crs_agent);
|
||||
}
|
||||
|
||||
/* get the base of the location */
|
||||
local_dir = strdup(snapshot_location);
|
||||
local_dir = opal_dirname(local_dir);
|
||||
|
||||
/* Write the string */
|
||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name);
|
||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp);
|
||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref);
|
||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, local_dir);
|
||||
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
|
||||
|
||||
cleanup:
|
||||
if( NULL != meta_data )
|
||||
@ -684,9 +711,6 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
||||
if( NULL != meta_data_fname)
|
||||
free(meta_data_fname);
|
||||
|
||||
if( NULL != local_dir)
|
||||
free(local_dir);
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
@ -698,13 +722,14 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
||||
int next_seq_int;
|
||||
char * token = NULL;
|
||||
char * value = NULL;
|
||||
orte_snapc_base_snapshot_t *vpid_snapshot = NULL;
|
||||
orte_snapc_base_local_snapshot_t *vpid_snapshot = NULL;
|
||||
|
||||
/*
|
||||
* Open the metadata file
|
||||
*/
|
||||
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot->reference_name);
|
||||
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot->reference_name);
|
||||
if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -742,12 +767,7 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
||||
break;
|
||||
}
|
||||
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
|
||||
if( NULL == global_snapshot->start_time) {
|
||||
global_snapshot->start_time = strdup(value);
|
||||
}
|
||||
else {
|
||||
global_snapshot->end_time = strdup(value);
|
||||
}
|
||||
;
|
||||
}
|
||||
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
|
||||
orte_process_name_t proc;
|
||||
@ -756,29 +776,29 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
|
||||
|
||||
/* Not the first process, so append it to the list */
|
||||
if( NULL != vpid_snapshot) {
|
||||
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
|
||||
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
|
||||
}
|
||||
|
||||
vpid_snapshot = OBJ_NEW(orte_snapc_base_snapshot_t);
|
||||
vpid_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t);
|
||||
|
||||
vpid_snapshot->process_name.jobid = proc.jobid;
|
||||
vpid_snapshot->process_name.vpid = proc.vpid;
|
||||
}
|
||||
else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) {
|
||||
vpid_snapshot->crs_snapshot_super.component_name = strdup(value);
|
||||
vpid_snapshot->opal_crs = strdup(value);
|
||||
}
|
||||
else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) {
|
||||
vpid_snapshot->crs_snapshot_super.reference_name = strdup(value);
|
||||
vpid_snapshot->reference_name = strdup(value);
|
||||
}
|
||||
else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) {
|
||||
vpid_snapshot->crs_snapshot_super.local_location = strdup(value);
|
||||
vpid_snapshot->crs_snapshot_super.remote_location = strdup(value);
|
||||
vpid_snapshot->local_location = strdup(value);
|
||||
vpid_snapshot->remote_location = strdup(value);
|
||||
}
|
||||
} while(0 == feof(meta_data) );
|
||||
|
||||
/* Append the last item */
|
||||
if( NULL != vpid_snapshot) {
|
||||
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
|
||||
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
|
||||
}
|
||||
|
||||
cleanup:
|
||||
@ -960,34 +980,40 @@ static int metadata_extract_next_token(FILE *file, char **token, char **value)
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
char * orte_snapc_ckpt_state_str(size_t state)
|
||||
int orte_snapc_ckpt_state_str(char ** state_str, int state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_SNAPC_CKPT_STATE_NONE:
|
||||
return strdup(" -- ");
|
||||
*state_str = strdup(" -- ");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_REQUEST:
|
||||
return strdup("Requested");
|
||||
*state_str = strdup("Requested");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_PENDING_TERM:
|
||||
return strdup("Pending (Termination)");
|
||||
*state_str = strdup("Pending (Termination)");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_PENDING:
|
||||
return strdup("Pending");
|
||||
*state_str = strdup("Pending");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
||||
return strdup("Running");
|
||||
*state_str = strdup("Running");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
|
||||
return strdup("File Transfer");
|
||||
*state_str = strdup("File Transfer");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_FINISHED:
|
||||
return strdup("Finished");
|
||||
*state_str = strdup("Finished");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
|
||||
*state_str = strdup("Locally Finished");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
||||
return strdup("Error");
|
||||
*state_str = strdup("Error");
|
||||
break;
|
||||
default:
|
||||
return strdup("Unknown");
|
||||
asprintf(state_str, "Unknown %d", state);
|
||||
break;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2008 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -74,6 +74,7 @@ char * orte_snapc_base_global_snapshot_ref = NULL;
|
||||
bool orte_snapc_base_store_in_place = true;
|
||||
bool orte_snapc_base_store_only_one_seq = false;
|
||||
bool orte_snapc_base_establish_global_snapshot_dir = false;
|
||||
bool orte_snapc_base_is_global_dir_shared = false;
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components,
|
||||
@ -97,9 +98,20 @@ int orte_snapc_base_open(void)
|
||||
opal_home_directory(),
|
||||
&orte_snapc_base_global_snapshot_dir);
|
||||
|
||||
mca_base_param_reg_int_name("snapc",
|
||||
"base_global_shared",
|
||||
"If the global_snapshot_dir is on a shared file system all nodes can access, "
|
||||
"then the checkpoint files can be copied more efficiently when FileM is used."
|
||||
" [Default = disabled]",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
orte_snapc_base_is_global_dir_shared = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
|
||||
"snapc:base: open: base_global_snapshot_dir = %s",
|
||||
orte_snapc_base_global_snapshot_dir));
|
||||
"snapc:base: open: base_global_snapshot_dir = %s (%s)",
|
||||
orte_snapc_base_global_snapshot_dir,
|
||||
(orte_snapc_base_is_global_dir_shared ? "Shared" : "Local") ));
|
||||
|
||||
/*
|
||||
* Store the checkpoint files in their final location.
|
||||
@ -173,8 +185,8 @@ int orte_snapc_base_open(void)
|
||||
if( NULL == orte_snapc_base_global_snapshot_loc ) {
|
||||
char *t1 = NULL;
|
||||
char *t2 = NULL;
|
||||
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
||||
t2 = orte_snapc_base_get_global_snapshot_directory( t1 );
|
||||
orte_snapc_base_unique_global_snapshot_name(&t1, getpid() );
|
||||
orte_snapc_base_get_global_snapshot_directory(&t2, t1 );
|
||||
orte_snapc_base_global_snapshot_loc = strdup(t2);
|
||||
free(t1);
|
||||
free(t2);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -30,6 +30,7 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -39,35 +40,54 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
typedef uint8_t orte_snapc_full_cmd_flag_t;
|
||||
#define ORTE_SNAPC_FULL_CMD OPAL_UINT8
|
||||
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
|
||||
#define ORTE_SNAPC_FULL_UPDATE_PROC_STATE_CMD 2
|
||||
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 3
|
||||
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 4
|
||||
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
|
||||
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD 2
|
||||
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_CMD 3
|
||||
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_QUICK_CMD 4
|
||||
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 5
|
||||
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 6
|
||||
#define ORTE_SNAPC_FULL_MAX 7
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_snapc_full_component_t {
|
||||
orte_snapc_base_component_t super; /** Base SNAPC component */
|
||||
|
||||
};
|
||||
typedef struct orte_snapc_full_component_t orte_snapc_full_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component;
|
||||
|
||||
struct orte_snapc_full_global_snapshot_t {
|
||||
/*
|
||||
* Global Coordinator per orted metadata
|
||||
*/
|
||||
struct orte_snapc_full_orted_snapshot_t {
|
||||
/** Base SNAPC Global snapshot type */
|
||||
orte_snapc_base_snapshot_t super;
|
||||
orte_snapc_base_global_snapshot_t super;
|
||||
|
||||
/** Local coordinator associated with this vpid */
|
||||
orte_process_name_t local_coord;
|
||||
/** ORTE Process name */
|
||||
orte_process_name_t process_name;
|
||||
|
||||
/** State of the checkpoint */
|
||||
int state;
|
||||
|
||||
/** OPAL CRS Component */
|
||||
char * opal_crs;
|
||||
|
||||
/** Term flag */
|
||||
bool term;
|
||||
|
||||
/** FileM request */
|
||||
orte_filem_base_request_t *filem_request;
|
||||
};
|
||||
typedef struct orte_snapc_full_global_snapshot_t orte_snapc_full_global_snapshot_t;
|
||||
typedef struct orte_snapc_full_orted_snapshot_t orte_snapc_full_orted_snapshot_t;
|
||||
OBJ_CLASS_DECLARATION(orte_snapc_full_orted_snapshot_t);
|
||||
|
||||
OBJ_CLASS_DECLARATION(orte_snapc_full_global_snapshot_t);
|
||||
|
||||
struct orte_snapc_full_local_snapshot_t {
|
||||
/*
|
||||
* Local Coordinator per app metadata
|
||||
*/
|
||||
struct orte_snapc_full_app_snapshot_t {
|
||||
/** Base SNAPC Local snapshot type */
|
||||
orte_snapc_base_snapshot_t super;
|
||||
orte_snapc_base_local_snapshot_t super;
|
||||
|
||||
/** Named Pipe Read and Write */
|
||||
char * comm_pipe_r;
|
||||
@ -79,14 +99,18 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
||||
struct opal_event comm_pipe_r_eh;
|
||||
bool is_eh_active;
|
||||
|
||||
/** State of the process wrt checkpointing */
|
||||
int ckpt_state;
|
||||
};
|
||||
typedef struct orte_snapc_full_local_snapshot_t orte_snapc_full_local_snapshot_t;
|
||||
/** Process pid */
|
||||
pid_t process_pid;
|
||||
|
||||
OBJ_CLASS_DECLARATION(orte_snapc_full_local_snapshot_t);
|
||||
/** Term */
|
||||
bool term;
|
||||
};
|
||||
typedef struct orte_snapc_full_app_snapshot_t orte_snapc_full_app_snapshot_t;
|
||||
OBJ_CLASS_DECLARATION(orte_snapc_full_app_snapshot_t);
|
||||
|
||||
extern bool orte_snapc_full_skip_filem;
|
||||
extern bool orte_snapc_full_skip_app;
|
||||
extern bool orte_snapc_full_timing_enabled;
|
||||
|
||||
int orte_snapc_full_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
@ -108,12 +132,11 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
||||
int global_coord_finalize(void);
|
||||
int global_coord_setup_job(orte_jobid_t jobid);
|
||||
int global_coord_release_job(orte_jobid_t jobid);
|
||||
int global_coord_vpid_assoc_update(orte_process_name_t local_coord,
|
||||
orte_process_name_t proc_name);
|
||||
int global_coord_vpid_state_update(orte_process_name_t proc_name,
|
||||
size_t proc_ckpt_state,
|
||||
char **proc_ckpt_ref,
|
||||
char **proc_ckpt_loc);
|
||||
int global_coord_orted_state_update(orte_process_name_t proc_name,
|
||||
int proc_ckpt_state,
|
||||
char **proc_ckpt_ref,
|
||||
char **proc_ckpt_loc,
|
||||
char **agent_ckpt);
|
||||
/*
|
||||
* Local Coordinator Functionality
|
||||
*/
|
||||
@ -122,7 +145,7 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
|
||||
int local_coord_setup_job(orte_jobid_t jobid);
|
||||
int local_coord_release_job(orte_jobid_t jobid);
|
||||
int local_coord_job_state_update(orte_jobid_t jobid,
|
||||
size_t job_ckpt_state,
|
||||
int job_ckpt_state,
|
||||
char **job_ckpt_ref,
|
||||
char **job_ckpt_loc);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -219,7 +219,12 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
opal_cr_currently_stalled = false;
|
||||
|
||||
app_pid = getpid();
|
||||
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
|
||||
if( orte_snapc_full_skip_app ) {
|
||||
ret = ORTE_SUCCESS;
|
||||
cr_state = OPAL_CRS_CONTINUE;
|
||||
} else {
|
||||
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
|
||||
}
|
||||
if( OPAL_EXISTS == ret ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
||||
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -35,6 +35,8 @@ static int snapc_full_open(void);
|
||||
static int snapc_full_close(void);
|
||||
|
||||
bool orte_snapc_full_skip_filem = false;
|
||||
bool orte_snapc_full_skip_app = false;
|
||||
bool orte_snapc_full_timing_enabled = false;
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
@ -113,6 +115,22 @@ static int snapc_full_open(void)
|
||||
&value);
|
||||
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
||||
"skip_app",
|
||||
"Not for general use! For debugging only! Shortcut app level coord. [Default = disabled]",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
orte_snapc_full_skip_app = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
||||
"enable_timing",
|
||||
"Enable timing information. [Default = disabled]",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
orte_snapc_full_timing_enabled = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
@ -50,24 +50,24 @@ static orte_snapc_base_module_t loc_module = {
|
||||
/*
|
||||
* Global Snapshot structure
|
||||
*/
|
||||
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *obj);
|
||||
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *obj);
|
||||
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *obj);
|
||||
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *obj);
|
||||
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_full_global_snapshot_t,
|
||||
orte_snapc_base_snapshot_t,
|
||||
orte_snapc_full_global_construct,
|
||||
orte_snapc_full_global_destruct);
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_full_orted_snapshot_t,
|
||||
orte_snapc_base_global_snapshot_t,
|
||||
orte_snapc_full_orted_construct,
|
||||
orte_snapc_full_orted_destruct);
|
||||
|
||||
/*
|
||||
* Local Snapshot structure
|
||||
*/
|
||||
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj);
|
||||
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj);
|
||||
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj);
|
||||
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj);
|
||||
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
|
||||
orte_snapc_base_snapshot_t,
|
||||
orte_snapc_full_local_construct,
|
||||
orte_snapc_full_local_destruct);
|
||||
OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
|
||||
orte_snapc_base_local_snapshot_t,
|
||||
orte_snapc_full_app_construct,
|
||||
orte_snapc_full_app_destruct);
|
||||
|
||||
/************************************
|
||||
* Locally Global vars & functions :)
|
||||
@ -77,29 +77,53 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
|
||||
/************************
|
||||
* Function Definitions
|
||||
************************/
|
||||
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *snapshot) {
|
||||
snapshot->local_coord.vpid = 0;
|
||||
snapshot->local_coord.jobid = 0;
|
||||
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
snapshot->opal_crs = NULL;
|
||||
|
||||
snapshot->term = false;
|
||||
|
||||
snapshot->filem_request = NULL;
|
||||
}
|
||||
|
||||
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *snapshot) {
|
||||
snapshot->local_coord.vpid = 0;
|
||||
snapshot->local_coord.jobid = 0;
|
||||
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
if( NULL != snapshot->opal_crs ) {
|
||||
free( snapshot->opal_crs );
|
||||
snapshot->opal_crs = NULL;
|
||||
}
|
||||
|
||||
snapshot->term = false;
|
||||
|
||||
if( NULL != snapshot->filem_request ) {
|
||||
OBJ_RELEASE(snapshot->filem_request);
|
||||
snapshot->filem_request = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj) {
|
||||
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj) {
|
||||
obj->comm_pipe_r = NULL;
|
||||
obj->comm_pipe_w = NULL;
|
||||
|
||||
obj->comm_pipe_r_fd = -1;
|
||||
obj->comm_pipe_w_fd = -1;
|
||||
|
||||
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
obj->is_eh_active = false;
|
||||
|
||||
obj->process_pid = 0;
|
||||
|
||||
obj->term = false;
|
||||
}
|
||||
|
||||
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
|
||||
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj) {
|
||||
if( NULL != obj->comm_pipe_r ) {
|
||||
free(obj->comm_pipe_r);
|
||||
obj->comm_pipe_r = NULL;
|
||||
@ -113,9 +137,11 @@ void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
|
||||
obj->comm_pipe_r_fd = -1;
|
||||
obj->comm_pipe_w_fd = -1;
|
||||
|
||||
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
obj->is_eh_active = false;
|
||||
|
||||
obj->process_pid = 0;
|
||||
|
||||
obj->term = false;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -89,45 +89,61 @@ BEGIN_C_DECLS
|
||||
/**
|
||||
* States that a process can be in while checkpointing
|
||||
*/
|
||||
/* Doing no checkpoint -- Quiet state */
|
||||
#define ORTE_SNAPC_CKPT_STATE_NONE 0
|
||||
/* There has been a request for a checkpoint from one of the applications */
|
||||
#define ORTE_SNAPC_CKPT_STATE_REQUEST 1
|
||||
/* There is a Pending checkpoint for this process */
|
||||
#define ORTE_SNAPC_CKPT_STATE_PENDING 2
|
||||
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3
|
||||
/* Running the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
|
||||
/* Finished the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
|
||||
/* Finished the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
|
||||
/* Unable to checkpoint this job */
|
||||
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
|
||||
/* Reached an error */
|
||||
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
|
||||
#define ORTE_SNAPC_CKPT_STATE_ERROR 0
|
||||
|
||||
/* Doing no checkpoint -- Quiet state */
|
||||
#define ORTE_SNAPC_CKPT_STATE_NONE 1
|
||||
/* There has been a request for a checkpoint from one of the applications */
|
||||
#define ORTE_SNAPC_CKPT_STATE_REQUEST 2
|
||||
/* There is a Pending checkpoint for this process */
|
||||
#define ORTE_SNAPC_CKPT_STATE_PENDING 3
|
||||
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 4
|
||||
/* Running the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_RUNNING 5
|
||||
/* Finished the checkpoint locally */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL 6
|
||||
/* File Transfer in progress */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 8
|
||||
/* Finished the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED 9
|
||||
/* Unable to checkpoint this job */
|
||||
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 10
|
||||
#define ORTE_SNAPC_CKPT_MAX 11
|
||||
|
||||
/**
|
||||
* Definition of a orte local snapshot.
|
||||
* Similar to the opal_crs_base_snapshot_t except that it
|
||||
* contains process contact information.
|
||||
*/
|
||||
struct orte_snapc_base_snapshot_1_0_0_t {
|
||||
opal_crs_base_snapshot_t crs_snapshot_super;
|
||||
struct orte_snapc_base_local_snapshot_1_0_0_t {
|
||||
/** List super object */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** ORTE Process name */
|
||||
orte_process_name_t process_name;
|
||||
/** PID of the application process that generated this snapshot */
|
||||
pid_t process_pid;
|
||||
/** State of the checkpoint */
|
||||
size_t state;
|
||||
/** Terminate this process after a checkpoint */
|
||||
bool term;
|
||||
};
|
||||
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_1_0_0_t;
|
||||
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_snapshot_t);
|
||||
/** State of the checkpoint */
|
||||
int state;
|
||||
|
||||
/** Unique name of the local snapshot */
|
||||
char * reference_name;
|
||||
|
||||
/** Local location of the local snapshot Absolute path */
|
||||
char * local_location;
|
||||
|
||||
/** Remote location of the local snapshot Absolute path */
|
||||
char * remote_location;
|
||||
|
||||
/** CRS agent */
|
||||
char * opal_crs;
|
||||
|
||||
};
|
||||
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
|
||||
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
|
||||
|
||||
/**
|
||||
* Definition of the global snapshot.
|
||||
@ -138,12 +154,9 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
|
||||
/** This is an object, so must have super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** A list of orte_snapc_base_snapshot_ts */
|
||||
opal_list_t snapshots;
|
||||
/** A list of orte_snapc_base_local_snapshot_t's */
|
||||
opal_list_t local_snapshots;
|
||||
|
||||
/* ORTE SnapC Component used to generate the global snapshot */
|
||||
char * component_name;
|
||||
|
||||
/** Unique name of the global snapshot */
|
||||
char * reference_name;
|
||||
|
||||
@ -152,12 +165,6 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
|
||||
|
||||
/** Sequence Number */
|
||||
int seq_num;
|
||||
|
||||
/** Beginning timestamp */
|
||||
char * start_time;
|
||||
|
||||
/** Ending timestamp */
|
||||
char * end_time;
|
||||
};
|
||||
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
|
||||
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -76,6 +76,8 @@
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
/******************
|
||||
* Local Functions
|
||||
******************/
|
||||
@ -108,11 +110,16 @@ static int global_sequence_num = 0;
|
||||
*****************************************/
|
||||
static bool listener_started = false;
|
||||
|
||||
static double timer_start = 0;
|
||||
static double timer_last = 0;
|
||||
static double get_time(void);
|
||||
|
||||
typedef struct {
|
||||
bool help;
|
||||
int pid;
|
||||
bool term;
|
||||
bool verbose;
|
||||
int verbose_level;
|
||||
orte_jobid_t req_hnp; /**< User Requested HNP */
|
||||
bool nowait; /* Do not wait for checkpoint to complete before returning */
|
||||
bool status; /* Display status messages while checkpoint is progressing */
|
||||
@ -135,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Verbose" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'V', NULL, NULL,
|
||||
1,
|
||||
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Set the verbosity level (For additional debugging information)" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'\0', NULL, "term",
|
||||
0,
|
||||
@ -279,6 +292,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
orte_checkpoint_globals.pid = -1;
|
||||
orte_checkpoint_globals.term = false;
|
||||
orte_checkpoint_globals.verbose = false;
|
||||
orte_checkpoint_globals.verbose_level = 0;
|
||||
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
|
||||
orte_checkpoint_globals.nowait = false;
|
||||
orte_checkpoint_globals.status = false;
|
||||
@ -344,6 +358,14 @@ static int parse_args(int argc, char *argv[]) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(orte_checkpoint_globals.verbose_level < 0 ) {
|
||||
orte_checkpoint_globals.verbose_level = 0;
|
||||
}
|
||||
|
||||
if(orte_checkpoint_globals.verbose_level > 0) {
|
||||
orte_checkpoint_globals.verbose = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the user did not supply an hnp jobid, then they must
|
||||
* supply the PID of MPIRUN
|
||||
@ -474,7 +496,7 @@ static int ckpt_init(int argc, char *argv[]) {
|
||||
*/
|
||||
if( orte_checkpoint_globals.verbose ) {
|
||||
orte_checkpoint_globals.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
|
||||
opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level);
|
||||
} else {
|
||||
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
||||
}
|
||||
@ -661,6 +683,8 @@ notify_process_for_checkpoint(int term)
|
||||
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
|
||||
orte_checkpoint_globals.pid);
|
||||
|
||||
timer_start = get_time();
|
||||
|
||||
/***********************************
|
||||
* Notify HNP of checkpoint request
|
||||
* Send:
|
||||
@ -709,18 +733,51 @@ notify_process_for_checkpoint(int term)
|
||||
/***************
|
||||
* Pretty Print
|
||||
***************/
|
||||
static double get_time(void) {
|
||||
double wtime;
|
||||
|
||||
#if OPAL_TIMER_USEC_NATIVE
|
||||
wtime = (double)opal_timer_base_get_usec() / 1000000.0;
|
||||
#else
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
wtime = tv.tv_sec;
|
||||
wtime += (double)tv.tv_usec / 1000000.0;
|
||||
#endif
|
||||
|
||||
return wtime;
|
||||
}
|
||||
|
||||
static int pretty_print_status(void) {
|
||||
char * state_str = NULL;
|
||||
double cur_time;
|
||||
|
||||
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status);
|
||||
cur_time = get_time();
|
||||
|
||||
if( timer_last == 0 ) {
|
||||
timer_last = cur_time;
|
||||
}
|
||||
|
||||
orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);
|
||||
|
||||
if( NULL != global_snapshot_handle ) {
|
||||
opal_output(0,
|
||||
"[%6.2f / %6.2f] %*s - %s\n",
|
||||
(cur_time - timer_last), (cur_time - timer_start),
|
||||
25, state_str, global_snapshot_handle);
|
||||
} else {
|
||||
opal_output(0,
|
||||
"[%6.2f / %6.2f] %*s - ...\n",
|
||||
(cur_time - timer_last), (cur_time - timer_start),
|
||||
25, state_str);
|
||||
}
|
||||
|
||||
opal_output(0,
|
||||
"%*s - Global Snapshot Reference: %s\n",
|
||||
25, state_str, global_snapshot_handle);
|
||||
if( NULL != state_str) {
|
||||
free(state_str);
|
||||
}
|
||||
|
||||
|
||||
timer_last = cur_time;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -491,6 +491,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
||||
orte_std_cntr_t i;
|
||||
char *jobstr;
|
||||
orte_jobid_t mask=0x0000ffff;
|
||||
char * state_str = NULL;
|
||||
|
||||
for(i=0; i < num_jobs; i++) {
|
||||
job = jobs[i];
|
||||
@ -513,9 +514,10 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
||||
len_slots = 6;
|
||||
len_vpid_r = (int) strlen("Num Procs");
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
len_ckpt_s = (int) (strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) < strlen("Ckpt State") ?
|
||||
orte_snapc_ckpt_state_str(&state_str, job->ckpt_state);
|
||||
len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ?
|
||||
strlen("Ckpt State") :
|
||||
strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) );
|
||||
strlen(state_str) );
|
||||
len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
|
||||
(strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
|
||||
strlen("Ckpt Ref") :
|
||||
@ -525,6 +527,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
||||
strlen("Ckpt Loc") :
|
||||
strlen(job->ckpt_snapshot_loc) ) );
|
||||
#else
|
||||
state_str = NULL;
|
||||
len_ckpt_s = -3;
|
||||
len_ckpt_r = -3;
|
||||
len_ckpt_l = -3;
|
||||
@ -564,7 +567,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
|
||||
printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
|
||||
printf("%*d | ", len_vpid_r, job->num_procs);
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(job->ckpt_state));
|
||||
printf("%*s | ", len_ckpt_s, state_str);
|
||||
printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
|
||||
"" :
|
||||
job->ckpt_snapshot_ref) );
|
||||
@ -597,6 +600,7 @@ static int pretty_print_vpids(orte_job_t *job) {
|
||||
orte_proc_t *vpid;
|
||||
orte_app_context_t *app;
|
||||
char *o_proc_name;
|
||||
char *state_str = NULL;
|
||||
|
||||
/*
|
||||
* Calculate segment lengths
|
||||
@ -663,8 +667,9 @@ static int pretty_print_vpids(orte_job_t *job) {
|
||||
len_state = strlen(pretty_vpid_state(vpid->state));
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
if( (int)strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)) > len_ckpt_s)
|
||||
len_ckpt_s = strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state));
|
||||
orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
|
||||
if( (int)strlen(state_str) > len_ckpt_s)
|
||||
len_ckpt_s = strlen(state_str);
|
||||
|
||||
if( NULL != vpid->ckpt_snapshot_ref &&
|
||||
(int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
|
||||
@ -673,6 +678,8 @@ static int pretty_print_vpids(orte_job_t *job) {
|
||||
if( NULL != vpid->ckpt_snapshot_loc &&
|
||||
(int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
|
||||
len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
|
||||
#else
|
||||
state_str = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -739,7 +746,7 @@ static int pretty_print_vpids(orte_job_t *job) {
|
||||
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(vpid->ckpt_state));
|
||||
printf("%*s | ", len_ckpt_s, state_str);
|
||||
printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
|
||||
"" :
|
||||
vpid->ckpt_snapshot_ref));
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -151,9 +151,10 @@ int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
pid_t child_pid;
|
||||
pid_t child_pid = 0;
|
||||
orte_snapc_base_global_snapshot_t *snapshot = NULL;
|
||||
|
||||
char *tmp_str = NULL;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
***************/
|
||||
@ -164,7 +165,10 @@ main(int argc, char *argv[])
|
||||
|
||||
snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t);
|
||||
snapshot->reference_name = strdup(orte_restart_globals.filename);
|
||||
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
|
||||
orte_snapc_base_get_global_snapshot_directory(&tmp_str, snapshot->reference_name);
|
||||
snapshot->local_location = opal_dirname(tmp_str);
|
||||
free(tmp_str);
|
||||
tmp_str = NULL;
|
||||
|
||||
/*
|
||||
* Check for existence of the file
|
||||
@ -453,11 +457,11 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
for(item = opal_list_get_first(&snapshot->snapshots);
|
||||
item != opal_list_get_end(&snapshot->snapshots);
|
||||
for(item = opal_list_get_first(&snapshot->local_snapshots);
|
||||
item != opal_list_get_end(&snapshot->local_snapshots);
|
||||
item = opal_list_get_next(item) ) {
|
||||
orte_snapc_base_snapshot_t *vpid_snapshot;
|
||||
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
|
||||
orte_snapc_base_local_snapshot_t *vpid_snapshot;
|
||||
vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
|
||||
|
||||
fprintf(appfile, "#\n");
|
||||
fprintf(appfile, "# Old Process Name: %u.%u\n",
|
||||
@ -467,13 +471,15 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
||||
fprintf(appfile, "-np 1 ");
|
||||
if(orte_restart_globals.preload) {
|
||||
fprintf(appfile, "--preload-files %s/%s ",
|
||||
vpid_snapshot->crs_snapshot_super.local_location,
|
||||
vpid_snapshot->crs_snapshot_super.reference_name);
|
||||
vpid_snapshot->local_location,
|
||||
vpid_snapshot->reference_name);
|
||||
fprintf(appfile, "--preload-files-dest-dir . ");
|
||||
}
|
||||
/* JJH: Make this match what the user originally specified on the command line */
|
||||
fprintf(appfile, "-am ft-enable-cr ");
|
||||
|
||||
fprintf(appfile, " opal-restart ");
|
||||
|
||||
/* JJH: Make sure this changes if ever the default location of the local file is changed,
|
||||
* currently it is safe to assume that it is in the current working directory.
|
||||
*
|
||||
@ -486,9 +492,9 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
|
||||
else {
|
||||
/* If we are *not* preloading the files, then point to the original checkpoint
|
||||
* directory to access the checkpoint files. */
|
||||
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->crs_snapshot_super.local_location);
|
||||
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->local_location);
|
||||
}
|
||||
fprintf(appfile, "%s\n", vpid_snapshot->crs_snapshot_super.reference_name);
|
||||
fprintf(appfile, "%s\n", vpid_snapshot->reference_name);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user