1
1

A bunch of improvements focused on Snapshot Coordination (SnapC) and File Management (FileM).

 * Improved timing in SnapC Full Global Coordinator
 * Improved scalability of the SnapC Full protocol
 * Minor improvements to the error reporting mechanisms in SnapC and FileM
 * Improved the memory usage of the metadata routines - now the owner of the data is more explicit.
 * Added a FileM hint to indicate when files stored locally can be moved to/from a globally mounted file system using just the 'cp' command instead of the 'rcp/scp' command. This gives a modest performance improvement. It can be enabled using the following SnapC MCA parameter: {{{snapc_base_global_shared=1}}}
 * Implement the ability to throttle the number of outgoing connections in FileM. At larger scales this type of explicit throttling helps prevent overwhelming the HNP machine. Default: 10, set via MCA parameter: {{{filem_rsh_max_outgoing}}}
 * Add a few diagnostic/debugging features to SnapC and FileM.

This commit was SVN r21131.
Этот коммит содержится в:
Josh Hursey 2009-04-30 16:55:39 +00:00
родитель 38aca518bd
Коммит 0deb009225
18 изменённых файлов: 2549 добавлений и 1976 удалений

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -71,7 +71,11 @@ ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
/**
 * Constructor for a FileM file set.
 *
 * Puts every field into its "unset" state: both target paths are NULL,
 * both target hints are ORTE_FILEM_HINT_NONE, and the target type is
 * ORTE_FILEM_TYPE_UNKNOWN.
 *
 * @param req  File set object being constructed.
 */
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req)
{
    /* No file paths are attached yet */
    req->local_target = NULL;
    req->remote_target = NULL;

    /* No optimization hints on either side */
    req->local_hint = req->remote_hint = ORTE_FILEM_HINT_NONE;

    /* Type of the file to move is not known yet */
    req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
}
@ -81,11 +85,13 @@ ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t
free(req->local_target);
req->local_target = NULL;
}
req->local_hint = ORTE_FILEM_HINT_NONE;
if( NULL != req->remote_target ) {
free(req->remote_target);
req->remote_target = NULL;
}
req->remote_hint = ORTE_FILEM_HINT_NONE;
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -77,6 +77,7 @@ int orte_filem_base_open(void)
NULL, &str_value);
if( NULL != str_value ) {
free(str_value);
str_value = NULL;
}
/* Open up all available components */

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -54,6 +54,13 @@ extern "C" {
#define ORTE_FILEM_MOVE_TYPE_RM 2
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
/**
* Hints that describe the local or remote file target for
* optimization purposes.
*/
#define ORTE_FILEM_HINT_NONE 0
#define ORTE_FILEM_HINT_SHARED 1
/**
* Define a Process Set
*
@ -92,9 +99,15 @@ struct orte_filem_base_file_set_1_0_0_t {
/* Local file reference */
char * local_target;
/* Local file reference hints */
int local_hint;
/* Remote file reference */
char * remote_target;
/* Remote file reference hints */
int remote_hint;
/* Type of file to move */
int target_flag;
};

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -37,7 +37,7 @@ static int filem_rsh_open(void);
static int filem_rsh_close(void);
int orte_filem_rsh_max_incomming = 10;
int orte_filem_rsh_max_outgoing = 10;
int orte_filem_rsh_max_outgoing = 10;
/*
* Instantiate the public struct with all of our public information

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -42,6 +42,7 @@
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
@ -632,41 +633,51 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
}
/* Do not check a local get() operation, to help suppress the warnings from the HNP */
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
char *base = NULL;
asprintf(&base, "%s/%s", f_set->local_target, opal_basename(f_set->remote_target));
/*
* The file should not exist if we are getting a file with the
* same name since we do not want to overwrite the filename
* without the user's consent.
*/
if( 0 == access(f_set->local_target, R_OK) ) {
if( 0 == access(base, R_OK) ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n",
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination (%s)\n",
ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink),
f_set->remote_target,
f_set->local_target));
f_set->local_target, base));
orte_show_help("help-orte-filem-rsh.txt",
"orte-filem-rsh:get-file-exists",
true, f_set->local_target, orte_process_info.nodename);
free(base);
base = NULL;
request->is_done[cur_index] = true;
request->is_active[cur_index] = true;
request->exit_status[cur_index] = -1;
goto continue_set;
}
free(base);
base = NULL;
}
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink),
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->local_target,
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->remote_target));
} else {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n",
"filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink),
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->remote_target,
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->local_target));
}
@ -736,12 +747,20 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
* If this is the put() routine
*/
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
asprintf(&command, "%s %s %s %s:%s ",
mca_filem_rsh_component.cp_command,
dir_arg,
f_set->local_target,
remote_machine,
remote_file);
/* Use a local 'cp' when able */
if(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ) {
asprintf(&command, "cp %s %s %s ",
dir_arg,
f_set->local_target,
remote_file);
} else {
asprintf(&command, "%s %s %s %s:%s ",
mca_filem_rsh_component.cp_command,
dir_arg,
f_set->local_target,
remote_machine,
remote_file);
}
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
"filem:rsh:put about to execute [%s]", command));
@ -758,13 +777,23 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
* otherwise it is the get() routine
*/
else {
asprintf(&command, "%s %s %s:%s %s ",
mca_filem_rsh_component.cp_command,
dir_arg,
remote_machine,
remote_file,
f_set->local_target);
/* Use a local 'cp' when able */
if(f_set->local_hint == ORTE_FILEM_HINT_SHARED ) {
asprintf(&command, "%s %s cp %s %s %s ",
mca_filem_rsh_component.remote_sh_command,
remote_machine,
dir_arg,
remote_file,
f_set->local_target);
} else {
asprintf(&command, "%s %s %s:%s %s ",
mca_filem_rsh_component.cp_command,
dir_arg,
remote_machine,
remote_file,
f_set->local_target);
}
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
"filem:rsh:get about to execute [%s]", command));

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -75,8 +75,8 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
/**
* Global Snapshot Object Maintenance functions
*/
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *obj);
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *obj);
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *obj);
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *obj);
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj);
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj);
@ -132,24 +132,26 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
ORTE_DECLSPEC extern bool orte_snapc_base_is_global_dir_shared;
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
/**
* Some utility functions
*/
ORTE_DECLSPEC char * orte_snapc_ckpt_state_str(size_t state);
ORTE_DECLSPEC int orte_snapc_ckpt_state_str(char ** state_str, int state);
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
ORTE_DECLSPEC int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid);
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name);
ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_global_snapshot_name);
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
bool empty_metadata);
ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc,
char * global_snapshot_ref,
char *snapshot_ref,
char *snapshot_location);
char *snapshot_location,
char *crs_agent);
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -41,6 +41,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
@ -68,27 +69,51 @@ size_t orte_snapc_base_snapshot_seq_number = 0;
/******************
* Object stuff
******************/
OBJ_CLASS_INSTANCE(orte_snapc_base_snapshot_t,
opal_crs_base_snapshot_t,
orte_snapc_base_snapshot_construct,
orte_snapc_base_snapshot_destruct);
OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
opal_list_item_t,
orte_snapc_base_local_snapshot_construct,
orte_snapc_base_local_snapshot_destruct);
/*
 * Constructor for a per-process ("local") snapshot object.
 *
 * NOTE: this span is a rendered diff; the first two lines are the two
 * signatures of the same constructor (the type/function was renamed from
 * orte_snapc_base_snapshot_* to orte_snapc_base_local_snapshot_*).
 *
 * The body zeroes the process name and pid, sets the checkpoint state to
 * ORTE_SNAPC_CKPT_STATE_NONE, clears the term flag, and sets every string
 * field to NULL so the destructor can tell which ones were ever assigned.
 */
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *snapshot)
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_pid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
snapshot->term = false;
/* String fields start out unset */
snapshot->reference_name = NULL;
snapshot->local_location = NULL;
snapshot->remote_location = NULL;
snapshot->opal_crs = NULL;
}
/*
 * Destructor for a per-process ("local") snapshot object.
 *
 * NOTE: this span is a rendered diff; the first two lines are the two
 * signatures of the same destructor (before and after the rename to
 * orte_snapc_base_local_snapshot_*).
 *
 * Resets the scalar fields to the same values the constructor uses, then
 * free()s each string field that is non-NULL and resets it to NULL. The
 * object therefore owns reference_name, local_location, remote_location
 * and opal_crs.
 */
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *snapshot)
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_pid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
snapshot->term = false;
/* Release the owned strings */
if( NULL != snapshot->reference_name ) {
free(snapshot->reference_name);
snapshot->reference_name = NULL;
}
if( NULL != snapshot->local_location ) {
free(snapshot->local_location);
snapshot->local_location = NULL;
}
if( NULL != snapshot->remote_location ) {
free(snapshot->remote_location);
snapshot->remote_location = NULL;
}
if( NULL != snapshot->opal_crs ) {
free(snapshot->opal_crs);
snapshot->opal_crs = NULL;
}
}
/****/
@ -99,51 +124,38 @@ OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
/*
 * Constructor for a global snapshot object.
 *
 * NOTE: this span is a rendered diff, so lines from the version that used
 * the `snapshots` list and the char*-returning helpers are interleaved
 * with lines from the version that uses `local_snapshots` and the
 * out-parameter helpers.
 *
 * In the out-parameter form the constructor:
 *  - constructs the embedded list of local snapshots,
 *  - fills reference_name via orte_snapc_base_unique_global_snapshot_name()
 *    using the current pid,
 *  - asks orte_snapc_base_get_global_snapshot_directory() for the snapshot
 *    directory into a temporary, stores opal_dirname() of it in
 *    local_location, and frees the temporary,
 *  - resets seq_num to 0.
 *
 * NOTE(review): the return codes of the two helper calls are ignored here.
 */
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
{
OBJ_CONSTRUCT(&(snapshot->snapshots), opal_list_t);
char *tmp_dir = NULL;
snapshot->component_name = NULL;
snapshot->reference_name = orte_snapc_base_unique_global_snapshot_name(getpid());
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
orte_snapc_base_unique_global_snapshot_name(&(snapshot->reference_name), getpid());
orte_snapc_base_get_global_snapshot_directory(&tmp_dir, snapshot->reference_name);
/* Keep only the parent of the snapshot directory; tmp_dir is a scratch copy */
snapshot->local_location = opal_dirname(tmp_dir);
free(tmp_dir);
snapshot->seq_num = 0;
snapshot->start_time = NULL;
snapshot->end_time = NULL;
}
/*
 * Destructor for a global snapshot object.
 *
 * NOTE: this span is a rendered diff, so the `snapshots` and
 * `local_snapshots` variants of the list-handling lines appear back to
 * back, and some of the string-field cleanup below may belong to only one
 * of the two versions.
 *
 * Drains the list of per-process snapshots, releasing each item, then
 * destructs the list itself. Every non-NULL string field is free()d and
 * reset to NULL, and seq_num is reset to 0.
 */
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
{
opal_list_item_t* item = NULL;
/* Release every per-process snapshot still held by the list */
while (NULL != (item = opal_list_remove_first(&snapshot->snapshots))) {
while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&(snapshot->snapshots));
OBJ_DESTRUCT(&(snapshot->local_snapshots));
/* Release the owned strings */
if(NULL != snapshot->reference_name) {
free(snapshot->reference_name);
snapshot->reference_name = NULL;
}
if(NULL != snapshot->component_name) {
free(snapshot->component_name);
snapshot->component_name = NULL;
}
if(NULL != snapshot->local_location) {
free(snapshot->local_location);
snapshot->local_location = NULL;
}
if(NULL != snapshot->start_time) {
free(snapshot->start_time);
snapshot->start_time = NULL;
}
if(NULL != snapshot->end_time) {
free(snapshot->end_time);
snapshot->end_time = NULL;
}
snapshot->seq_num = 0;
}
@ -198,6 +210,7 @@ int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
ORTE_RML_PERSISTENT,
snapc_none_global_cmdline_request,
NULL))) {
ORTE_ERROR_LOG(rc);
exit_status = rc;
goto cleanup;
}
@ -238,6 +251,7 @@ static void snapc_none_global_cmdline_request(int status,
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -251,6 +265,7 @@ static void snapc_none_global_cmdline_request(int status,
* Do the basic handshake with the orte_checkpoint command
*/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -259,6 +274,7 @@ static void snapc_none_global_cmdline_request(int status,
* Respond with an invalid response
*/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -312,6 +328,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
"%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -322,6 +339,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -373,6 +391,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -382,6 +401,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -393,6 +413,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -401,6 +422,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -411,6 +433,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -433,42 +456,36 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
/*****************************
* Snapshot metadata functions
*****************************/
/*
 * Produce the name of the global snapshot.
 *
 * NOTE: this span is a rendered diff; each changed line appears twice,
 * first in its char*-returning form and then in its int-returning,
 * out-parameter form.
 *
 * If orte_snapc_base_global_snapshot_ref is NULL the name is formatted as
 * "ompi_global_snapshot_<pid>.ckpt"; otherwise it is a strdup() copy of
 * orte_snapc_base_global_snapshot_ref. Either way the string comes from
 * asprintf()/strdup(), so the caller is responsible for free()ing it.
 *
 * In the out-parameter form the string is stored through name_str and the
 * function returns ORTE_SUCCESS.
 *
 * NOTE(review): the asprintf()/strdup() results are not checked, so an
 * allocation failure is not reported to the caller.
 */
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid)
{
char * uniq_name;
if( NULL == orte_snapc_base_global_snapshot_ref ) {
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
asprintf(name_str, "ompi_global_snapshot_%d.ckpt", pid);
}
else {
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
*name_str = strdup(orte_snapc_base_global_snapshot_ref);
}
return uniq_name;
return ORTE_SUCCESS;
}
/*
 * Build the path of the metadata file for a global snapshot.
 *
 * NOTE: this span is a rendered diff; each changed line appears twice,
 * first in its char*-returning form and then in its int-returning,
 * out-parameter form.
 *
 * The path is formatted as
 *   <orte_snapc_base_global_snapshot_dir>/<uniq_snapshot_name>/<orte_snapc_base_metadata_filename>
 * into a buffer allocated by asprintf(); the caller must free() it.
 *
 * In the out-parameter form the path is stored through file_name and the
 * function returns ORTE_SUCCESS.
 *
 * NOTE(review): the asprintf() result is not checked.
 */
char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name)
int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name)
{
char * path = NULL;
asprintf(&path, "%s/%s/%s",
asprintf(file_name, "%s/%s/%s",
orte_snapc_base_global_snapshot_dir,
uniq_snapshot_name,
orte_snapc_base_metadata_filename);
return path;
return ORTE_SUCCESS;
}
/*
 * Build the directory path of a global snapshot for the current sequence
 * number.
 *
 * NOTE: this span is a rendered diff; each changed line appears twice,
 * first in its char*-returning form and then in its int-returning,
 * out-parameter form.
 *
 * The path is formatted as
 *   <orte_snapc_base_global_snapshot_dir>/<uniq_snapshot_name>/<seq>
 * where <seq> is orte_snapc_base_snapshot_seq_number cast to int. The
 * buffer is allocated by asprintf(); the caller must free() it.
 *
 * In the out-parameter form the path is stored through dir_name and the
 * function returns ORTE_SUCCESS.
 *
 * NOTE(review): the asprintf() result is not checked.
 */
char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_snapshot_name)
{
char * dir_name = NULL;
asprintf(&dir_name, "%s/%s/%d",
asprintf(dir_name, "%s/%s/%d",
orte_snapc_base_global_snapshot_dir,
uniq_snapshot_name,
(int)orte_snapc_base_snapshot_seq_number);
return dir_name;
return ORTE_SUCCESS;
}
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
@ -482,8 +499,9 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
/*
* Make the snapshot directory from the uniq_global_snapshot_name
*/
dir_name = orte_snapc_base_get_global_snapshot_directory(uniq_global_snapshot_name);
orte_snapc_base_get_global_snapshot_directory(&dir_name, uniq_global_snapshot_name);
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -491,13 +509,14 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
/*
* Initialize the metadata file at the top of that directory.
*/
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(uniq_global_snapshot_name);
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, uniq_global_snapshot_name);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output,
"%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -529,7 +548,7 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
if(NULL != meta_data_fname)
free(meta_data_fname);
return OPAL_SUCCESS;
return ORTE_SUCCESS;
}
/*
@ -575,13 +594,14 @@ int orte_snapc_base_add_timestamp(char * global_snapshot_ref)
char * meta_data_fname = NULL;
time_t timestamp;
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output,
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -607,13 +627,14 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
/* Add the final timestamp */
orte_snapc_base_add_timestamp(global_snapshot_ref);
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output,
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -633,23 +654,28 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
char * global_snapshot_ref,
char *snapshot_ref,
char *snapshot_location)
char *snapshot_location,
char *crs_agent)
{
int ret, exit_status = ORTE_SUCCESS;
FILE * meta_data = NULL;
char * meta_data_fname = NULL;
char * crs_comp = NULL;
char * local_dir = NULL;
char * proc_name = NULL;
int prev_pid = 0;
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
if( NULL == snapshot_location ) {
return ORTE_ERROR;
}
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output,
"%s) base:add_metadata: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -663,20 +689,21 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
orte_util_convert_process_name_to_string(&proc_name, proc);
/* Extract the checkpointer */
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
exit_status = ORTE_ERROR;
goto cleanup;
if( NULL == crs_agent ) {
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
exit_status = ret;
ORTE_ERROR_LOG(ret);
goto cleanup;
}
} else {
crs_comp = strdup(crs_agent);
}
/* get the base of the location */
local_dir = strdup(snapshot_location);
local_dir = opal_dirname(local_dir);
/* Write the string */
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, local_dir);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
cleanup:
if( NULL != meta_data )
@ -684,9 +711,6 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
if( NULL != meta_data_fname)
free(meta_data_fname);
if( NULL != local_dir)
free(local_dir);
return exit_status;
}
@ -698,13 +722,14 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
int next_seq_int;
char * token = NULL;
char * value = NULL;
orte_snapc_base_snapshot_t *vpid_snapshot = NULL;
orte_snapc_base_local_snapshot_t *vpid_snapshot = NULL;
/*
* Open the metadata file
*/
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot->reference_name);
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot->reference_name);
if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) {
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -742,12 +767,7 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
break;
}
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
if( NULL == global_snapshot->start_time) {
global_snapshot->start_time = strdup(value);
}
else {
global_snapshot->end_time = strdup(value);
}
;
}
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
orte_process_name_t proc;
@ -756,29 +776,29 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
/* Not the first process, so append it to the list */
if( NULL != vpid_snapshot) {
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
}
vpid_snapshot = OBJ_NEW(orte_snapc_base_snapshot_t);
vpid_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t);
vpid_snapshot->process_name.jobid = proc.jobid;
vpid_snapshot->process_name.vpid = proc.vpid;
}
else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) {
vpid_snapshot->crs_snapshot_super.component_name = strdup(value);
vpid_snapshot->opal_crs = strdup(value);
}
else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) {
vpid_snapshot->crs_snapshot_super.reference_name = strdup(value);
vpid_snapshot->reference_name = strdup(value);
}
else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) {
vpid_snapshot->crs_snapshot_super.local_location = strdup(value);
vpid_snapshot->crs_snapshot_super.remote_location = strdup(value);
vpid_snapshot->local_location = strdup(value);
vpid_snapshot->remote_location = strdup(value);
}
} while(0 == feof(meta_data) );
/* Append the last item */
if( NULL != vpid_snapshot) {
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super));
opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
}
cleanup:
@ -960,34 +980,40 @@ static int metadata_extract_next_token(FILE *file, char **token, char **value)
return exit_status;
}
/*
 * Convert a checkpoint state constant into a human-readable string.
 *
 * NOTE: this span is a rendered diff; each case shows the `return
 * strdup(...)` line of the char*-returning form followed by the
 * `*state_str = strdup(...); break;` lines of the int-returning form.
 *
 * In the out-parameter form the string is stored through state_str and the
 * function returns ORTE_SUCCESS. The string is always produced by
 * strdup()/asprintf(), so the caller must free() it.
 *
 * NOTE(review): the strdup()/asprintf() results are not checked.
 */
char * orte_snapc_ckpt_state_str(size_t state)
int orte_snapc_ckpt_state_str(char ** state_str, int state)
{
switch(state) {
case ORTE_SNAPC_CKPT_STATE_NONE:
return strdup(" -- ");
*state_str = strdup(" -- ");
break;
case ORTE_SNAPC_CKPT_STATE_REQUEST:
return strdup("Requested");
*state_str = strdup("Requested");
break;
case ORTE_SNAPC_CKPT_STATE_PENDING_TERM:
return strdup("Pending (Termination)");
*state_str = strdup("Pending (Termination)");
break;
case ORTE_SNAPC_CKPT_STATE_PENDING:
return strdup("Pending");
*state_str = strdup("Pending");
break;
case ORTE_SNAPC_CKPT_STATE_RUNNING:
return strdup("Running");
*state_str = strdup("Running");
break;
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
return strdup("File Transfer");
*state_str = strdup("File Transfer");
break;
case ORTE_SNAPC_CKPT_STATE_FINISHED:
return strdup("Finished");
*state_str = strdup("Finished");
break;
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
*state_str = strdup("Locally Finished");
break;
case ORTE_SNAPC_CKPT_STATE_ERROR:
return strdup("Error");
*state_str = strdup("Error");
break;
/* Unrecognized state: the out-parameter form includes the numeric value */
default:
return strdup("Unknown");
asprintf(state_str, "Unknown %d", state);
break;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2008 The Trustees of the University of Tennessee.
* All rights reserved.
@ -74,6 +74,7 @@ char * orte_snapc_base_global_snapshot_ref = NULL;
bool orte_snapc_base_store_in_place = true;
bool orte_snapc_base_store_only_one_seq = false;
bool orte_snapc_base_establish_global_snapshot_dir = false;
bool orte_snapc_base_is_global_dir_shared = false;
/**
* Function for finding and opening either all MCA components,
@ -97,9 +98,20 @@ int orte_snapc_base_open(void)
opal_home_directory(),
&orte_snapc_base_global_snapshot_dir);
mca_base_param_reg_int_name("snapc",
"base_global_shared",
"If the global_snapshot_dir is on a shared file system all nodes can access, "
"then the checkpoint files can be copied more efficiently when FileM is used."
" [Default = disabled]",
false, false,
0,
&value);
orte_snapc_base_is_global_dir_shared = OPAL_INT_TO_BOOL(value);
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
"snapc:base: open: base_global_snapshot_dir = %s",
orte_snapc_base_global_snapshot_dir));
"snapc:base: open: base_global_snapshot_dir = %s (%s)",
orte_snapc_base_global_snapshot_dir,
(orte_snapc_base_is_global_dir_shared ? "Shared" : "Local") ));
/*
* Store the checkpoint files in their final location.
@ -173,8 +185,8 @@ int orte_snapc_base_open(void)
if( NULL == orte_snapc_base_global_snapshot_loc ) {
char *t1 = NULL;
char *t2 = NULL;
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
t2 = orte_snapc_base_get_global_snapshot_directory( t1 );
orte_snapc_base_unique_global_snapshot_name(&t1, getpid() );
orte_snapc_base_get_global_snapshot_directory(&t2, t1 );
orte_snapc_base_global_snapshot_loc = strdup(t2);
free(t1);
free(t2);

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -30,6 +30,7 @@
#include "opal/mca/mca.h"
#include "opal/event/event.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/snapc/snapc.h"
BEGIN_C_DECLS
@ -39,35 +40,54 @@ BEGIN_C_DECLS
*/
typedef uint8_t orte_snapc_full_cmd_flag_t;
#define ORTE_SNAPC_FULL_CMD OPAL_UINT8
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
#define ORTE_SNAPC_FULL_UPDATE_PROC_STATE_CMD 2
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 3
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 4
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD 2
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_CMD 3
#define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_QUICK_CMD 4
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 5
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 6
#define ORTE_SNAPC_FULL_MAX 7
/*
* Local Component structures
*/
/*
 * Component descriptor for the SnapC "full" component. Its only member is
 * the base SnapC component structure.
 */
typedef struct orte_snapc_full_component_t {
    /** Base SNAPC component */
    orte_snapc_base_component_t super;
} orte_snapc_full_component_t;
OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component;
struct orte_snapc_full_global_snapshot_t {
/*
* Global Coordinator per orted metadata
*/
struct orte_snapc_full_orted_snapshot_t {
/** Base SNAPC Global snapshot type */
orte_snapc_base_snapshot_t super;
orte_snapc_base_global_snapshot_t super;
/** Local coordinator associated with this vpid */
orte_process_name_t local_coord;
/** ORTE Process name */
orte_process_name_t process_name;
/** State of the checkpoint */
int state;
/** OPAL CRS Component */
char * opal_crs;
/** Term flag */
bool term;
/** FileM request */
orte_filem_base_request_t *filem_request;
};
typedef struct orte_snapc_full_global_snapshot_t orte_snapc_full_global_snapshot_t;
typedef struct orte_snapc_full_orted_snapshot_t orte_snapc_full_orted_snapshot_t;
OBJ_CLASS_DECLARATION(orte_snapc_full_orted_snapshot_t);
OBJ_CLASS_DECLARATION(orte_snapc_full_global_snapshot_t);
struct orte_snapc_full_local_snapshot_t {
/*
* Local Coordinator per app metadata
*/
struct orte_snapc_full_app_snapshot_t {
/** Base SNAPC Local snapshot type */
orte_snapc_base_snapshot_t super;
orte_snapc_base_local_snapshot_t super;
/** Named Pipe Read and Write */
char * comm_pipe_r;
@ -79,14 +99,18 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
struct opal_event comm_pipe_r_eh;
bool is_eh_active;
/** State of the process wrt checkpointing */
int ckpt_state;
};
typedef struct orte_snapc_full_local_snapshot_t orte_snapc_full_local_snapshot_t;
/** Process pid */
pid_t process_pid;
OBJ_CLASS_DECLARATION(orte_snapc_full_local_snapshot_t);
/** Term */
bool term;
};
typedef struct orte_snapc_full_app_snapshot_t orte_snapc_full_app_snapshot_t;
OBJ_CLASS_DECLARATION(orte_snapc_full_app_snapshot_t);
extern bool orte_snapc_full_skip_filem;
extern bool orte_snapc_full_skip_app;
extern bool orte_snapc_full_timing_enabled;
int orte_snapc_full_component_query(mca_base_module_t **module, int *priority);
@ -108,12 +132,11 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
int global_coord_finalize(void);
int global_coord_setup_job(orte_jobid_t jobid);
int global_coord_release_job(orte_jobid_t jobid);
int global_coord_vpid_assoc_update(orte_process_name_t local_coord,
orte_process_name_t proc_name);
int global_coord_vpid_state_update(orte_process_name_t proc_name,
size_t proc_ckpt_state,
char **proc_ckpt_ref,
char **proc_ckpt_loc);
int global_coord_orted_state_update(orte_process_name_t proc_name,
int proc_ckpt_state,
char **proc_ckpt_ref,
char **proc_ckpt_loc,
char **agent_ckpt);
/*
* Local Coordinator Functionality
*/
@ -122,7 +145,7 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
int local_coord_setup_job(orte_jobid_t jobid);
int local_coord_release_job(orte_jobid_t jobid);
int local_coord_job_state_update(orte_jobid_t jobid,
size_t job_ckpt_state,
int job_ckpt_state,
char **job_ckpt_ref,
char **job_ckpt_loc);

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -219,7 +219,12 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
opal_cr_currently_stalled = false;
app_pid = getpid();
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
if( orte_snapc_full_skip_app ) {
ret = ORTE_SUCCESS;
cr_state = OPAL_CRS_CONTINUE;
} else {
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
}
if( OPAL_EXISTS == ret ) {
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -35,6 +35,8 @@ static int snapc_full_open(void);
static int snapc_full_close(void);
bool orte_snapc_full_skip_filem = false;
bool orte_snapc_full_skip_app = false;
bool orte_snapc_full_timing_enabled = false;
/*
* Instantiate the public struct with all of our public information
@ -113,6 +115,22 @@ static int snapc_full_open(void)
&value);
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
"skip_app",
"Not for general use! For debugging only! Shortcut app level coord. [Default = disabled]",
false, false,
0,
&value);
orte_snapc_full_skip_app = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
"enable_timing",
"Enable timing information. [Default = disabled]",
false, false,
0,
&value);
orte_snapc_full_timing_enabled = OPAL_INT_TO_BOOL(value);
/*
* Debug Output
*/

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
@ -50,24 +50,24 @@ static orte_snapc_base_module_t loc_module = {
/*
* Global Snapshot structure
*/
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *obj);
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *obj);
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *obj);
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *obj);
OBJ_CLASS_INSTANCE(orte_snapc_full_global_snapshot_t,
orte_snapc_base_snapshot_t,
orte_snapc_full_global_construct,
orte_snapc_full_global_destruct);
OBJ_CLASS_INSTANCE(orte_snapc_full_orted_snapshot_t,
orte_snapc_base_global_snapshot_t,
orte_snapc_full_orted_construct,
orte_snapc_full_orted_destruct);
/*
* Local Snapshot structure
*/
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj);
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj);
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj);
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj);
OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
orte_snapc_base_snapshot_t,
orte_snapc_full_local_construct,
orte_snapc_full_local_destruct);
OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
orte_snapc_base_local_snapshot_t,
orte_snapc_full_app_construct,
orte_snapc_full_app_destruct);
/************************************
* Locally Global vars & functions :)
@ -77,29 +77,53 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
/************************
* Function Definitions
************************/
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *snapshot) {
snapshot->local_coord.vpid = 0;
snapshot->local_coord.jobid = 0;
/*
 * Constructor for the Global Coordinator's per-orted snapshot record.
 * Puts every field into its idle value: zeroed process name, the NONE
 * checkpoint state, no CRS component string, no termination request and
 * no FileM request attached.
 */
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
    orte_process_name_t *name = &snapshot->process_name;

    /* Pointer members start out unset; the destructor only releases non-NULL ones */
    snapshot->opal_crs      = NULL;
    snapshot->filem_request = NULL;

    /* Identity of the daemon this record describes */
    name->jobid = 0;
    name->vpid  = 0;

    /* Checkpoint bookkeeping */
    snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
    snapshot->term  = false;
}
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *snapshot) {
snapshot->local_coord.vpid = 0;
snapshot->local_coord.jobid = 0;
/*
 * Destructor for the Global Coordinator's per-orted snapshot record.
 * Frees the CRS component string, releases the FileM request if one is
 * attached, and resets the remaining fields to their idle values.
 */
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
    orte_process_name_t *name = &snapshot->process_name;

    /* free(NULL) is a no-op, so no guard is needed around the string */
    free( snapshot->opal_crs );
    snapshot->opal_crs = NULL;

    /* OBJ_RELEASE must only be handed a live object */
    if( snapshot->filem_request != NULL ) {
        OBJ_RELEASE(snapshot->filem_request);
        snapshot->filem_request = NULL;
    }

    /* Scalar members back to their idle values */
    name->jobid = 0;
    name->vpid  = 0;
    snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
    snapshot->term  = false;
}
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj) {
/*
 * Constructor for the Local Coordinator's per-application snapshot record.
 * Leaves the record with no named pipes, invalid pipe descriptors, an
 * inactive event handler, the NONE checkpoint state, a zero pid and no
 * termination request.
 */
void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj) {
    /* Named pipe paths and their descriptors: nothing opened yet */
    obj->comm_pipe_r    = NULL;
    obj->comm_pipe_r_fd = -1;
    obj->comm_pipe_w    = NULL;
    obj->comm_pipe_w_fd = -1;

    /* No read event is registered for the pipe */
    obj->is_eh_active = false;

    /* Process bookkeeping */
    obj->process_pid = 0;
    obj->ckpt_state  = ORTE_SNAPC_CKPT_STATE_NONE;
    obj->term        = false;
}
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj) {
if( NULL != obj->comm_pipe_r ) {
free(obj->comm_pipe_r);
obj->comm_pipe_r = NULL;
@ -113,9 +137,11 @@ void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
obj->comm_pipe_r_fd = -1;
obj->comm_pipe_w_fd = -1;
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
obj->is_eh_active = false;
obj->process_pid = 0;
obj->term = false;
}
/*

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -89,45 +89,61 @@ BEGIN_C_DECLS
/**
* States that a process can be in while checkpointing
*/
/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE 0
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 1
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING 2
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
/* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
#define ORTE_SNAPC_CKPT_STATE_ERROR 0
/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE 1
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 2
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING 3
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 4
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 5
/* Finished the checkpoint locally */
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL 6
/* File Transfer in progress */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 8
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 9
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 10
#define ORTE_SNAPC_CKPT_MAX 11
/**
* Definition of a orte local snapshot.
* Similar to the opal_crs_base_snapshot_t except that it
* contains process contact information.
*/
struct orte_snapc_base_snapshot_1_0_0_t {
opal_crs_base_snapshot_t crs_snapshot_super;
struct orte_snapc_base_local_snapshot_1_0_0_t {
/** List super object */
opal_list_item_t super;
/** ORTE Process name */
orte_process_name_t process_name;
/** PID of the application process that generated this snapshot */
pid_t process_pid;
/** State of the checkpoint */
size_t state;
/** Terminate this process after a checkpoint */
bool term;
};
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_1_0_0_t;
typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_snapshot_t);
/** State of the checkpoint */
int state;
/** Unique name of the local snapshot */
char * reference_name;
/** Local location of the local snapshot Absolute path */
char * local_location;
/** Remote location of the local snapshot Absolute path */
char * remote_location;
/** CRS agent */
char * opal_crs;
};
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
/**
* Definition of the global snapshot.
@ -138,12 +154,9 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
/** This is an object, so must have super */
opal_list_item_t super;
/** A list of orte_snapc_base_snapshot_ts */
opal_list_t snapshots;
/** A list of orte_snapc_base_local_snapshot_t's */
opal_list_t local_snapshots;
/* ORTE SnapC Component used to generate the global snapshot */
char * component_name;
/** Unique name of the global snapshot */
char * reference_name;
@ -152,12 +165,6 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
/** Sequence Number */
int seq_num;
/** Beginning timestamp */
char * start_time;
/** Ending timestamp */
char * end_time;
};
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -76,6 +76,8 @@
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
@ -108,11 +110,16 @@ static int global_sequence_num = 0;
*****************************************/
static bool listener_started = false;
static double timer_start = 0;
static double timer_last = 0;
static double get_time(void);
typedef struct {
bool help;
int pid;
bool term;
bool verbose;
int verbose_level;
orte_jobid_t req_hnp; /**< User Requested HNP */
bool nowait; /* Do not wait for checkpoint to complete before returning */
bool status; /* Display status messages while checkpoint is progressing */
@ -135,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL, NULL, NULL,
'V', NULL, NULL,
1,
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
{ NULL, NULL, NULL,
'\0', NULL, "term",
0,
@ -279,6 +292,7 @@ static int parse_args(int argc, char *argv[]) {
orte_checkpoint_globals.pid = -1;
orte_checkpoint_globals.term = false;
orte_checkpoint_globals.verbose = false;
orte_checkpoint_globals.verbose_level = 0;
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
orte_checkpoint_globals.nowait = false;
orte_checkpoint_globals.status = false;
@ -344,6 +358,14 @@ static int parse_args(int argc, char *argv[]) {
goto cleanup;
}
if(orte_checkpoint_globals.verbose_level < 0 ) {
orte_checkpoint_globals.verbose_level = 0;
}
if(orte_checkpoint_globals.verbose_level > 0) {
orte_checkpoint_globals.verbose = true;
}
/*
* If the user did not supply an hnp jobid, then they must
* supply the PID of MPIRUN
@ -474,7 +496,7 @@ static int ckpt_init(int argc, char *argv[]) {
*/
if( orte_checkpoint_globals.verbose ) {
orte_checkpoint_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level);
} else {
orte_checkpoint_globals.output = 0; /* Default=STDERR */
}
@ -661,6 +683,8 @@ notify_process_for_checkpoint(int term)
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
orte_checkpoint_globals.pid);
timer_start = get_time();
/***********************************
* Notify HNP of checkpoint request
* Send:
@ -709,18 +733,51 @@ notify_process_for_checkpoint(int term)
/***************
* Pretty Print
***************/
/*
 * Return the current time in seconds as a double.
 *
 * When OPAL_TIMER_USEC_NATIVE is set the OPAL microsecond timer is used;
 * otherwise the value comes from gettimeofday(). Either way the
 * microsecond part is scaled into the fractional seconds.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);

    /* Whole seconds first, then add the sub-second remainder */
    seconds  = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
#endif
}
/*
 * Print one progress line for the current checkpoint status.
 *
 * Each line has the form
 *     [<sec since previous line> / <sec since timer_start>] <state> - <ref>
 * where <ref> is the global snapshot handle, or "..." while no handle is
 * known yet. timer_last is updated on every call so that the first column
 * always measures the time spent since the previous report.
 *
 * Returns ORTE_SUCCESS.
 */
static int pretty_print_status(void) {
    char * state_str = NULL;
    double cur_time;

    cur_time = get_time();

    /* First report: there is no previous sample, so the step delta reads 0 */
    if( timer_last == 0 ) {
        timer_last = cur_time;
    }

    /*
     * The state string is handed back through the out-parameter and is
     * freed by this function below. It is requested exactly once; asking
     * for it twice would drop the first buffer.
     */
    orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);

    /* Never pass a NULL handle to "%s": print a placeholder instead */
    if( NULL != global_snapshot_handle ) {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - %s\n",
                    (cur_time - timer_last), (cur_time - timer_start),
                    25, state_str, global_snapshot_handle);
    } else {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - ...\n",
                    (cur_time - timer_last), (cur_time - timer_start),
                    25, state_str);
    }

    if( NULL != state_str) {
        free(state_str);
    }

    /* The next call reports its delta relative to this one */
    timer_last = cur_time;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -491,6 +491,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
orte_std_cntr_t i;
char *jobstr;
orte_jobid_t mask=0x0000ffff;
char * state_str = NULL;
for(i=0; i < num_jobs; i++) {
job = jobs[i];
@ -513,9 +514,10 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
len_slots = 6;
len_vpid_r = (int) strlen("Num Procs");
#if OPAL_ENABLE_FT == 1
len_ckpt_s = (int) (strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) < strlen("Ckpt State") ?
orte_snapc_ckpt_state_str(&state_str, job->ckpt_state);
len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ?
strlen("Ckpt State") :
strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) );
strlen(state_str) );
len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
(strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
strlen("Ckpt Ref") :
@ -525,6 +527,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
strlen("Ckpt Loc") :
strlen(job->ckpt_snapshot_loc) ) );
#else
state_str = NULL;
len_ckpt_s = -3;
len_ckpt_r = -3;
len_ckpt_l = -3;
@ -564,7 +567,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
printf("%*d | ", len_vpid_r, job->num_procs);
#if OPAL_ENABLE_FT == 1
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(job->ckpt_state));
printf("%*s | ", len_ckpt_s, state_str);
printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
"" :
job->ckpt_snapshot_ref) );
@ -597,6 +600,7 @@ static int pretty_print_vpids(orte_job_t *job) {
orte_proc_t *vpid;
orte_app_context_t *app;
char *o_proc_name;
char *state_str = NULL;
/*
* Calculate segment lengths
@ -663,8 +667,9 @@ static int pretty_print_vpids(orte_job_t *job) {
len_state = strlen(pretty_vpid_state(vpid->state));
#if OPAL_ENABLE_FT == 1
if( (int)strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)) > len_ckpt_s)
len_ckpt_s = strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state));
orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
if( (int)strlen(state_str) > len_ckpt_s)
len_ckpt_s = strlen(state_str);
if( NULL != vpid->ckpt_snapshot_ref &&
(int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
@ -673,6 +678,8 @@ static int pretty_print_vpids(orte_job_t *job) {
if( NULL != vpid->ckpt_snapshot_loc &&
(int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
#else
state_str = NULL;
#endif
}
@ -739,7 +746,7 @@ static int pretty_print_vpids(orte_job_t *job) {
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
#if OPAL_ENABLE_FT == 1
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(vpid->ckpt_state));
printf("%*s | ", len_ckpt_s, state_str);
printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
"" :
vpid->ckpt_snapshot_ref));

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -151,9 +151,10 @@ int
main(int argc, char *argv[])
{
int ret, exit_status = ORTE_SUCCESS;
pid_t child_pid;
pid_t child_pid = 0;
orte_snapc_base_global_snapshot_t *snapshot = NULL;
char *tmp_str = NULL;
/***************
* Initialize
***************/
@ -164,7 +165,10 @@ main(int argc, char *argv[])
snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t);
snapshot->reference_name = strdup(orte_restart_globals.filename);
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name));
orte_snapc_base_get_global_snapshot_directory(&tmp_str, snapshot->reference_name);
snapshot->local_location = opal_dirname(tmp_str);
free(tmp_str);
tmp_str = NULL;
/*
* Check for existence of the file
@ -453,11 +457,11 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
goto cleanup;
}
for(item = opal_list_get_first(&snapshot->snapshots);
item != opal_list_get_end(&snapshot->snapshots);
for(item = opal_list_get_first(&snapshot->local_snapshots);
item != opal_list_get_end(&snapshot->local_snapshots);
item = opal_list_get_next(item) ) {
orte_snapc_base_snapshot_t *vpid_snapshot;
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
orte_snapc_base_local_snapshot_t *vpid_snapshot;
vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
fprintf(appfile, "#\n");
fprintf(appfile, "# Old Process Name: %u.%u\n",
@ -467,13 +471,15 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
fprintf(appfile, "-np 1 ");
if(orte_restart_globals.preload) {
fprintf(appfile, "--preload-files %s/%s ",
vpid_snapshot->crs_snapshot_super.local_location,
vpid_snapshot->crs_snapshot_super.reference_name);
vpid_snapshot->local_location,
vpid_snapshot->reference_name);
fprintf(appfile, "--preload-files-dest-dir . ");
}
/* JJH: Make this match what the user originally specified on the command line */
fprintf(appfile, "-am ft-enable-cr ");
fprintf(appfile, " opal-restart ");
/* JJH: Make sure this changes if ever the default location of the local file is changed,
* currently it is safe to assume that it is in the current working directory.
*
@ -486,9 +492,9 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
else {
/* If we are *not* preloading the files, then point to the original checkpoint
* directory to access the checkpoint files. */
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->crs_snapshot_super.local_location);
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->local_location);
}
fprintf(appfile, "%s\n", vpid_snapshot->crs_snapshot_super.reference_name);
fprintf(appfile, "%s\n", vpid_snapshot->reference_name);
}
cleanup: