1
1

A bunch of improvements focused on Snapshot Coordination (SnapC) and File Management (FileM).

 * Improved timing in SnapC Full Global Coordinator
 * Improved scalability of the SnapC Full protocol
 * Minor improvements to the error reporting mechanisms in SnapC and FileM
 * Improved the memory usage of the metadata routines - now the owner of the data is more explicit.
 * Added a FileM hint to indicate when files stored locally can be moved to/from a globally mounted file system using just the 'cp' command instead of the 'rcp/scp' command. This yields a modest performance improvement. It can be enabled with the following SnapC MCA parameter: {{{snapc_base_global_shared=1}}}
 * Implement the ability to throttle the number of outgoing connections in FileM. At larger scales this type of explicit throttling helps prevent overwhelming the HNP machine. Default: 10, set via MCA parameter: {{{filem_rsh_max_outgoing}}}
 * Add a few diagnostic/debugging features to SnapC and FileM.

This commit was SVN r21131.
Этот коммит содержится в:
Josh Hursey 2009-04-30 16:55:39 +00:00
родитель 38aca518bd
Коммит 0deb009225
18 изменённых файлов: 2549 добавлений и 1976 удалений

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -71,7 +71,11 @@ ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) { ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req) {
req->local_target = NULL; req->local_target = NULL;
req->local_hint = ORTE_FILEM_HINT_NONE;
req->remote_target = NULL; req->remote_target = NULL;
req->remote_hint = ORTE_FILEM_HINT_NONE;
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN; req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
} }
@ -81,11 +85,13 @@ ORTE_DECLSPEC void orte_filem_base_file_set_destruct( orte_filem_base_file_set_t
free(req->local_target); free(req->local_target);
req->local_target = NULL; req->local_target = NULL;
} }
req->local_hint = ORTE_FILEM_HINT_NONE;
if( NULL != req->remote_target ) { if( NULL != req->remote_target ) {
free(req->remote_target); free(req->remote_target);
req->remote_target = NULL; req->remote_target = NULL;
} }
req->remote_hint = ORTE_FILEM_HINT_NONE;
req->target_flag = ORTE_FILEM_TYPE_UNKNOWN; req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
} }

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -77,6 +77,7 @@ int orte_filem_base_open(void)
NULL, &str_value); NULL, &str_value);
if( NULL != str_value ) { if( NULL != str_value ) {
free(str_value); free(str_value);
str_value = NULL;
} }
/* Open up all available components */ /* Open up all available components */

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -54,6 +54,13 @@ extern "C" {
#define ORTE_FILEM_MOVE_TYPE_RM 2 #define ORTE_FILEM_MOVE_TYPE_RM 2
#define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3 #define ORTE_FILEM_MOVE_TYPE_UNKNOWN 3
/**
* Hints that describe the local or remote file target for
* optimization purposes.
*/
#define ORTE_FILEM_HINT_NONE 0
#define ORTE_FILEM_HINT_SHARED 1
/** /**
* Define a Process Set * Define a Process Set
* *
@ -92,9 +99,15 @@ struct orte_filem_base_file_set_1_0_0_t {
/* Local file reference */ /* Local file reference */
char * local_target; char * local_target;
/* Local file reference hints */
int local_hint;
/* Remove file reference */ /* Remove file reference */
char * remote_target; char * remote_target;
/* Remote file reference hints */
int remote_hint;
/* Type of file to move */ /* Type of file to move */
int target_flag; int target_flag;
}; };

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -37,7 +37,7 @@ static int filem_rsh_open(void);
static int filem_rsh_close(void); static int filem_rsh_close(void);
int orte_filem_rsh_max_incomming = 10; int orte_filem_rsh_max_incomming = 10;
int orte_filem_rsh_max_outgoing = 10; int orte_filem_rsh_max_outgoing = 10;
/* /*
* Instantiate the public struct with all of our public information * Instantiate the public struct with all of our public information

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -42,6 +42,7 @@
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
#include "opal/threads/condition.h" #include "opal/threads/condition.h"
@ -632,41 +633,51 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
} }
/* Do not check a local get() operation, to help supress the warnings from the HNP */ /* Do not check a local get() operation, to help supress the warnings from the HNP */
else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) { else if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &p_set->source, &p_set->sink) ) {
char *base = NULL;
asprintf(&base, "%s/%s", f_set->local_target, opal_basename(f_set->remote_target));
/* /*
* The file should not exist if we are getting a file with the * The file should not exist if we are getting a file with the
* same name since we do not want to overwrite the filename * same name since we do not want to overwrite the filename
* without the users consent. * without the users consent.
*/ */
if( 0 == access(f_set->local_target, R_OK) ) { if( 0 == access(base, R_OK) ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination\n", "filem:rsh: copy(): %s -> %s: Error: Cannot move file %s to %s. Already exists at destination (%s)\n",
ORTE_NAME_PRINT(&p_set->source), ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink), ORTE_NAME_PRINT(&p_set->sink),
f_set->remote_target, f_set->remote_target,
f_set->local_target)); f_set->local_target, base));
orte_show_help("help-orte-filem-rsh.txt", orte_show_help("help-orte-filem-rsh.txt",
"orte-filem-rsh:get-file-exists", "orte-filem-rsh:get-file-exists",
true, f_set->local_target, orte_process_info.nodename); true, f_set->local_target, orte_process_info.nodename);
free(base);
base = NULL;
request->is_done[cur_index] = true; request->is_done[cur_index] = true;
request->is_active[cur_index] = true; request->is_active[cur_index] = true;
request->exit_status[cur_index] = -1; request->exit_status[cur_index] = -1;
goto continue_set; goto continue_set;
} }
free(base);
base = NULL;
} }
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) { if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n", "filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
ORTE_NAME_PRINT(&p_set->source), ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink), ORTE_NAME_PRINT(&p_set->sink),
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->local_target, f_set->local_target,
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->remote_target)); f_set->remote_target));
} else { } else {
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: copy(): %s -> %s: Moving file %s to %s\n", "filem:rsh: copy(): %s -> %s: Moving file %s %s to %s %s\n",
ORTE_NAME_PRINT(&p_set->source), ORTE_NAME_PRINT(&p_set->source),
ORTE_NAME_PRINT(&p_set->sink), ORTE_NAME_PRINT(&p_set->sink),
(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->remote_target, f_set->remote_target,
(f_set->local_hint == ORTE_FILEM_HINT_SHARED ? "(S)" : ""),
f_set->local_target)); f_set->local_target));
} }
@ -736,12 +747,20 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
* If this is the put() routine * If this is the put() routine
*/ */
if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) { if( request->movement_type == ORTE_FILEM_MOVE_TYPE_PUT ) {
asprintf(&command, "%s %s %s %s:%s ", /* Use a local 'cp' when able */
mca_filem_rsh_component.cp_command, if(f_set->remote_hint == ORTE_FILEM_HINT_SHARED ) {
dir_arg, asprintf(&command, "cp %s %s %s ",
f_set->local_target, dir_arg,
remote_machine, f_set->local_target,
remote_file); remote_file);
} else {
asprintf(&command, "%s %s %s %s:%s ",
mca_filem_rsh_component.cp_command,
dir_arg,
f_set->local_target,
remote_machine,
remote_file);
}
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
"filem:rsh:put about to execute [%s]", command)); "filem:rsh:put about to execute [%s]", command));
@ -758,12 +777,22 @@ static int orte_filem_rsh_start_copy(orte_filem_base_request_t *request) {
* ow it is the get() routine * ow it is the get() routine
*/ */
else { else {
asprintf(&command, "%s %s %s:%s %s ", /* Use a local 'cp' when able */
mca_filem_rsh_component.cp_command, if(f_set->local_hint == ORTE_FILEM_HINT_SHARED ) {
dir_arg, asprintf(&command, "%s %s cp %s %s %s ",
remote_machine, mca_filem_rsh_component.remote_sh_command,
remote_file, remote_machine,
f_set->local_target); dir_arg,
remote_file,
f_set->local_target);
} else {
asprintf(&command, "%s %s %s:%s %s ",
mca_filem_rsh_component.cp_command,
dir_arg,
remote_machine,
remote_file,
f_set->local_target);
}
OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle, OPAL_OUTPUT_VERBOSE((17, mca_filem_rsh_component.super.output_handle,
"filem:rsh:get about to execute [%s]", command)); "filem:rsh:get about to execute [%s]", command));

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -75,8 +75,8 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
/** /**
* Global Snapshot Object Maintenance functions * Global Snapshot Object Maintenance functions
*/ */
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *obj); void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *obj);
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *obj); void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *obj);
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj); void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *obj);
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj); void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *obj);
@ -132,24 +132,26 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place; ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq; ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir; ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
ORTE_DECLSPEC extern bool orte_snapc_base_is_global_dir_shared;
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number; ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
/** /**
* Some utility functions * Some utility functions
*/ */
ORTE_DECLSPEC char * orte_snapc_ckpt_state_str(size_t state); ORTE_DECLSPEC int orte_snapc_ckpt_state_str(char ** state_str, int state);
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid); ORTE_DECLSPEC int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name); ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name); ORTE_DECLSPEC int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_global_snapshot_name);
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
bool empty_metadata); bool empty_metadata);
ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref); ORTE_DECLSPEC int orte_snapc_base_add_timestamp(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc, ORTE_DECLSPEC int orte_snapc_base_add_vpid_metadata(orte_process_name_t *proc,
char * global_snapshot_ref, char * global_snapshot_ref,
char *snapshot_ref, char *snapshot_ref,
char *snapshot_location); char *snapshot_location,
char *crs_agent);
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref); ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot); ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -41,6 +41,7 @@
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
@ -68,27 +69,51 @@ size_t orte_snapc_base_snapshot_seq_number = 0;
/****************** /******************
* Object stuff * Object stuff
******************/ ******************/
OBJ_CLASS_INSTANCE(orte_snapc_base_snapshot_t, OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
opal_crs_base_snapshot_t, opal_list_item_t,
orte_snapc_base_snapshot_construct, orte_snapc_base_local_snapshot_construct,
orte_snapc_base_snapshot_destruct); orte_snapc_base_local_snapshot_destruct);
void orte_snapc_base_snapshot_construct(orte_snapc_base_snapshot_t *snapshot) void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
{ {
snapshot->process_name.jobid = 0; snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0; snapshot->process_name.vpid = 0;
snapshot->process_pid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
snapshot->term = false;
snapshot->reference_name = NULL;
snapshot->local_location = NULL;
snapshot->remote_location = NULL;
snapshot->opal_crs = NULL;
} }
void orte_snapc_base_snapshot_destruct( orte_snapc_base_snapshot_t *snapshot) void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
{ {
snapshot->process_name.jobid = 0; snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0; snapshot->process_name.vpid = 0;
snapshot->process_pid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
snapshot->term = false;
if( NULL != snapshot->reference_name ) {
free(snapshot->reference_name);
snapshot->reference_name = NULL;
}
if( NULL != snapshot->local_location ) {
free(snapshot->local_location);
snapshot->local_location = NULL;
}
if( NULL != snapshot->remote_location ) {
free(snapshot->remote_location);
snapshot->remote_location = NULL;
}
if( NULL != snapshot->opal_crs ) {
free(snapshot->opal_crs);
snapshot->opal_crs = NULL;
}
} }
/****/ /****/
@ -99,51 +124,38 @@ OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot) void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
{ {
OBJ_CONSTRUCT(&(snapshot->snapshots), opal_list_t); char *tmp_dir = NULL;
snapshot->component_name = NULL; OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
snapshot->reference_name = orte_snapc_base_unique_global_snapshot_name(getpid());
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name)); orte_snapc_base_unique_global_snapshot_name(&(snapshot->reference_name), getpid());
orte_snapc_base_get_global_snapshot_directory(&tmp_dir, snapshot->reference_name);
snapshot->local_location = opal_dirname(tmp_dir);
free(tmp_dir);
snapshot->seq_num = 0; snapshot->seq_num = 0;
snapshot->start_time = NULL;
snapshot->end_time = NULL;
} }
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot) void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
{ {
opal_list_item_t* item = NULL; opal_list_item_t* item = NULL;
while (NULL != (item = opal_list_remove_first(&snapshot->snapshots))) { while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
OBJ_DESTRUCT(&(snapshot->snapshots)); OBJ_DESTRUCT(&(snapshot->local_snapshots));
if(NULL != snapshot->reference_name) { if(NULL != snapshot->reference_name) {
free(snapshot->reference_name); free(snapshot->reference_name);
snapshot->reference_name = NULL; snapshot->reference_name = NULL;
} }
if(NULL != snapshot->component_name) {
free(snapshot->component_name);
snapshot->component_name = NULL;
}
if(NULL != snapshot->local_location) { if(NULL != snapshot->local_location) {
free(snapshot->local_location); free(snapshot->local_location);
snapshot->local_location = NULL; snapshot->local_location = NULL;
} }
if(NULL != snapshot->start_time) {
free(snapshot->start_time);
snapshot->start_time = NULL;
}
if(NULL != snapshot->end_time) {
free(snapshot->end_time);
snapshot->end_time = NULL;
}
snapshot->seq_num = 0; snapshot->seq_num = 0;
} }
@ -198,6 +210,7 @@ int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
ORTE_RML_PERSISTENT, ORTE_RML_PERSISTENT,
snapc_none_global_cmdline_request, snapc_none_global_cmdline_request,
NULL))) { NULL))) {
ORTE_ERROR_LOG(rc);
exit_status = rc; exit_status = rc;
goto cleanup; goto cleanup;
} }
@ -238,6 +251,7 @@ static void snapc_none_global_cmdline_request(int status,
n = 1; n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) { if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -251,6 +265,7 @@ static void snapc_none_global_cmdline_request(int status,
* Do the basic handshake with the orte_checkpoint command * Do the basic handshake with the orte_checkpoint command
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) { if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, &term, &jobid)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -259,6 +274,7 @@ static void snapc_none_global_cmdline_request(int status,
* Respond with an invalid response * Respond with an invalid response
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) { if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -312,6 +328,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
"%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -322,6 +339,7 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -373,6 +391,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
} }
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) { if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -382,6 +401,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -393,6 +413,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -401,6 +422,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -411,6 +433,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n", "%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
ret, __LINE__); ret, __LINE__);
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -433,42 +456,36 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
/***************************** /*****************************
* Snapshot metadata functions * Snapshot metadata functions
*****************************/ *****************************/
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid) int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid)
{ {
char * uniq_name;
if( NULL == orte_snapc_base_global_snapshot_ref ) { if( NULL == orte_snapc_base_global_snapshot_ref ) {
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid); asprintf(name_str, "ompi_global_snapshot_%d.ckpt", pid);
} }
else { else {
uniq_name = strdup(orte_snapc_base_global_snapshot_ref); *name_str = strdup(orte_snapc_base_global_snapshot_ref);
} }
return uniq_name; return ORTE_SUCCESS;
} }
char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name) int orte_snapc_base_get_global_snapshot_metadata_file(char **file_name, char *uniq_snapshot_name)
{ {
char * path = NULL; asprintf(file_name, "%s/%s/%s",
asprintf(&path, "%s/%s/%s",
orte_snapc_base_global_snapshot_dir, orte_snapc_base_global_snapshot_dir,
uniq_snapshot_name, uniq_snapshot_name,
orte_snapc_base_metadata_filename); orte_snapc_base_metadata_filename);
return path; return ORTE_SUCCESS;
} }
char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name) int orte_snapc_base_get_global_snapshot_directory(char **dir_name, char *uniq_snapshot_name)
{ {
char * dir_name = NULL; asprintf(dir_name, "%s/%s/%d",
asprintf(&dir_name, "%s/%s/%d",
orte_snapc_base_global_snapshot_dir, orte_snapc_base_global_snapshot_dir,
uniq_snapshot_name, uniq_snapshot_name,
(int)orte_snapc_base_snapshot_seq_number); (int)orte_snapc_base_snapshot_seq_number);
return dir_name; return ORTE_SUCCESS;
} }
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata) int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
@ -482,8 +499,9 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
/* /*
* Make the snapshot directory from the uniq_global_snapshot_name * Make the snapshot directory from the uniq_global_snapshot_name
*/ */
dir_name = orte_snapc_base_get_global_snapshot_directory(uniq_global_snapshot_name); orte_snapc_base_get_global_snapshot_directory(&dir_name, uniq_global_snapshot_name);
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) { if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(dir_name, my_mode)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -491,13 +509,14 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
/* /*
* Initialize the metadata file at the top of that directory. * Initialize the metadata file at the top of that directory.
*/ */
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(uniq_global_snapshot_name); orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, uniq_global_snapshot_name);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) { if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output, opal_output(orte_snapc_base_output,
"%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n", "%s) base:init_global_snapshot_directory: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname); meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -529,7 +548,7 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
if(NULL != meta_data_fname) if(NULL != meta_data_fname)
free(meta_data_fname); free(meta_data_fname);
return OPAL_SUCCESS; return ORTE_SUCCESS;
} }
/* /*
@ -575,13 +594,14 @@ int orte_snapc_base_add_timestamp(char * global_snapshot_ref)
char * meta_data_fname = NULL; char * meta_data_fname = NULL;
time_t timestamp; time_t timestamp;
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref); orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) { if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output, opal_output(orte_snapc_base_output,
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n", "%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname); meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -607,13 +627,14 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
/* Add the final timestamp */ /* Add the final timestamp */
orte_snapc_base_add_timestamp(global_snapshot_ref); orte_snapc_base_add_timestamp(global_snapshot_ref);
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref); orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) { if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output, opal_output(orte_snapc_base_output,
"%s) base:add_timestamp: Error: Unable to open the file (%s)\n", "%s) base:add_timestamp: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname); meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -633,23 +654,28 @@ int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
char * global_snapshot_ref, char * global_snapshot_ref,
char *snapshot_ref, char *snapshot_ref,
char *snapshot_location) char *snapshot_location,
char *crs_agent)
{ {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
FILE * meta_data = NULL; FILE * meta_data = NULL;
char * meta_data_fname = NULL; char * meta_data_fname = NULL;
char * crs_comp = NULL; char * crs_comp = NULL;
char * local_dir = NULL;
char * proc_name = NULL; char * proc_name = NULL;
int prev_pid = 0; int prev_pid = 0;
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref); if( NULL == snapshot_location ) {
return ORTE_ERROR;
}
orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot_ref);
if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) { if (NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
opal_output(orte_snapc_base_output, opal_output(orte_snapc_base_output,
"%s) base:add_metadata: Error: Unable to open the file (%s)\n", "%s) base:add_metadata: Error: Unable to open the file (%s)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
meta_data_fname); meta_data_fname);
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -663,20 +689,21 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
orte_util_convert_process_name_to_string(&proc_name, proc); orte_util_convert_process_name_to_string(&proc_name, proc);
/* Extract the checkpointer */ /* Extract the checkpointer */
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) { if( NULL == crs_agent ) {
exit_status = ORTE_ERROR; if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
goto cleanup; exit_status = ret;
ORTE_ERROR_LOG(ret);
goto cleanup;
}
} else {
crs_comp = strdup(crs_agent);
} }
/* get the base of the location */
local_dir = strdup(snapshot_location);
local_dir = opal_dirname(local_dir);
/* Write the string */ /* Write the string */
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name); fprintf(meta_data, "%s%s\n", SNAPC_METADATA_PROCESS, proc_name);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp); fprintf(meta_data, "%s%s\n", SNAPC_METADATA_CRS_COMP, crs_comp);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref); fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_REF, snapshot_ref);
fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, local_dir); fprintf(meta_data, "%s%s\n", SNAPC_METADATA_SNAP_LOC, snapshot_location);
cleanup: cleanup:
if( NULL != meta_data ) if( NULL != meta_data )
@ -684,9 +711,6 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
if( NULL != meta_data_fname) if( NULL != meta_data_fname)
free(meta_data_fname); free(meta_data_fname);
if( NULL != local_dir)
free(local_dir);
return exit_status; return exit_status;
} }
@ -698,13 +722,14 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
int next_seq_int; int next_seq_int;
char * token = NULL; char * token = NULL;
char * value = NULL; char * value = NULL;
orte_snapc_base_snapshot_t *vpid_snapshot = NULL; orte_snapc_base_local_snapshot_t *vpid_snapshot = NULL;
/* /*
* Open the metadata file * Open the metadata file
*/ */
meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot->reference_name); orte_snapc_base_get_global_snapshot_metadata_file(&meta_data_fname, global_snapshot->reference_name);
if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) { if (NULL == (meta_data = fopen(meta_data_fname, "r")) ) {
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -742,12 +767,7 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
break; break;
} }
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) { else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
if( NULL == global_snapshot->start_time) { ;
global_snapshot->start_time = strdup(value);
}
else {
global_snapshot->end_time = strdup(value);
}
} }
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) { else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
orte_process_name_t proc; orte_process_name_t proc;
@ -756,29 +776,29 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
/* Not the first process, so append it to the list */ /* Not the first process, so append it to the list */
if( NULL != vpid_snapshot) { if( NULL != vpid_snapshot) {
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super)); opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
} }
vpid_snapshot = OBJ_NEW(orte_snapc_base_snapshot_t); vpid_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t);
vpid_snapshot->process_name.jobid = proc.jobid; vpid_snapshot->process_name.jobid = proc.jobid;
vpid_snapshot->process_name.vpid = proc.vpid; vpid_snapshot->process_name.vpid = proc.vpid;
} }
else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) { else if(0 == strncmp(SNAPC_METADATA_CRS_COMP, token, strlen(SNAPC_METADATA_CRS_COMP)) ) {
vpid_snapshot->crs_snapshot_super.component_name = strdup(value); vpid_snapshot->opal_crs = strdup(value);
} }
else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) { else if(0 == strncmp(SNAPC_METADATA_SNAP_REF, token, strlen(SNAPC_METADATA_SNAP_REF)) ) {
vpid_snapshot->crs_snapshot_super.reference_name = strdup(value); vpid_snapshot->reference_name = strdup(value);
} }
else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) { else if(0 == strncmp(SNAPC_METADATA_SNAP_LOC, token, strlen(SNAPC_METADATA_SNAP_LOC)) ) {
vpid_snapshot->crs_snapshot_super.local_location = strdup(value); vpid_snapshot->local_location = strdup(value);
vpid_snapshot->crs_snapshot_super.remote_location = strdup(value); vpid_snapshot->remote_location = strdup(value);
} }
} while(0 == feof(meta_data) ); } while(0 == feof(meta_data) );
/* Append the last item */ /* Append the last item */
if( NULL != vpid_snapshot) { if( NULL != vpid_snapshot) {
opal_list_append(&global_snapshot->snapshots, &(vpid_snapshot->crs_snapshot_super.super)); opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
} }
cleanup: cleanup:
@ -960,34 +980,40 @@ static int metadata_extract_next_token(FILE *file, char **token, char **value)
return exit_status; return exit_status;
} }
char * orte_snapc_ckpt_state_str(size_t state) int orte_snapc_ckpt_state_str(char ** state_str, int state)
{ {
switch(state) { switch(state) {
case ORTE_SNAPC_CKPT_STATE_NONE: case ORTE_SNAPC_CKPT_STATE_NONE:
return strdup(" -- "); *state_str = strdup(" -- ");
break; break;
case ORTE_SNAPC_CKPT_STATE_REQUEST: case ORTE_SNAPC_CKPT_STATE_REQUEST:
return strdup("Requested"); *state_str = strdup("Requested");
break; break;
case ORTE_SNAPC_CKPT_STATE_PENDING_TERM: case ORTE_SNAPC_CKPT_STATE_PENDING_TERM:
return strdup("Pending (Termination)"); *state_str = strdup("Pending (Termination)");
break; break;
case ORTE_SNAPC_CKPT_STATE_PENDING: case ORTE_SNAPC_CKPT_STATE_PENDING:
return strdup("Pending"); *state_str = strdup("Pending");
break; break;
case ORTE_SNAPC_CKPT_STATE_RUNNING: case ORTE_SNAPC_CKPT_STATE_RUNNING:
return strdup("Running"); *state_str = strdup("Running");
break; break;
case ORTE_SNAPC_CKPT_STATE_FILE_XFER: case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
return strdup("File Transfer"); *state_str = strdup("File Transfer");
break; break;
case ORTE_SNAPC_CKPT_STATE_FINISHED: case ORTE_SNAPC_CKPT_STATE_FINISHED:
return strdup("Finished"); *state_str = strdup("Finished");
break;
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
*state_str = strdup("Locally Finished");
break; break;
case ORTE_SNAPC_CKPT_STATE_ERROR: case ORTE_SNAPC_CKPT_STATE_ERROR:
return strdup("Error"); *state_str = strdup("Error");
break; break;
default: default:
return strdup("Unknown"); asprintf(state_str, "Unknown %d", state);
break;
} }
return ORTE_SUCCESS;
} }

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2008 The Trustees of the University of Tennessee. * Copyright (c) 2004-2008 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -74,6 +74,7 @@ char * orte_snapc_base_global_snapshot_ref = NULL;
bool orte_snapc_base_store_in_place = true; bool orte_snapc_base_store_in_place = true;
bool orte_snapc_base_store_only_one_seq = false; bool orte_snapc_base_store_only_one_seq = false;
bool orte_snapc_base_establish_global_snapshot_dir = false; bool orte_snapc_base_establish_global_snapshot_dir = false;
bool orte_snapc_base_is_global_dir_shared = false;
/** /**
* Function for finding and opening either all MCA components, * Function for finding and opening either all MCA components,
@ -97,9 +98,20 @@ int orte_snapc_base_open(void)
opal_home_directory(), opal_home_directory(),
&orte_snapc_base_global_snapshot_dir); &orte_snapc_base_global_snapshot_dir);
mca_base_param_reg_int_name("snapc",
"base_global_shared",
"If the global_snapshot_dir is on a shared file system all nodes can access, "
"then the checkpoint files can be copied more efficiently when FileM is used."
" [Default = disabled]",
false, false,
0,
&value);
orte_snapc_base_is_global_dir_shared = OPAL_INT_TO_BOOL(value);
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output, OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
"snapc:base: open: base_global_snapshot_dir = %s", "snapc:base: open: base_global_snapshot_dir = %s (%s)",
orte_snapc_base_global_snapshot_dir)); orte_snapc_base_global_snapshot_dir,
(orte_snapc_base_is_global_dir_shared ? "Shared" : "Local") ));
/* /*
* Store the checkpoint files in their final location. * Store the checkpoint files in their final location.
@ -173,8 +185,8 @@ int orte_snapc_base_open(void)
if( NULL == orte_snapc_base_global_snapshot_loc ) { if( NULL == orte_snapc_base_global_snapshot_loc ) {
char *t1 = NULL; char *t1 = NULL;
char *t2 = NULL; char *t2 = NULL;
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) ); orte_snapc_base_unique_global_snapshot_name(&t1, getpid() );
t2 = orte_snapc_base_get_global_snapshot_directory( t1 ); orte_snapc_base_get_global_snapshot_directory(&t2, t1 );
orte_snapc_base_global_snapshot_loc = strdup(t2); orte_snapc_base_global_snapshot_loc = strdup(t2);
free(t1); free(t1);
free(t2); free(t2);

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -30,6 +30,7 @@
#include "opal/mca/mca.h" #include "opal/mca/mca.h"
#include "opal/event/event.h" #include "opal/event/event.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
BEGIN_C_DECLS BEGIN_C_DECLS
@ -39,35 +40,54 @@ BEGIN_C_DECLS
*/ */
typedef uint8_t orte_snapc_full_cmd_flag_t; typedef uint8_t orte_snapc_full_cmd_flag_t;
#define ORTE_SNAPC_FULL_CMD OPAL_UINT8 #define ORTE_SNAPC_FULL_CMD OPAL_UINT8
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1 #define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
#define ORTE_SNAPC_FULL_UPDATE_PROC_STATE_CMD 2 #define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD 2
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 3 #define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_CMD 3
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 4 #define ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_QUICK_CMD 4
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 5
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 6
#define ORTE_SNAPC_FULL_MAX 7
/* /*
* Local Component structures * Local Component structures
*/ */
struct orte_snapc_full_component_t { struct orte_snapc_full_component_t {
orte_snapc_base_component_t super; /** Base SNAPC component */ orte_snapc_base_component_t super; /** Base SNAPC component */
}; };
typedef struct orte_snapc_full_component_t orte_snapc_full_component_t; typedef struct orte_snapc_full_component_t orte_snapc_full_component_t;
OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component; OPAL_MODULE_DECLSPEC extern orte_snapc_full_component_t mca_snapc_full_component;
/*
 * Global Coordinator per orted metadata
 */
struct orte_snapc_full_orted_snapshot_t {
    /** Base SNAPC Global snapshot type */
    orte_snapc_base_global_snapshot_t super;

    /** ORTE Process name */
    orte_process_name_t process_name;

    /** State of the checkpoint (constructed as ORTE_SNAPC_CKPT_STATE_NONE) */
    int state;

    /** OPAL CRS Component (heap string, free()'d by the destructor) */
    char * opal_crs;

    /** Term flag */
    bool term;

    /** FileM request (OBJ_RELEASE'd by the destructor when non-NULL) */
    orte_filem_base_request_t *filem_request;
};
typedef struct orte_snapc_full_orted_snapshot_t orte_snapc_full_orted_snapshot_t;
OBJ_CLASS_DECLARATION(orte_snapc_full_orted_snapshot_t);
OBJ_CLASS_DECLARATION(orte_snapc_full_global_snapshot_t); /*
* Local Coordinator per app metadata
struct orte_snapc_full_local_snapshot_t { */
struct orte_snapc_full_app_snapshot_t {
/** Base SNAPC Global snapshot type */ /** Base SNAPC Global snapshot type */
orte_snapc_base_snapshot_t super; orte_snapc_base_local_snapshot_t super;
/** Named Pipe Read and Write */ /** Named Pipe Read and Write */
char * comm_pipe_r; char * comm_pipe_r;
@ -79,14 +99,18 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
struct opal_event comm_pipe_r_eh; struct opal_event comm_pipe_r_eh;
bool is_eh_active; bool is_eh_active;
/** State of the process wrt checkpointing */ /** Process pid */
int ckpt_state; pid_t process_pid;
};
typedef struct orte_snapc_full_local_snapshot_t orte_snapc_full_local_snapshot_t;
OBJ_CLASS_DECLARATION(orte_snapc_full_local_snapshot_t); /** Term */
bool term;
};
typedef struct orte_snapc_full_app_snapshot_t orte_snapc_full_app_snapshot_t;
OBJ_CLASS_DECLARATION(orte_snapc_full_app_snapshot_t);
extern bool orte_snapc_full_skip_filem; extern bool orte_snapc_full_skip_filem;
extern bool orte_snapc_full_skip_app;
extern bool orte_snapc_full_timing_enabled;
int orte_snapc_full_component_query(mca_base_module_t **module, int *priority); int orte_snapc_full_component_query(mca_base_module_t **module, int *priority);
@ -108,12 +132,11 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
int global_coord_finalize(void); int global_coord_finalize(void);
int global_coord_setup_job(orte_jobid_t jobid); int global_coord_setup_job(orte_jobid_t jobid);
int global_coord_release_job(orte_jobid_t jobid); int global_coord_release_job(orte_jobid_t jobid);
int global_coord_vpid_assoc_update(orte_process_name_t local_coord, int global_coord_orted_state_update(orte_process_name_t proc_name,
orte_process_name_t proc_name); int proc_ckpt_state,
int global_coord_vpid_state_update(orte_process_name_t proc_name, char **proc_ckpt_ref,
size_t proc_ckpt_state, char **proc_ckpt_loc,
char **proc_ckpt_ref, char **agent_ckpt);
char **proc_ckpt_loc);
/* /*
* Local Coordinator Functionality * Local Coordinator Functionality
*/ */
@ -122,7 +145,7 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
int local_coord_setup_job(orte_jobid_t jobid); int local_coord_setup_job(orte_jobid_t jobid);
int local_coord_release_job(orte_jobid_t jobid); int local_coord_release_job(orte_jobid_t jobid);
int local_coord_job_state_update(orte_jobid_t jobid, int local_coord_job_state_update(orte_jobid_t jobid,
size_t job_ckpt_state, int job_ckpt_state,
char **job_ckpt_ref, char **job_ckpt_ref,
char **job_ckpt_loc); char **job_ckpt_loc);

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -219,7 +219,12 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
opal_cr_currently_stalled = false; opal_cr_currently_stalled = false;
app_pid = getpid(); app_pid = getpid();
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state); if( orte_snapc_full_skip_app ) {
ret = ORTE_SUCCESS;
cr_state = OPAL_CRS_CONTINUE;
} else {
ret = opal_cr_inc_core(app_pid, local_snapshot, app_term, &cr_state);
}
if( OPAL_EXISTS == ret ) { if( OPAL_EXISTS == ret ) {
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n", "App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -35,6 +35,8 @@ static int snapc_full_open(void);
static int snapc_full_close(void); static int snapc_full_close(void);
bool orte_snapc_full_skip_filem = false; bool orte_snapc_full_skip_filem = false;
bool orte_snapc_full_skip_app = false;
bool orte_snapc_full_timing_enabled = false;
/* /*
* Instantiate the public struct with all of our public information * Instantiate the public struct with all of our public information
@ -113,6 +115,22 @@ static int snapc_full_open(void)
&value); &value);
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value); orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
"skip_app",
"Not for general use! For debugging only! Shortcut app level coord. [Default = disabled]",
false, false,
0,
&value);
orte_snapc_full_skip_app = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
"enable_timing",
"Enable timing information. [Default = disabled]",
false, false,
0,
&value);
orte_snapc_full_timing_enabled = OPAL_INT_TO_BOOL(value);
/* /*
* Debug Output * Debug Output
*/ */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University. * Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved. * All rights reserved.
@ -50,24 +50,24 @@ static orte_snapc_base_module_t loc_module = {
/* /*
* Global Snapshot structure * Global Snapshot structure
*/ */
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *obj); void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *obj);
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *obj); void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *obj);
OBJ_CLASS_INSTANCE(orte_snapc_full_global_snapshot_t, OBJ_CLASS_INSTANCE(orte_snapc_full_orted_snapshot_t,
orte_snapc_base_snapshot_t, orte_snapc_base_global_snapshot_t,
orte_snapc_full_global_construct, orte_snapc_full_orted_construct,
orte_snapc_full_global_destruct); orte_snapc_full_orted_destruct);
/* /*
* Local Snapshot structure * Local Snapshot structure
*/ */
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj); void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj);
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj); void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj);
OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t, OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
orte_snapc_base_snapshot_t, orte_snapc_base_local_snapshot_t,
orte_snapc_full_local_construct, orte_snapc_full_app_construct,
orte_snapc_full_local_destruct); orte_snapc_full_app_destruct);
/************************************ /************************************
* Locally Global vars & functions :) * Locally Global vars & functions :)
@ -77,29 +77,53 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_local_snapshot_t,
/************************ /************************
* Function Definitions * Function Definitions
************************/ ************************/
void orte_snapc_full_global_construct(orte_snapc_full_global_snapshot_t *snapshot) { void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->local_coord.vpid = 0; snapshot->process_name.jobid = 0;
snapshot->local_coord.jobid = 0; snapshot->process_name.vpid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
snapshot->opal_crs = NULL;
snapshot->term = false;
snapshot->filem_request = NULL;
} }
void orte_snapc_full_global_destruct( orte_snapc_full_global_snapshot_t *snapshot) { void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->local_coord.vpid = 0; snapshot->process_name.jobid = 0;
snapshot->local_coord.jobid = 0; snapshot->process_name.vpid = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
if( NULL != snapshot->opal_crs ) {
free( snapshot->opal_crs );
snapshot->opal_crs = NULL;
}
snapshot->term = false;
if( NULL != snapshot->filem_request ) {
OBJ_RELEASE(snapshot->filem_request);
snapshot->filem_request = NULL;
}
} }
void orte_snapc_full_local_construct(orte_snapc_full_local_snapshot_t *obj) { void orte_snapc_full_app_construct(orte_snapc_full_app_snapshot_t *obj) {
obj->comm_pipe_r = NULL; obj->comm_pipe_r = NULL;
obj->comm_pipe_w = NULL; obj->comm_pipe_w = NULL;
obj->comm_pipe_r_fd = -1; obj->comm_pipe_r_fd = -1;
obj->comm_pipe_w_fd = -1; obj->comm_pipe_w_fd = -1;
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
obj->is_eh_active = false; obj->is_eh_active = false;
obj->process_pid = 0;
obj->term = false;
} }
void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) { void orte_snapc_full_app_destruct( orte_snapc_full_app_snapshot_t *obj) {
if( NULL != obj->comm_pipe_r ) { if( NULL != obj->comm_pipe_r ) {
free(obj->comm_pipe_r); free(obj->comm_pipe_r);
obj->comm_pipe_r = NULL; obj->comm_pipe_r = NULL;
@ -113,9 +137,11 @@ void orte_snapc_full_local_destruct( orte_snapc_full_local_snapshot_t *obj) {
obj->comm_pipe_r_fd = -1; obj->comm_pipe_r_fd = -1;
obj->comm_pipe_w_fd = -1; obj->comm_pipe_w_fd = -1;
obj->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
obj->is_eh_active = false; obj->is_eh_active = false;
obj->process_pid = 0;
obj->term = false;
} }
/* /*

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -89,45 +89,61 @@ BEGIN_C_DECLS
/** /**
* States that a process can be in while checkpointing * States that a process can be in while checkpointing
*/ */
/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE 0
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 1
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING 2
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
/* Reached an error */ /* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_ERROR 8 #define ORTE_SNAPC_CKPT_STATE_ERROR 0
/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE 1
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 2
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING 3
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 4
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 5
/* Finished the checkpoint locally */
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL 6
/* File Transfer in progress */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 8
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 9
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 10
#define ORTE_SNAPC_CKPT_MAX 11
/** /**
* Definition of a orte local snapshot. * Definition of a orte local snapshot.
* Similar to the opal_crs_base_snapshot_t except that it * Similar to the opal_crs_base_snapshot_t except that it
* contains process contact information. * contains process contact information.
*/ */
struct orte_snapc_base_local_snapshot_1_0_0_t {
    /** List super object */
    opal_list_item_t super;

    /** ORTE Process name */
    orte_process_name_t process_name;

    /** State of the checkpoint */
    int state;

    /** Unique name of the local snapshot */
    char * reference_name;

    /** Local location of the local snapshot Absolute path */
    char * local_location;

    /** Remote location of the local snapshot Absolute path */
    char * remote_location;

    /** CRS agent */
    char * opal_crs;
};
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
/** /**
* Definition of the global snapshot. * Definition of the global snapshot.
@ -138,11 +154,8 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
/** This is an object, so must have super */ /** This is an object, so must have super */
opal_list_item_t super; opal_list_item_t super;
/** A list of orte_snapc_base_snapshot_ts */ /** A list of orte_snapc_base_snapshot_t's */
opal_list_t snapshots; opal_list_t local_snapshots;
/* ORTE SnapC Component used to generate the global snapshot */
char * component_name;
/** Unique name of the global snapshot */ /** Unique name of the global snapshot */
char * reference_name; char * reference_name;
@ -152,12 +165,6 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
/** Sequence Number */ /** Sequence Number */
int seq_num; int seq_num;
/** Beginning timestamp */
char * start_time;
/** Ending timestamp */
char * end_time;
}; };
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -76,6 +76,8 @@
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h" #include "orte/mca/snapc/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/****************** /******************
* Local Functions * Local Functions
******************/ ******************/
@ -108,11 +110,16 @@ static int global_sequence_num = 0;
*****************************************/ *****************************************/
static bool listener_started = false; static bool listener_started = false;
static double timer_start = 0;
static double timer_last = 0;
static double get_time(void);
typedef struct { typedef struct {
bool help; bool help;
int pid; int pid;
bool term; bool term;
bool verbose; bool verbose;
int verbose_level;
orte_jobid_t req_hnp; /**< User Requested HNP */ orte_jobid_t req_hnp; /**< User Requested HNP */
bool nowait; /* Do not wait for checkpoint to complete before returning */ bool nowait; /* Do not wait for checkpoint to complete before returning */
bool status; /* Display status messages while checkpoint is progressing */ bool status; /* Display status messages while checkpoint is progressing */
@ -135,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, &orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" }, "Be Verbose" },
{ NULL, NULL, NULL,
'V', NULL, NULL,
1,
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
{ NULL, NULL, NULL, { NULL, NULL, NULL,
'\0', NULL, "term", '\0', NULL, "term",
0, 0,
@ -279,6 +292,7 @@ static int parse_args(int argc, char *argv[]) {
orte_checkpoint_globals.pid = -1; orte_checkpoint_globals.pid = -1;
orte_checkpoint_globals.term = false; orte_checkpoint_globals.term = false;
orte_checkpoint_globals.verbose = false; orte_checkpoint_globals.verbose = false;
orte_checkpoint_globals.verbose_level = 0;
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID; orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
orte_checkpoint_globals.nowait = false; orte_checkpoint_globals.nowait = false;
orte_checkpoint_globals.status = false; orte_checkpoint_globals.status = false;
@ -344,6 +358,14 @@ static int parse_args(int argc, char *argv[]) {
goto cleanup; goto cleanup;
} }
if(orte_checkpoint_globals.verbose_level < 0 ) {
orte_checkpoint_globals.verbose_level = 0;
}
if(orte_checkpoint_globals.verbose_level > 0) {
orte_checkpoint_globals.verbose = true;
}
/* /*
* If the user did not supply an hnp jobid, then they must * If the user did not supply an hnp jobid, then they must
* supply the PID of MPIRUN * supply the PID of MPIRUN
@ -474,7 +496,7 @@ static int ckpt_init(int argc, char *argv[]) {
*/ */
if( orte_checkpoint_globals.verbose ) { if( orte_checkpoint_globals.verbose ) {
orte_checkpoint_globals.output = opal_output_open(NULL); orte_checkpoint_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(orte_checkpoint_globals.output, 10); opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level);
} else { } else {
orte_checkpoint_globals.output = 0; /* Default=STDERR */ orte_checkpoint_globals.output = 0; /* Default=STDERR */
} }
@ -661,6 +683,8 @@ notify_process_for_checkpoint(int term)
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n", "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
orte_checkpoint_globals.pid); orte_checkpoint_globals.pid);
timer_start = get_time();
/*********************************** /***********************************
* Notify HNP of checkpoint request * Notify HNP of checkpoint request
* Send: * Send:
@ -709,18 +733,51 @@ notify_process_for_checkpoint(int term)
/*************** /***************
* Pretty Print * Pretty Print
***************/ ***************/
/*
 * Return a timestamp in seconds as a double.
 *
 * When OPAL_TIMER_USEC_NATIVE is enabled the value is derived from
 * opal_timer_base_get_usec(); otherwise it is built from gettimeofday().
 * In both paths the microsecond part is scaled by 1e-6 so the result
 * carries sub-second resolution.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    /* Native timer reports microseconds; convert to seconds. */
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);

    /* Whole seconds first, then add the fractional microsecond part. */
    seconds = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;
    return seconds;
#endif
}
static int pretty_print_status(void) { static int pretty_print_status(void) {
char * state_str = NULL; char * state_str = NULL;
double cur_time;
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status); cur_time = get_time();
if( timer_last == 0 ) {
timer_last = cur_time;
}
orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);
if( NULL != global_snapshot_handle ) {
opal_output(0,
"[%6.2f / %6.2f] %*s - %s\n",
(cur_time - timer_last), (cur_time - timer_start),
25, state_str, global_snapshot_handle);
} else {
opal_output(0,
"[%6.2f / %6.2f] %*s - ...\n",
(cur_time - timer_last), (cur_time - timer_start),
25, state_str);
}
opal_output(0,
"%*s - Global Snapshot Reference: %s\n",
25, state_str, global_snapshot_handle);
if( NULL != state_str) { if( NULL != state_str) {
free(state_str); free(state_str);
} }
timer_last = cur_time;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -491,6 +491,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
orte_std_cntr_t i; orte_std_cntr_t i;
char *jobstr; char *jobstr;
orte_jobid_t mask=0x0000ffff; orte_jobid_t mask=0x0000ffff;
char * state_str = NULL;
for(i=0; i < num_jobs; i++) { for(i=0; i < num_jobs; i++) {
job = jobs[i]; job = jobs[i];
@ -513,9 +514,10 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
len_slots = 6; len_slots = 6;
len_vpid_r = (int) strlen("Num Procs"); len_vpid_r = (int) strlen("Num Procs");
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
len_ckpt_s = (int) (strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) < strlen("Ckpt State") ? orte_snapc_ckpt_state_str(&state_str, job->ckpt_state);
len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ?
strlen("Ckpt State") : strlen("Ckpt State") :
strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) ); strlen(state_str) );
len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") : len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
(strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ? (strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
strlen("Ckpt Ref") : strlen("Ckpt Ref") :
@ -525,6 +527,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
strlen("Ckpt Loc") : strlen("Ckpt Loc") :
strlen(job->ckpt_snapshot_loc) ) ); strlen(job->ckpt_snapshot_loc) ) );
#else #else
state_str = NULL;
len_ckpt_s = -3; len_ckpt_s = -3;
len_ckpt_r = -3; len_ckpt_r = -3;
len_ckpt_l = -3; len_ckpt_l = -3;
@ -564,7 +567,7 @@ static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs) {
printf("%*d | ", len_slots , (uint)job->total_slots_alloc); printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
printf("%*d | ", len_vpid_r, job->num_procs); printf("%*d | ", len_vpid_r, job->num_procs);
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(job->ckpt_state)); printf("%*s | ", len_ckpt_s, state_str);
printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ? printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
"" : "" :
job->ckpt_snapshot_ref) ); job->ckpt_snapshot_ref) );
@ -597,6 +600,7 @@ static int pretty_print_vpids(orte_job_t *job) {
orte_proc_t *vpid; orte_proc_t *vpid;
orte_app_context_t *app; orte_app_context_t *app;
char *o_proc_name; char *o_proc_name;
char *state_str = NULL;
/* /*
* Calculate segment lengths * Calculate segment lengths
@ -663,8 +667,9 @@ static int pretty_print_vpids(orte_job_t *job) {
len_state = strlen(pretty_vpid_state(vpid->state)); len_state = strlen(pretty_vpid_state(vpid->state));
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
if( (int)strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)) > len_ckpt_s) orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
len_ckpt_s = strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)); if( (int)strlen(state_str) > len_ckpt_s)
len_ckpt_s = strlen(state_str);
if( NULL != vpid->ckpt_snapshot_ref && if( NULL != vpid->ckpt_snapshot_ref &&
(int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r) (int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
@ -673,6 +678,8 @@ static int pretty_print_vpids(orte_job_t *job) {
if( NULL != vpid->ckpt_snapshot_loc && if( NULL != vpid->ckpt_snapshot_loc &&
(int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l) (int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
len_ckpt_l = strlen(vpid->ckpt_snapshot_loc); len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
#else
state_str = NULL;
#endif #endif
} }
@ -739,7 +746,7 @@ static int pretty_print_vpids(orte_job_t *job) {
printf("%*s | ", len_state , pretty_vpid_state(vpid->state)); printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(vpid->ckpt_state)); printf("%*s | ", len_ckpt_s, state_str);
printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ? printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
"" : "" :
vpid->ckpt_snapshot_ref)); vpid->ckpt_snapshot_ref));

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University * Copyright (c) 2004-2005 The University of Tennessee and The University
@ -151,8 +151,9 @@ int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
pid_t child_pid; pid_t child_pid = 0;
orte_snapc_base_global_snapshot_t *snapshot = NULL; orte_snapc_base_global_snapshot_t *snapshot = NULL;
char *tmp_str = NULL;
/*************** /***************
* Initialize * Initialize
@ -164,7 +165,10 @@ main(int argc, char *argv[])
snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t); snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t);
snapshot->reference_name = strdup(orte_restart_globals.filename); snapshot->reference_name = strdup(orte_restart_globals.filename);
snapshot->local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(snapshot->reference_name)); orte_snapc_base_get_global_snapshot_directory(&tmp_str, snapshot->reference_name);
snapshot->local_location = opal_dirname(tmp_str);
free(tmp_str);
tmp_str = NULL;
/* /*
* Check for existence of the file * Check for existence of the file
@ -453,11 +457,11 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
goto cleanup; goto cleanup;
} }
for(item = opal_list_get_first(&snapshot->snapshots); for(item = opal_list_get_first(&snapshot->local_snapshots);
item != opal_list_get_end(&snapshot->snapshots); item != opal_list_get_end(&snapshot->local_snapshots);
item = opal_list_get_next(item) ) { item = opal_list_get_next(item) ) {
orte_snapc_base_snapshot_t *vpid_snapshot; orte_snapc_base_local_snapshot_t *vpid_snapshot;
vpid_snapshot = (orte_snapc_base_snapshot_t*)item; vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
fprintf(appfile, "#\n"); fprintf(appfile, "#\n");
fprintf(appfile, "# Old Process Name: %u.%u\n", fprintf(appfile, "# Old Process Name: %u.%u\n",
@ -467,13 +471,15 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
fprintf(appfile, "-np 1 "); fprintf(appfile, "-np 1 ");
if(orte_restart_globals.preload) { if(orte_restart_globals.preload) {
fprintf(appfile, "--preload-files %s/%s ", fprintf(appfile, "--preload-files %s/%s ",
vpid_snapshot->crs_snapshot_super.local_location, vpid_snapshot->local_location,
vpid_snapshot->crs_snapshot_super.reference_name); vpid_snapshot->reference_name);
fprintf(appfile, "--preload-files-dest-dir . "); fprintf(appfile, "--preload-files-dest-dir . ");
} }
/* JJH: Make this match what the user originally specified on the command line */ /* JJH: Make this match what the user originally specified on the command line */
fprintf(appfile, "-am ft-enable-cr "); fprintf(appfile, "-am ft-enable-cr ");
fprintf(appfile, " opal-restart "); fprintf(appfile, " opal-restart ");
/* JJH: Make sure this changes if ever the default location of the local file is changed, /* JJH: Make sure this changes if ever the default location of the local file is changed,
* currently it is safe to assume that it is in the current working directory. * currently it is safe to assume that it is in the current working directory.
* *
@ -486,9 +492,9 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
else { else {
/* If we are *not* preloading the files, then point to the original checkpoint /* If we are *not* preloading the files, then point to the original checkpoint
* directory to access the checkpoint files. */ * directory to access the checkpoint files. */
fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->crs_snapshot_super.local_location); fprintf(appfile, "-mca crs_base_snapshot_dir %s ", vpid_snapshot->local_location);
} }
fprintf(appfile, "%s\n", vpid_snapshot->crs_snapshot_super.reference_name); fprintf(appfile, "%s\n", vpid_snapshot->reference_name);
} }
cleanup: cleanup: