1
1
openmpi/orte/tools/orte-checkpoint/orte-checkpoint.c
2016-11-22 15:03:20 -08:00

986 строки
30 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Checkpoint Tool for checkpointing a multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#include <string.h>
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
static int ckpt_init(int argc, char *argv[]); /* Initialization routine */
static int ckpt_finalize(void); /* Finalization routine */
/* Parse the command line into orte_checkpoint_globals */
static int parse_args(int argc, char *argv[]);
/* Locate the HNP (mpirun) to talk to and store it in orterun_hnp */
static int find_hnp(void);
/* Post / cancel the receive on ORTE_RML_TAG_CKPT */
static int start_listener(void);
static int stop_listener(void);
/* RML receive callback registered by start_listener() */
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
/* Handle a checkpoint status update message received from the HNP */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
/* Send the checkpoint request to the HNP found by find_hnp() */
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options);
/* Output helpers */
static int pretty_print_status(void);
static int pretty_print_reference(void);
static int list_all_snapshots(void);
/* HNP selected by find_hnp(); NULL until a match is found */
static orte_hnp_contact_t *orterun_hnp = NULL;
/* Snapshot handle and sequence number unpacked from HNP status updates */
static char * global_snapshot_handle = NULL;
static int global_sequence_num = 0;
/*****************************************
* Local state: listener / checkpoint progress flags and timers
* (the command line arguments themselves live in
* orte_checkpoint_globals, declared below)
*****************************************/
/* True between start_listener() and stop_listener() */
static bool listener_started = false;
/* Set by process_ckpt_update_cmd(); main() polls is_checkpoint_finished */
static bool is_checkpoint_finished = false;
static bool is_checkpoint_established = false;
static bool is_checkpoint_recovered = false;
/* Timestamps (seconds, from get_time()) used for the status display */
static double timer_start = 0;
static double timer_last = 0;
static double get_time(void);
/*
 * Command line options and run-time state of the orte-checkpoint tool.
 * The whole structure is zeroed and then given defaults in parse_args().
 */
typedef struct {
bool help; /* -h/--help: print usage and exit */
int pid; /* PID of the target mpirun (--hnp-pid or first trailing argument); -1 if unset */
opal_crs_base_ckpt_options_t *options; /* Options object sent to the HNP with the request */
bool term; /* --term: copied into options->term */
bool stop; /* --stop: copied into options->stop */
bool verbose; /* -v, or implied by a positive verbose_level */
int verbose_level; /* -V <level>; negative values are clamped to 0 */
orte_jobid_t req_hnp; /**< User Requested HNP */
bool nowait; /* Do not wait for checkpoint to complete before returning */
bool status; /* Display status messages while checkpoint is progressing */
int output; /* opal_output stream id used for verbose messages */
int ckpt_status; /* Last ORTE_SNAPC_CKPT_STATE_* value received from the HNP */
bool list_only; /* List available checkpoints only */
#if OPAL_ENABLE_CRDEBUG == 1
bool enable_crdebug; /* Enable C/R Debugging */
bool attach_debugger; /* --attach: copied into options->attach_debugger */
bool detach_debugger; /* --detach: copied into options->detach_debugger */
#endif
} orte_checkpoint_globals_t;
/* Single global instance; the option table below stores parsed values here */
orte_checkpoint_globals_t orte_checkpoint_globals;
/*
 * Command line option table handed to opal_cmd_line_create().
 *
 * Each entry appears to follow the opal_cmd_line_init_t field order:
 *   MCA parameter name, short option character, single-dash name,
 *   long (double-dash) name, number of parameters, destination variable,
 *   destination type, help text
 * (TODO confirm against opal/util/cmd_line.h).
 * Destinations all point into orte_checkpoint_globals.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL,
'h', NULL, "help",
0,
&orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL,
'v', NULL, "verbose",
0,
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
/* -V takes one integer argument and has no long form */
{ NULL,
'V', NULL, NULL,
1,
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
{ NULL,
'\0', NULL, "term",
0,
&(orte_checkpoint_globals.term), OPAL_CMD_LINE_TYPE_BOOL,
"Terminate the application after checkpoint (Cannot be used with --stop)" },
{ NULL,
'\0', NULL, "stop",
0,
&(orte_checkpoint_globals.stop), OPAL_CMD_LINE_TYPE_BOOL,
"Send SIGSTOP to application just after checkpoint (checkpoint will not finish until SIGCONT is sent) (Cannot be used with --term)" },
{ NULL,
'w', NULL, "nowait",
0,
&orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL,
"Do not wait for the application to finish checkpointing before returning" },
{ NULL,
's', NULL, "status",
0,
&orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL,
"Display status messages describing the progression of the checkpoint" },
/* NOTE(review): req_hnp is an orte_jobid_t but is bound as
 * OPAL_CMD_LINE_TYPE_INT -- confirm the two have the same size. */
{ "hnp-jobid",
'\0', NULL, "hnp-jobid",
1,
&orte_checkpoint_globals.req_hnp, OPAL_CMD_LINE_TYPE_INT,
"This should be the jobid of the HNP whose applications you wish "
"to checkpoint." },
{ "hnp-pid",
'\0', NULL, "hnp-pid",
1,
&orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish "
"to checkpoint." },
{ NULL,
'l', NULL, "list",
0,
&orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display a list of checkpoint files available on this machine" },
#if OPAL_ENABLE_CRDEBUG == 1
/* Options only available when built with C/R debugging support */
{ NULL,
'\0', "crdebug", "crdebug",
0,
&orte_checkpoint_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
"Enable C/R Enhanced Debugging" },
{ NULL,
'\0', "attach", "attach",
0,
&(orte_checkpoint_globals.attach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
"Wait for the debugger to attach directly after taking the checkpoint." },
{ NULL,
'\0', "detach", "detach",
0,
&(orte_checkpoint_globals.detach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
"Do not wait for the debugger to reattach after taking the checkpoint." },
#endif
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/*
 * Emit (at verbosity 10) a description of the checkpoint request that is
 * about to be sent: the target given by the user, the HNP we matched, and
 * the post-checkpoint options in effect.
 */
static void describe_checkpoint_request(void)
{
    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: Checkpointing...");

    if (0 < orte_checkpoint_globals.pid) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t PID %d",
                            orte_checkpoint_globals.pid);
    } else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Mpirun (%s)",
                            ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "\t Connected to Mpirun %s",
                        ORTE_NAME_PRINT(&orterun_hnp->name));

    if (orte_checkpoint_globals.options->term) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Terminating after checkpoint\n");
    }
    if (orte_checkpoint_globals.options->stop) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Stopping after checkpoint\n");
    }
}

/*
 * Tool entry point.
 *
 * Initializes the tool, then either lists the available snapshots
 * (list mode) or locates the requested HNP, asks it to checkpoint, and
 * waits for the outcome. ckpt_finalize() is always run on the way out;
 * if it fails its return code is returned instead of the status
 * accumulated so far.
 */
int
main(int argc, char *argv[])
{
    int rc;
    int status = ORTE_SUCCESS;
    bool ckpt_failed;

    /* Bring the tool up (also parses the command line) */
    rc = ckpt_init(argc, argv);
    if (ORTE_SUCCESS != rc) {
        status = rc;
        goto cleanup;
    }

    /* List mode: print the snapshot references and stop there */
    if (orte_checkpoint_globals.list_only) {
        status = list_all_snapshots();
        goto cleanup;
    }

    /* Find the HNP to connect to (it reports its own errors) */
    rc = find_hnp();
    if (ORTE_SUCCESS != rc) {
        status = rc;
        goto cleanup;
    }

    /* Reset the progress flags updated by the status callback */
    is_checkpoint_finished    = false;
    is_checkpoint_recovered   = false;
    is_checkpoint_established = false;

    if (orte_checkpoint_globals.verbose) {
        describe_checkpoint_request();
    }

    /* Ask the HNP to checkpoint the job */
    rc = notify_process_for_checkpoint(orte_checkpoint_globals.options);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, rc);
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto cleanup;
    }

    /* Unless told otherwise, drive progress until the callback says done */
    if (!orte_checkpoint_globals.nowait) {
        while (!is_checkpoint_finished) {
            opal_progress();
        }
    }

    ckpt_failed =
        (ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) ||
        (ORTE_SNAPC_CKPT_STATE_ERROR   == orte_checkpoint_globals.ckpt_status);

    if (ckpt_failed) {
        status = ORTE_ERROR;
    } else if (!orte_checkpoint_globals.nowait) {
        pretty_print_reference();
    }

 cleanup:
    /* Always tear down; a finalize failure takes precedence */
    rc = ckpt_finalize();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    return status;
}
/*
 * Parse the command line into orte_checkpoint_globals.
 *
 * Resets orte_checkpoint_globals to its defaults, parses the options
 * described by cmd_line_opts, exports any MCA parameters into the
 * environment, and interprets the first trailing argument (if any) as
 * the PID of the mpirun to checkpoint.
 *
 * @param argc  argument count as passed to main()
 * @param argv  argument vector as passed to main()
 *
 * @return ORTE_SUCCESS on success; 1 on a command line error or when
 *         there is nothing to do; ORTE_ERROR for an invalid PID or when
 *         built without checkpoint/restart support. Does not return if
 *         the help option was given (calls exit(0)).
 */
static int parse_args(int argc, char *argv[]) {
    int i, ret, len, exit_status = ORTE_SUCCESS ;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char * tmp_env_var = NULL;
    char *argv0 = NULL;

    /* Init structure */
    memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
    orte_checkpoint_globals.help = false;
    orte_checkpoint_globals.pid = -1;
    orte_checkpoint_globals.verbose = false;
    orte_checkpoint_globals.verbose_level = 0;
    orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
    orte_checkpoint_globals.nowait = false;
    orte_checkpoint_globals.status = false;
    orte_checkpoint_globals.output = -1;
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
    orte_checkpoint_globals.list_only = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.enable_crdebug = false;
#endif
    orte_checkpoint_globals.options = OBJ_NEW(opal_crs_base_ckpt_options_t);
    orte_checkpoint_globals.term = false;
    orte_checkpoint_globals.stop = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.attach_debugger = false;
    orte_checkpoint_globals.detach_debugger = false;
#endif

    /*
     * Build the option table before anything reads it: the usage message
     * (both below and in the help path) is generated from cmd_line, so it
     * must be initialized first.
     */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

#if OPAL_ENABLE_FT_CR == 0
    /* Warn and exit if not configured with Checkpoint/Restart */
    {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage-no-cr",
                                    true, args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
#endif

    /* Parse the command line options */
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        exit_status = 1;
        goto cleanup;
    }

    if (orte_checkpoint_globals.help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment.
     * The MCA arguments are taken from the parsed command line object.
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        /* putenv keeps the pointer, so these strings are not freed */
        putenv(app_env[i]);
    }
    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    /* Flag this process as a C/R tool via the opal_cr_is_tool variable */
    (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */
    /* get the remaining bits; argv[0] is saved first since argv is replaced */
    argv0 = strdup(argv[0]);
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    /* Listing snapshots needs no target process */
    if(orte_checkpoint_globals.list_only ) {
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    if (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp) {
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        exit_status = 1;
        goto cleanup;
    }

    orte_checkpoint_globals.options->term = orte_checkpoint_globals.term;
    orte_checkpoint_globals.options->stop = orte_checkpoint_globals.stop;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.options->attach_debugger = orte_checkpoint_globals.attach_debugger;
    orte_checkpoint_globals.options->detach_debugger = orte_checkpoint_globals.detach_debugger;
#endif

    if(orte_checkpoint_globals.verbose_level < 0 ) {
        orte_checkpoint_globals.verbose_level = 0;
    }
    if(orte_checkpoint_globals.verbose_level > 0) {
        orte_checkpoint_globals.verbose = true;
    }

    /*
     * If the user did not supply an hnp jobid, then they must
     * supply the PID of MPIRUN. When only the jobid was given there is
     * no trailing argument to read (that case was validated above).
     */
    if (0 < argc) {
        orte_checkpoint_globals.pid = atoi(argv[0]);
        if ( 0 >= orte_checkpoint_globals.pid ) {
            opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
                           orte_checkpoint_globals.pid);
            exit_status = ORTE_ERROR;
            goto cleanup;
        }
    }

    /*
     * JJH: No wait is currently not implemented or tested.
     * This is applied however the target was specified (PID or jobid).
     */
    if(orte_checkpoint_globals.nowait) {
        orte_checkpoint_globals.nowait = false;
        opal_show_help("help-orte-checkpoint.txt", "not_impl",
                       true,
                       "Disconnected checkpoint");
    }

    /* Verbose output implies the status progression display */
    if(orte_checkpoint_globals.verbose) {
        orte_checkpoint_globals.status = true;
    }

 cleanup:
    free(argv0);
    return exit_status;
}
/*
 * This function attempts to find an HNP to connect to.
 *
 * The list of local HNPs is obtained from orte_list_local_hnps() and
 * searched for an entry whose jobid equals the requested HNP jobid or
 * whose pid equals the requested PID. The first match is stored in
 * orterun_hnp (which then owns that list item); every other candidate
 * is released.
 *
 * @return ORTE_SUCCESS when an HNP was found, ORTE_ERROR otherwise
 *         (a help message is shown in the failure cases).
 */
static int find_hnp(void) {
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t hnp_list;
    opal_list_item_t *item;
    orte_hnp_contact_t *hnpcandidate;

    /* get the list of local hnp's available to us and setup
     * contact info for them into the RML
     */
    OBJ_CONSTRUCT(&hnp_list, opal_list_t);
    if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
        opal_show_help("help-orte-checkpoint.txt", "no_hnps", true,
                       orte_checkpoint_globals.pid,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir,
                       ret, ORTE_ERROR_NAME(ret));
        exit_status = ret;
        goto cleanup;
    }

    /* search the list for the desired hnp */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        hnpcandidate = (orte_hnp_contact_t*)item;
        if (hnpcandidate->name.jobid == orte_checkpoint_globals.req_hnp ||
            hnpcandidate->pid == orte_checkpoint_globals.pid) {
            /* this is the one we want */
            orterun_hnp = hnpcandidate;
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /* Not the one we want: it is already off the list, so nobody
         * else will release it later. */
        OBJ_RELEASE(item);
    }

    /* If no match was found, error out */
    opal_show_help("help-orte-checkpoint.txt", "no_universe", true,
                   orte_checkpoint_globals.pid,
                   orte_process_info.tmpdir_base,
                   orte_process_info.top_session_dir);

 cleanup:
    /* Release whatever is still on the list (entries after a match) */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&hnp_list);

    if( NULL == orterun_hnp ) {
        return ORTE_ERROR;
    } else {
        return exit_status;
    }
}
/*
 * Bring up the tool: OPAL utilities, command line, ORTE (as a tool
 * process), the verbose output stream, and finally the RML listener.
 *
 * Returns ORTE_SUCCESS, or the code of the first step that failed.
 */
static int ckpt_init(int argc, char *argv[]) {
    int rc;
    char *crs_env_name = NULL;

    listener_started = false;

    /* opal_init_util must precede parse_args so installdirs is set up
     * before mca_base_open() gets called from there. */
    rc = opal_init_util(&argc, &argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = parse_args(argc, argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* This tool itself is never checkpointed, so switch off the
     * checkpoint notification machinery. Must be done before opal_init(). */
    opal_cr_set_enabled(false);

    /* No real checkpointer is used here: force the "none" crs component */
    (void) mca_base_var_env_name("crs", &crs_env_name);
    opal_setenv(crs_env_name, "none", true, &environ);
    free(crs_env_name);
    crs_env_name = NULL;

    /* Never operate as a distributed tool: require the ess/tool component */
    opal_setenv("OMPI_MCA_ess", "tool", true, &environ);

    /* All of OPAL plus the tool portion of ORTE, so that we can reach
     * any HNP over the wire. */
    rc = orte_init(&argc, &argv, ORTE_PROC_TOOL);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Output handle: a dedicated stream when verbose, stream 0 otherwise */
    if (!orte_checkpoint_globals.verbose) {
        orte_checkpoint_globals.output = 0; /* Default=STDERR */
    } else {
        orte_checkpoint_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_checkpoint_globals.output,
                                  orte_checkpoint_globals.verbose_level);
    }

    /* Last step: start listening for messages from the HNP */
    return start_listener();
}
/*
 * Tear the tool down: cancel the listener, then finalize ORTE.
 * Both steps always run. An orte_finalize() failure is reported in
 * preference to a stop_listener() failure.
 */
static int ckpt_finalize(void) {
    const int listener_rc = stop_listener();
    const int finalize_rc = orte_finalize();

    if (ORTE_SUCCESS != finalize_rc) {
        return finalize_rc;
    }
    if (ORTE_SUCCESS != listener_rc) {
        return listener_rc;
    }
    return ORTE_SUCCESS;
}
/*
 * Post a persistent non-blocking receive on the checkpoint tag from any
 * sender; incoming messages are delivered to hnp_receiver().
 * Always returns ORTE_SUCCESS.
 */
static int start_listener(void)
{
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                            ORTE_RML_TAG_CKPT,
                            ORTE_RML_PERSISTENT,
                            hnp_receiver,
                            NULL);

    listener_started = true;

    return ORTE_SUCCESS;
}
/*
 * Cancel the receive posted by start_listener().
 * Returns ORTE_ERROR if the listener was not running, ORTE_SUCCESS
 * otherwise.
 */
static int stop_listener(void)
{
    if (listener_started) {
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT);
        listener_started = false;
        return ORTE_SUCCESS;
    }

    return ORTE_ERROR;
}
/*
 * RML receive callback for messages on the checkpoint tag.
 *
 * Unpacks the leading command flag from the buffer and dispatches on it:
 * status updates go to process_ckpt_update_cmd(), init/term commands are
 * ignored, anything else is logged as out of bounds.
 */
static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata)
{
    orte_snapc_cmd_flag_t cmd;
    orte_std_cntr_t n_items = 1;
    int rc;

    opal_output_verbose(5, orte_checkpoint_globals.output,
                        "orte_checkpoint: hnp_receiver: Receive a command message.");

    /* The first packed element is the command flag */
    rc = opal_dss.unpack(buffer, &cmd, &n_items, ORTE_SNAPC_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    if (ORTE_SNAPC_GLOBAL_UPDATE_CMD == cmd) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "orte_checkpoint: hnp_receiver: Status Update.");
        process_ckpt_update_cmd(sender, buffer);
    } else if (ORTE_SNAPC_GLOBAL_INIT_CMD == cmd ||
               ORTE_SNAPC_GLOBAL_TERM_CMD == cmd) {
        /* Nothing to do for these commands */
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
    }
}
/*
 * Handle a checkpoint status update received from the HNP.
 *
 * Unpacks the checkpoint state and, for the RECOVERED / ESTABLISHED /
 * STOPPED / ERROR states, the global snapshot handle and sequence
 * number. Updates orte_checkpoint_globals.ckpt_status and the
 * is_checkpoint_* flags that main() polls, and prints the status
 * progression when requested.
 *
 * @param sender  name of the process that sent the update (unused here)
 * @param buffer  message buffer positioned just after the command flag
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
    char *new_handle = NULL;

    /*
     * Receive the data:
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number (upon finish only)
     */
    count = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
        ORTE_ERROR_LOG(ret);
        return;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == orte_checkpoint_globals.ckpt_status ) {
        /*
         * Unpack into a temporary so the previously received handle can be
         * freed once the new one is known to be good. Several updates carry
         * a handle during one checkpoint, and unpacking straight into the
         * global would drop the old string each time.
         */
        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &new_handle, &count, OPAL_STRING)) ) {
            ORTE_ERROR_LOG(ret);
            return;
        }
        free(global_snapshot_handle);
        global_snapshot_handle = new_handle;

        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) {
            ORTE_ERROR_LOG(ret);
            return;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
                       true,
                       orte_checkpoint_globals.pid);
        is_checkpoint_finished = true;
        return;
    }

    if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ORTE_ERROR);
        is_checkpoint_finished = true;
        return;
    }

    /* Status progression */
    if( orte_checkpoint_globals.status ) {
        pretty_print_status();
    }

    if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_finished = true;
        return;
    }

    /*
     * Normal termination check: the checkpoint is finished once both the
     * RECOVERED and the ESTABLISHED states have been seen, in either order.
     */
    if( (ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) ||
        (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){
        is_checkpoint_finished = true;
        return;
    }
    else if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_recovered = true;
    }
    else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_established = true;
    }
}
/*
 * Send a checkpoint request to the HNP stored in orterun_hnp.
 *
 * The message carries the GLOBAL_INIT command, the checkpoint options
 * and a jobid (ORTE_JOBID_INVALID), and is sent non-blocking on the
 * checkpoint tag. Also records timer_start for the status display.
 *
 * @param options  checkpoint options to pack into the request
 *
 * @return ORTE_SUCCESS once the send has been posted, otherwise the
 *         error code of the failing step (an "unable_to_connect" help
 *         message is shown in that case).
 */
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_INIT_CMD;
    orte_jobid_t jobid = ORTE_JOBID_INVALID;

    if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_checkpoint_globals.pid);

    timer_start = get_time();

    /***********************************
     * Notify HNP of checkpoint request
     * Send:
     * - Command
     * - options
     * - jobid
     ***********************************/
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if( ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer,
                                                       ORTE_RML_TAG_CKPT, orte_rml_send_callback,
                                                       NULL))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * The send was accepted: the buffer now belongs to the RML and is
     * expected to be released by orte_rml_send_callback, so it must not
     * be released here.
     */
    buffer = NULL;

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
                        ORTE_JOBID_PRINT(jobid));

 cleanup:
    /* Still non-NULL only when a step failed before the hand-off */
    if( NULL != buffer ) {
        OBJ_RELEASE(buffer);
    }

    if( ORTE_SUCCESS != exit_status ) {
        opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
                       orte_checkpoint_globals.pid);
    }

    return exit_status;
}
/***************
* Pretty Print
***************/
/*
 * Current time in seconds as a double, with microsecond resolution.
 * Uses the native OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is
 * set, and gettimeofday() otherwise.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;

    gettimeofday(&now, NULL);
    /* whole seconds plus the microsecond part scaled to seconds */
    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
#endif
}
/*
 * Print one status line for the most recently received checkpoint state:
 * time since the previous status line, time since the request was sent,
 * the state name, and the snapshot handle when one is known.
 * Always returns ORTE_SUCCESS.
 */
static int pretty_print_status(void) {
    char *state_name = NULL;
    const double now = get_time();
    double since_last;
    double since_start;

    /* First status line: measure the step from now */
    if (timer_last == 0) {
        timer_last = now;
    }
    since_last  = now - timer_last;
    since_start = now - timer_start;

    orte_snapc_ckpt_state_str(&state_name, orte_checkpoint_globals.ckpt_status);

    if (NULL == global_snapshot_handle) {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - ...\n",
                    since_last, since_start,
                    25, state_name);
    } else {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - %s\n",
                    since_last, since_start,
                    25, state_name, global_snapshot_handle);
    }

    free(state_name);

    timer_last = now;
    return ORTE_SUCCESS;
}
/*
 * Print the sequence number and handle of the snapshot that was taken.
 * With C/R debugging enabled the line is printed in the
 * "-s <seq> <handle>" form instead.
 * Always returns ORTE_SUCCESS.
 */
static int pretty_print_reference(void)
{
    const int seq = global_sequence_num;
    const char *handle = global_snapshot_handle;

#if OPAL_ENABLE_CRDEBUG == 1
    if (orte_checkpoint_globals.enable_crdebug) {
        printf("Checkpoint handle: -s %3d %s\n", seq, handle);
        return ORTE_SUCCESS;
    }
#endif

    printf("Snapshot Ref.: %3d %s\n", seq, handle);
    return ORTE_SUCCESS;
}
static int list_all_snapshots(void) {
int ret, exit_status = ORTE_SUCCESS;
opal_list_t *all_snapshots = NULL;
opal_list_item_t* item = NULL;
orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;
all_snapshots = OBJ_NEW(opal_list_t);
if( ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL)) ) {
opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
orte_sstore_base_global_snapshot_dir);
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* For each reference
*/
for(item = opal_list_get_first(all_snapshots);
item != opal_list_get_end(all_snapshots);
item = opal_list_get_next(item) ) {
global_snapshot = (orte_sstore_base_global_snapshot_info_t*)item;
/*
* Get a list of valid sequence numbers
*/
if( ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot,
&(global_snapshot->num_seqs),
&(global_snapshot->all_seqs)))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
#if OPAL_ENABLE_CRDEBUG == 1
int s;
/* Pretty print the result - C/R Debug version */
if( orte_checkpoint_globals.enable_crdebug ) {
for(s = 0; s < global_snapshot->num_seqs; ++s) {
printf("-s %s %s\n", global_snapshot->all_seqs[s], global_snapshot->reference);
}
}
else
#endif
{
/* Pretty print the result */
printf("Snapshot Ref.: %s\t[",
global_snapshot->reference);
if( 0 >= global_snapshot->num_seqs ) {
printf("No Valid Checkpoints");
} else {
printf("%s",
opal_argv_join(global_snapshot->all_seqs, ','));
}
printf("]\n");
}
}
cleanup:
while (NULL != (item = opal_list_remove_first(all_snapshots))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(all_snapshots);
return exit_status;
}