/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
|
|
/**
 * @file
 * OPAL Checkpoint command
 *
 * This command will initiate the checkpoint of a single
 * process that has been compiled with OPAL support.
 */
|
|
|
|
#include "opal_config.h"

#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <string.h>
#include <signal.h>

#include "opal/constants.h"

#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"

#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"

#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
|
|
|
|
|
|
|
|
/******************
 * Global Vars
 ******************/

/******************
 * Local Functions
 ******************/
|
|
|
|
static int initialize(int argc, char *argv[]);
|
|
|
|
static int finalize(void);
|
|
|
|
static int parse_args(int argc, char *argv[]);
|
2015-06-24 06:59:57 +03:00
|
|
|
static int notify_process_for_checkpoint(pid_t pid, char **fname, int term,
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_crs_state_type_t *state);
|
|
|
|
|
|
|
|
/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
|
|
|
|
/**
 * Settings of the tool.  The structure is zeroed in parse_args() and then
 * populated by the command line parser through the addresses registered
 * in cmd_line_opts.
 */
typedef struct {
    bool  help;           /* -h / --help */
    int   pid;            /* PID of the process to checkpoint (first tail argument) */
    bool  term;           /* --term: terminate the application after checkpoint */
    bool  verbose;        /* -v / --verbose */
    bool  quiet;          /* -q / --quiet (cleared again when verbose is set) */
    char *snapshot_name;  /* -n / --name */
    char *snapshot_loc;   /* -w / --where */
    int   output;         /* opal_output stream id used for messages */
} opal_checkpoint_globals_t;

/* The single instance used throughout this file. */
opal_checkpoint_globals_t opal_checkpoint_globals;
|
|
|
|
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
2015-06-24 06:59:57 +03:00
|
|
|
{ NULL,
|
|
|
|
'h', NULL, "help",
|
2007-03-17 02:11:45 +03:00
|
|
|
0,
|
|
|
|
&opal_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
|
|
"This help message" },
|
|
|
|
|
2012-10-30 23:45:18 +04:00
|
|
|
{ NULL,
|
2015-06-24 06:59:57 +03:00
|
|
|
'v', NULL, "verbose",
|
2007-03-17 02:11:45 +03:00
|
|
|
0,
|
|
|
|
&opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
|
|
|
"Be Verbose" },
|
|
|
|
|
2012-10-30 23:45:18 +04:00
|
|
|
{ NULL,
|
2015-06-24 06:59:57 +03:00
|
|
|
'q', NULL, "quiet",
|
2007-09-27 17:13:29 +04:00
|
|
|
0,
|
|
|
|
&opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
|
|
|
"Be Super Quiet" },
|
|
|
|
|
2012-10-30 23:45:18 +04:00
|
|
|
{ NULL,
|
2015-06-24 06:59:57 +03:00
|
|
|
'\0', NULL, "term",
|
2007-03-17 02:11:45 +03:00
|
|
|
0,
|
|
|
|
&opal_checkpoint_globals.term, OPAL_CMD_LINE_TYPE_BOOL,
|
|
|
|
"Terminate the application after checkpoint" },
|
|
|
|
|
2012-10-30 23:45:18 +04:00
|
|
|
{ NULL,
|
2007-03-17 02:11:45 +03:00
|
|
|
'n', NULL, "name",
|
|
|
|
1,
|
|
|
|
&opal_checkpoint_globals.snapshot_name, OPAL_CMD_LINE_TYPE_STRING,
|
|
|
|
"Request a specific snapshot reference." },
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
{ "crs_base_snapshot_dir",
|
|
|
|
'w', NULL, "where",
|
2007-03-17 02:11:45 +03:00
|
|
|
1,
|
|
|
|
&opal_checkpoint_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
|
|
|
|
"Where to place the checkpoint files. Note: You must remember this "
|
|
|
|
"location to pass into opal-restart, as it may not be able to find "
|
|
|
|
"the desired directory." },
|
|
|
|
|
|
|
|
/* End of list */
|
2012-10-30 23:45:18 +04:00
|
|
|
{ NULL, '\0', NULL, NULL, 0,
|
2007-03-17 02:11:45 +03:00
|
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
|
|
NULL }
|
|
|
|
};
|
|
|
|
|
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
int ret, exit_status = OPAL_SUCCESS;
|
2009-04-30 20:43:05 +04:00
|
|
|
char *fname = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_crs_state_type_t cr_state;
|
|
|
|
|
|
|
|
/***************
|
|
|
|
* Initialize
|
|
|
|
***************/
|
|
|
|
if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*******************************
|
|
|
|
* Checkpoint the requested PID
|
|
|
|
*******************************/
|
|
|
|
opal_output_verbose(10, opal_checkpoint_globals.output,
|
|
|
|
"opal_checkpoint: Checkpointing PID %d",
|
|
|
|
opal_checkpoint_globals.pid);
|
|
|
|
if( opal_checkpoint_globals.term ) {
|
|
|
|
opal_output_verbose(10, opal_checkpoint_globals.output,
|
|
|
|
"\tTerminating application after checkpoint");
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = notify_process_for_checkpoint(opal_checkpoint_globals.pid,
|
|
|
|
&fname,
|
|
|
|
opal_checkpoint_globals.term,
|
|
|
|
&cr_state);
|
|
|
|
if (OPAL_SUCCESS != ret ||
|
|
|
|
cr_state == OPAL_CRS_ERROR) {
|
|
|
|
opal_show_help("help-opal-checkpoint.txt", "ckpt_failure", true,
|
|
|
|
opal_checkpoint_globals.pid, ret, cr_state);
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2007-09-27 17:13:29 +04:00
|
|
|
|
|
|
|
if( !opal_checkpoint_globals.quiet ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
|
|
|
"Local Snapshot Reference = %s\n",
|
|
|
|
fname);
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
cleanup:
|
|
|
|
/***************
|
|
|
|
* Cleanup
|
|
|
|
***************/
|
|
|
|
if (OPAL_SUCCESS != (ret = finalize())) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int initialize(int argc, char *argv[]) {
|
|
|
|
int ret, exit_status = OPAL_SUCCESS;
|
2008-04-23 04:17:12 +04:00
|
|
|
char * tmp_env_var = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2007-04-22 02:38:15 +04:00
|
|
|
/*
|
|
|
|
* Make sure to init util before parse_args
|
|
|
|
* to ensure installdirs is setup properly
|
|
|
|
* before calling mca_base_open();
|
|
|
|
*/
|
2009-12-04 03:51:15 +03:00
|
|
|
if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
2007-04-22 02:38:15 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/*
|
|
|
|
* Parse Command Line Arguments
|
|
|
|
*/
|
|
|
|
if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup OPAL Output handle from the verbose argument
|
|
|
|
*/
|
|
|
|
if( opal_checkpoint_globals.verbose ) {
|
2007-09-27 17:13:29 +04:00
|
|
|
opal_checkpoint_globals.quiet = false; /* Automaticly turn off quiet if it is set */
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_checkpoint_globals.output = opal_output_open(NULL);
|
|
|
|
opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
|
|
|
|
} else {
|
|
|
|
opal_checkpoint_globals.output = 0; /* Default=STDOUT */
|
|
|
|
}
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
/*
|
2007-03-17 02:11:45 +03:00
|
|
|
* Disable the checkpoint notification routine for this
|
|
|
|
* tool. As we will never need to checkpoint this tool.
|
|
|
|
* Note: This must happen before opal_init().
|
|
|
|
*/
|
|
|
|
opal_cr_set_enabled(false);
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
/*
|
|
|
|
* Select the 'none' CRS component,
|
2007-03-17 02:11:45 +03:00
|
|
|
* since we don't actually use a checkpointer
|
|
|
|
*/
|
2013-03-28 01:09:41 +04:00
|
|
|
(void) mca_base_var_env_name("crs", &tmp_env_var);
|
2008-04-23 04:17:12 +04:00
|
|
|
opal_setenv(tmp_env_var,
|
2007-03-17 02:11:45 +03:00
|
|
|
"none",
|
|
|
|
true, &environ);
|
2008-04-23 04:17:12 +04:00
|
|
|
free(tmp_env_var);
|
|
|
|
tmp_env_var = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize OPAL
|
|
|
|
*/
|
2009-12-04 03:51:15 +03:00
|
|
|
if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
|
2007-03-17 02:11:45 +03:00
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int finalize(void) {
|
|
|
|
int ret = OPAL_SUCCESS;
|
|
|
|
|
|
|
|
if (OPAL_SUCCESS != (ret = opal_finalize())) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int parse_args(int argc, char *argv[]) {
|
|
|
|
int i, ret, len;
|
|
|
|
opal_cmd_line_t cmd_line;
|
|
|
|
char **app_env = NULL, **global_env = NULL;
|
2008-04-23 04:17:12 +04:00
|
|
|
char * tmp_env_var = NULL;
|
2012-02-29 21:52:38 +04:00
|
|
|
char *argv0 = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));
|
|
|
|
|
|
|
|
opal_checkpoint_globals.snapshot_name = NULL;
|
|
|
|
opal_checkpoint_globals.snapshot_loc = NULL;
|
2015-06-24 06:59:57 +03:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/* Parse the command line options */
|
|
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
mca_base_open();
|
|
|
|
mca_base_cmd_line_setup(&cmd_line);
|
2016-05-07 14:12:01 +03:00
|
|
|
ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
|
2012-02-29 21:52:38 +04:00
|
|
|
|
|
|
|
if (OPAL_SUCCESS != ret) {
|
|
|
|
if (OPAL_ERR_SILENT != ret) {
|
|
|
|
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
|
|
|
opal_strerror(ret));
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (opal_checkpoint_globals.help) {
|
|
|
|
char *str, *args = NULL;
|
|
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
|
|
str = opal_show_help_string("help-opal-checkpoint.txt", "usage", true,
|
|
|
|
args);
|
|
|
|
if (NULL != str) {
|
|
|
|
printf("%s", str);
|
|
|
|
free(str);
|
|
|
|
}
|
|
|
|
free(args);
|
|
|
|
/* If we show the help message, that should be all we do */
|
|
|
|
exit(0);
|
|
|
|
}
|
2015-06-24 06:59:57 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Put all of the MCA arguments in the environment
|
2007-03-17 02:11:45 +03:00
|
|
|
*/
|
|
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
|
|
|
|
len = opal_argv_count(app_env);
|
|
|
|
for(i = 0; i < len; ++i) {
|
|
|
|
putenv(app_env[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
len = opal_argv_count(global_env);
|
|
|
|
for(i = 0; i < len; ++i) {
|
|
|
|
putenv(global_env[i]);
|
|
|
|
}
|
|
|
|
|
2013-03-28 01:09:41 +04:00
|
|
|
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
2008-04-23 04:17:12 +04:00
|
|
|
opal_setenv(tmp_env_var,
|
2007-03-17 02:11:45 +03:00
|
|
|
"1",
|
|
|
|
true, &environ);
|
2008-04-23 04:17:12 +04:00
|
|
|
free(tmp_env_var);
|
|
|
|
tmp_env_var = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Now start parsing our specific arguments
|
|
|
|
*/
|
|
|
|
|
|
|
|
if( NULL == opal_checkpoint_globals.snapshot_name )
|
|
|
|
opal_checkpoint_globals.snapshot_name = strdup("");
|
|
|
|
if( NULL == opal_checkpoint_globals.snapshot_loc ) {
|
|
|
|
opal_checkpoint_globals.snapshot_loc = strdup("");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the remaining bits */
|
2012-02-29 21:52:38 +04:00
|
|
|
argv0 = strdup(argv[0]);
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
|
|
|
|
2012-02-29 21:52:38 +04:00
|
|
|
if (0 == argc) {
|
|
|
|
fprintf(stderr, "%s: Nothing to do\n", argv0);
|
|
|
|
fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
|
|
|
|
free(argv0);
|
|
|
|
return OPAL_ERROR;
|
|
|
|
}
|
|
|
|
free(argv0);
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_checkpoint_globals.pid = atoi(argv[0]);
|
|
|
|
if ( 0 >= opal_checkpoint_globals.pid ) {
|
|
|
|
opal_show_help("help-opal-checkpoint.txt", "invalid_pid", true,
|
|
|
|
opal_checkpoint_globals.pid);
|
|
|
|
return OPAL_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
static int
|
2007-03-17 02:11:45 +03:00
|
|
|
notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_type_t *cr_state)
|
|
|
|
{
|
|
|
|
char *prog_named_pipe_r = NULL, *prog_named_pipe_w = NULL;
|
|
|
|
int prog_named_read_pipe_fd = -1, prog_named_write_pipe_fd = -1;
|
|
|
|
char *loc_fname = NULL, *tmp_pid = NULL;
|
|
|
|
unsigned char cmd;
|
|
|
|
int len, ret;
|
|
|
|
int exit_status = OPAL_SUCCESS;
|
|
|
|
int s, max_wait_time = 20; /* wait time before giving up on the checkpoint */
|
|
|
|
ssize_t tmp_size = 0;
|
|
|
|
int value;
|
|
|
|
|
|
|
|
/* A string copy of the pid */
|
|
|
|
asprintf(&tmp_pid, "%d", pid);
|
|
|
|
|
|
|
|
/* Flip the read/write files for bi-directionality */
|
|
|
|
asprintf(&prog_named_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
|
|
|
|
asprintf(&prog_named_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Signal the application telling it that we wish to checkpoint
|
|
|
|
*/
|
2007-10-09 00:53:02 +04:00
|
|
|
if( 0 != (ret = kill(pid, opal_cr_entry_point_signal) ) ) {
|
2007-03-17 02:11:45 +03:00
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
opal_output_verbose(10, opal_checkpoint_globals.output,
|
|
|
|
"opal_checkpoint: Looking for Named Pipes (%s) (%s)\n",
|
|
|
|
prog_named_pipe_r, prog_named_pipe_w);
|
|
|
|
|
|
|
|
for( s = 0; s < max_wait_time; ++s) {
|
|
|
|
/*
|
|
|
|
* See if the named pipe exists yet for the PID in question
|
|
|
|
*/
|
|
|
|
if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
|
|
|
|
/* File doesn't exist yet, keep waiting */
|
2007-09-27 17:13:29 +04:00
|
|
|
if( !opal_checkpoint_globals.quiet &&
|
|
|
|
s >= max_wait_time - 5 ) {
|
|
|
|
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
|
|
|
prog_named_pipe_r, ret, s, max_wait_time);
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
sleep(1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
|
|
|
|
/* File doesn't exist yet, keep waiting */
|
2007-09-27 17:13:29 +04:00
|
|
|
if( !opal_checkpoint_globals.quiet &&
|
|
|
|
s >= max_wait_time - 5 ) {
|
|
|
|
opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
|
|
|
|
prog_named_pipe_w, ret, s, max_wait_time);
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
sleep(1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2015-06-24 06:59:57 +03:00
|
|
|
if( s == max_wait_time ) {
|
|
|
|
/* The file doesn't exist,
|
2007-03-17 02:11:45 +03:00
|
|
|
* This means that the process didn't open up a named pipe for us
|
|
|
|
* to access their checkpoint notification routine. Therefore,
|
|
|
|
* the application either:
|
|
|
|
* - Doesn't exist
|
|
|
|
* - Isn't checkpointable
|
|
|
|
* In either case there is nothing we can do.
|
|
|
|
*/
|
|
|
|
opal_show_help("help-opal-checkpoint.txt", "pid_does_not_exist", true,
|
|
|
|
opal_checkpoint_globals.pid, prog_named_pipe_r, prog_named_pipe_w);
|
2015-06-24 06:59:57 +03:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
*cr_state = OPAL_CRS_ERROR;
|
2015-06-24 06:59:57 +03:00
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The file does exist, so let's use it */
|
|
|
|
|
|
|
|
/*
|
2015-06-24 06:59:57 +03:00
|
|
|
* Open
|
2007-03-17 02:11:45 +03:00
|
|
|
* - prog_named_write_pipe:
|
|
|
|
* prog makes this file and opens Read Only
|
|
|
|
* this app. opens it Write Only
|
|
|
|
* - prog_named_read_pipe:
|
|
|
|
* prog makes this file and opens Write Only
|
|
|
|
* this app. opens it Read Only
|
|
|
|
*/
|
|
|
|
prog_named_write_pipe_fd = open(prog_named_pipe_w, O_WRONLY);
|
|
|
|
if(prog_named_write_pipe_fd < 0) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_w, prog_named_write_pipe_fd);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
prog_named_read_pipe_fd = open(prog_named_pipe_r, O_RDWR);
|
|
|
|
if(prog_named_read_pipe_fd < 0) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
|
|
|
"opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
|
|
|
|
prog_named_pipe_r, prog_named_read_pipe_fd);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start the handshake
|
|
|
|
*/
|
|
|
|
len = 0;
|
|
|
|
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write handshake to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &value, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_r, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check the response to make sure we can checkpoint this process */
|
|
|
|
if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == value ) {
|
|
|
|
opal_show_help("help-opal-checkpoint.txt",
|
2015-06-24 06:59:57 +03:00
|
|
|
"ckpt:in_progress",
|
2007-03-17 02:11:45 +03:00
|
|
|
true,
|
|
|
|
opal_checkpoint_globals.pid);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
else if( OPAL_CHECKPOINT_CMD_NULL == value ) {
|
|
|
|
opal_show_help("help-opal-checkpoint.txt",
|
2015-06-24 06:59:57 +03:00
|
|
|
"ckpt:req_null",
|
2007-03-17 02:11:45 +03:00
|
|
|
true,
|
|
|
|
opal_checkpoint_globals.pid);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
else if ( OPAL_CHECKPOINT_CMD_ERROR == value ) {
|
|
|
|
opal_show_help("help-opal-checkpoint.txt",
|
2015-06-24 06:59:57 +03:00
|
|
|
"ckpt:req_error",
|
2007-03-17 02:11:45 +03:00
|
|
|
true,
|
|
|
|
opal_checkpoint_globals.pid);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-06-24 06:59:57 +03:00
|
|
|
* Write the checkpoint request and information to the
|
2007-03-17 02:11:45 +03:00
|
|
|
* pipe
|
|
|
|
*/
|
|
|
|
cmd = OPAL_CR_CHECKPOINT;
|
|
|
|
/* Send the command */
|
|
|
|
if( sizeof(cmd) != (ret = write(prog_named_write_pipe_fd, &cmd, sizeof(cmd))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write CHECKPOINT Command to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Send the arguments: {pid, term} */
|
|
|
|
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &pid, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write pid (%d) to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
pid, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &term, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write term (%d) to named pipe (%s), %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
term, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/* Send the snapshot_name argument */
|
2007-03-17 02:11:45 +03:00
|
|
|
len = strlen(opal_checkpoint_globals.snapshot_name) + 1;
|
|
|
|
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write snapshot name len (%d) to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
len, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
tmp_size = sizeof(char) * len;
|
|
|
|
if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_name), (sizeof(char) * len))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write snapshot name (%s) to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_checkpoint_globals.snapshot_name, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Send the snashot location argument */
|
|
|
|
len = strlen(opal_checkpoint_globals.snapshot_loc) + 1;
|
|
|
|
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write snapshot location len (%d) to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
len, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
tmp_size = sizeof(char) * len;
|
|
|
|
if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_loc), (sizeof(char) * len))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to write snapshot location (%s) to named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
opal_checkpoint_globals.snapshot_loc, prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the response from the notification routine on the other
|
|
|
|
* machine.
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_r, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(len > 0) {
|
|
|
|
loc_fname = (char *) malloc(sizeof(char) * len);
|
|
|
|
if( (ssize_t)(sizeof(char) * len) != (ret = read(prog_named_read_pipe_fd, loc_fname, (sizeof(char) * len))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
2015-06-24 06:59:57 +03:00
|
|
|
"opal_checkpoint: Error: Unable to read filename from named pipe (%s). %d\n",
|
2007-03-17 02:11:45 +03:00
|
|
|
prog_named_pipe_w, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*fname = strdup(loc_fname);
|
|
|
|
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &cr_state, sizeof(int))) ) {
|
|
|
|
opal_output(opal_checkpoint_globals.output,
|
|
|
|
"opal_checkpoint: Error: Unable to read state from named pipe (%s). %d\n",
|
|
|
|
prog_named_pipe_r, ret);
|
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
cleanup:
|
2007-03-17 02:11:45 +03:00
|
|
|
/*
|
|
|
|
* Close the pipes now that we are done with it
|
|
|
|
*/
|
|
|
|
close(prog_named_write_pipe_fd);
|
|
|
|
close(prog_named_read_pipe_fd);
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
if( NULL != tmp_pid)
|
2007-03-17 02:11:45 +03:00
|
|
|
free(tmp_pid);
|
2015-06-24 06:59:57 +03:00
|
|
|
if( NULL != prog_named_pipe_r)
|
2007-03-17 02:11:45 +03:00
|
|
|
free(prog_named_pipe_r);
|
2015-06-24 06:59:57 +03:00
|
|
|
if( NULL != prog_named_pipe_w)
|
2007-03-17 02:11:45 +03:00
|
|
|
free(prog_named_pipe_w);
|
|
|
|
|
|
|
|
return exit_status;
|
|
|
|
}
|