1
1
openmpi/opal/tools/opal-restart/opal-restart.c
Josh Hursey 5d0607395d A couple of C/R related commits that have been sitting off-trunk for a while.
* Add 'orte-checkpoint -l' option that lists all checkpoints currently available on the system.
 * Add 'orte-restart -i' which prints information regarding the checkpoint targeted for restart.
 * Add ability to extract the timing metadata.
 * Fix show_help() in the orte-checkpoint and orte-restart tools. They should be using the opal versions instead of the orte versions (otherwise nothing is printed).

This commit was SVN r21194.
2009-05-08 19:41:11 +00:00

634 строки
18 KiB
C

/*
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* OPAL Restart command
*
* This command will restart a single process from
* the checkpoint generated by the opal-checkpoint
* command.
*/
#include "opal_config.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/constants.h"
#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
/******************
* Local Functions
******************/
static int initialize(int argc, char *argv[]);
static int finalize(void);
static int parse_args(int argc, char *argv[]);
static int check_file(char *given_filename);
static int post_env_vars(int prev_pid, char *location);
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
static char *expected_crs_comp = NULL;
typedef struct {
bool help;
char *filename;
bool verbose;
bool forked;
bool self_case;
char *snapshot_loc;
int output;
} opal_restart_globals_t;
opal_restart_globals_t opal_restart_globals;
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&opal_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, NULL, NULL,
'v', NULL, "verbose",
0,
&opal_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL, NULL, NULL,
'\0', NULL, "fork",
0,
&opal_restart_globals.forked, OPAL_CMD_LINE_TYPE_BOOL,
"Fork off a new process which is the restarted process instead of "
"replacing opal_restart" },
{ "crs", "base", "snapshot_dir",
'w', NULL, "where",
1,
&opal_restart_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
"Where to find the checkpoint files. In most cases this is automatically "
"detected, however if a custom location was specified to opal-checkpoint "
"then this argument is meant to match it."},
/*
* We do this instead of using the '-mca crs self' convention as to not
* influence the user into thinking that they need to do this for all of the
* checkpointers. And to reinforce that the 'self' module is an exception, and
* all other modules are automaticly detected.
*/
{ NULL, NULL, NULL,
's', NULL, "self",
0,
&opal_restart_globals.self_case, OPAL_CMD_LINE_TYPE_BOOL,
"Is this a restart using the 'self' module. This is a special case as all "
"other modules are automaticly detected" },
/* End of list */
{ NULL, NULL, NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
int
main(int argc, char *argv[])
{
int ret, exit_status = OPAL_SUCCESS;
int child_pid;
int prev_pid = 0;
opal_crs_base_snapshot_t *snapshot = NULL;
char * tmp_env_var = NULL;
/***************
* Initialize
***************/
if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
exit_status = ret;
goto cleanup;
}
/*
* Check for existence of the file, or program in the case of self
*/
if( OPAL_SUCCESS != (ret = check_file(opal_restart_globals.filename) )) {
opal_show_help("help-opal-restart.txt", "invalid_filename", true,
opal_restart_globals.filename);
exit_status = ret;
goto cleanup;
}
/* Re-enable the selection of the CRS component, so we can choose the right one */
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
opal_setenv(tmp_env_var,
"0", /* turn on the selection */
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Make sure we are using the correct checkpointer
*/
if(NULL == expected_crs_comp) {
char * base = NULL;
base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename);
opal_crs_base_extract_expected_component(base, &expected_crs_comp, &prev_pid);
free(base);
}
opal_output_verbose(10, opal_restart_globals.output,
"Restart Expects checkpointer: (%s)",
expected_crs_comp);
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
expected_crs_comp,
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/* Select this component or don't continue.
* If the selection of this component fails, then we can't
* restart on this node because it doesn't have the proper checkpointer
* available.
*/
if( OPAL_SUCCESS != (ret = opal_crs_base_open()) ) {
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
"crs", ret);
exit_status = ret;
goto cleanup;
}
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
expected_crs_comp, ret);
exit_status = ret;
goto cleanup;
}
/*
* Make sure we have selected the proper component
*/
if(NULL == expected_crs_comp ||
0 != strncmp(expected_crs_comp,
opal_crs_base_selected_component.base_version.mca_component_name,
strlen(expected_crs_comp)) ) {
opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
true,
expected_crs_comp,
opal_crs_base_selected_component.base_version.mca_component_name,
ret);
exit_status = ret;
goto cleanup;
}
/******************************
* Restart in this process
******************************/
opal_output_verbose(10, opal_restart_globals.output,
"Restarting from file (%s)",
opal_restart_globals.filename);
if( opal_restart_globals.forked ) {
opal_output_verbose(10, opal_restart_globals.output,
"\t Forking off a child");
} else {
opal_output_verbose(10, opal_restart_globals.output,
"\t Exec in self");
}
/* JJH: Do not unsetenv(opal_cr_is_tool) here, as it will impact the
* JJH: application improperly. */
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
snapshot->cold_start = true;
snapshot->reference_name = strdup(opal_restart_globals.filename);
snapshot->local_location = opal_crs_base_get_snapshot_directory(snapshot->reference_name);
snapshot->remote_location = strdup(snapshot->local_location);
/* Since some checkpoint/restart systems don't pass along env vars to the
* restarted app, we need to take care of that.
*
* Included here is the creation of any files or directories that need to be
* created before the process is restarted.
*/
if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot->local_location) ) ) {
exit_status = ret;
goto cleanup;
}
/*
* Do the actual restart
*/
ret = opal_crs.crs_restart(snapshot,
opal_restart_globals.forked,
&child_pid);
if (OPAL_SUCCESS != ret) {
opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
opal_restart_globals.filename, ret);
exit_status = ret;
goto cleanup;
}
/* If we required it to exec in self, then fail if this function returns. */
if(!opal_restart_globals.forked) {
opal_show_help("help-opal-restart.txt", "failed-to-exec", true,
expected_crs_comp,
opal_crs_base_selected_component.base_version.mca_component_name);
exit_status = ret;
goto cleanup;
}
opal_output_verbose(10, opal_restart_globals.output,
"opal_restart: Restarted Child with PID = %d\n", child_pid);
/***************
* Cleanup
***************/
cleanup:
if (OPAL_SUCCESS != (ret = finalize())) {
return ret;
}
if(NULL != snapshot )
OBJ_DESTRUCT(snapshot);
return exit_status;
}
static int initialize(int argc, char *argv[])
{
int ret, exit_status = OPAL_SUCCESS;
char * tmp_env_var = NULL;
/*
* Make sure to init util before parse_args
* to ensure installdirs is setup properly
* before calling mca_base_open();
*/
if( OPAL_SUCCESS != (ret = opal_init_util()) ) {
return ret;
}
/*
* Parse Command line arguments
*/
if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
goto cleanup;
exit_status = ret;
}
/*
* Setup OPAL Output handle from the verbose argument
*/
if( opal_restart_globals.verbose ) {
opal_restart_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(opal_restart_globals.output, 10);
} else {
opal_restart_globals.output = 0; /* Default=STDOUT */
}
/*
* Turn off the selection of the CRS component,
* we need to do that later
*/
tmp_env_var = mca_base_param_env_var("crs_base_do_not_select");
opal_setenv(tmp_env_var,
"1", /* turn off the selection */
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/*
* Initialize the OPAL layer
*/
if (OPAL_SUCCESS != (ret = opal_init())) {
exit_status = ret;
goto cleanup;
}
cleanup:
return exit_status;
}
static int finalize(void)
{
int ret;
if (OPAL_SUCCESS != (ret = opal_finalize())) {
return ret;
}
return OPAL_SUCCESS;
}
static int parse_args(int argc, char *argv[])
{
int i, ret, len;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
opal_restart_globals_t tmp = { false, NULL, false, false, false, NULL, 0 };
opal_restart_globals = tmp;
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
/**
* Put all of the MCA arguments in the environment
*/
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
len = opal_argv_count(app_env);
for(i = 0; i < len; ++i) {
putenv(app_env[i]);
}
len = opal_argv_count(global_env);
for(i = 0; i < len; ++i) {
putenv(global_env[i]);
}
/* JJH: Do not setenv(opal_cr_is_tool, 1) here, as it will impact the
* JJH: application improperly. */
/**
* Now start parsing our specific arguments
*/
if (OPAL_SUCCESS != ret ||
opal_restart_globals.help ||
1 >= argc) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-opal-restart.txt", "usage", true,
args);
free(args);
return OPAL_ERROR;
}
/* get the remaining bits */
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
if ( 1 > argc ) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-opal-restart.txt", "usage", true,
args);
free(args);
return OPAL_ERROR;
}
opal_restart_globals.filename = strdup(argv[0]);
if ( NULL == opal_restart_globals.filename ||
0 >= strlen(opal_restart_globals.filename) ) {
opal_show_help("help-opal-restart.txt", "invalid_filename", true,
opal_restart_globals.filename);
return OPAL_ERROR;
}
/* If we have arguments after the command, then assume they
* need to be grouped together.
* Useful in the 'mca crs self' instance.
*/
if(argc > 1) {
opal_restart_globals.filename = strdup(opal_argv_join(argv, ' '));
}
/*
* Due to the special nature of the 'self' module, we need to know if that is the
* requested module.
* If it is then we don't need to do the 'snapshot reference' handling,
* If it is NOT then we need to extract the requested CRS module from the
* metadata file, and use that.
* Should note that the 'self' module is the only one that needs to be specified
* to opal_restart, all others are detected dynamicly.
*/
if( opal_restart_globals.self_case ) {
/* They are not required to explicitly use the '-mca crs self' convention,
* so set the environment var for them */
expected_crs_comp = strdup("self");
}
return OPAL_SUCCESS;
}
static int check_file(char *given_filename)
{
int exit_status = OPAL_SUCCESS;
int ret;
char * path_to_check = NULL;
char **argv = NULL;
if(NULL == given_filename) {
opal_output(opal_restart_globals.output,
"Error: No filename provided!");
exit_status = OPAL_ERROR;
goto cleanup;
}
/* If this is the self case then we need to check that the application
* exists
*/
if(opal_restart_globals.self_case) {
if( NULL == (argv = opal_argv_split(given_filename, ' ')) ) {
exit_status = OPAL_ERROR;
goto cleanup;
}
/* Extract just the application name */
path_to_check = strdup(argv[0]);
}
/* Otherwise we are checking for the existance of the snapshot handle
* in the snapshot directory
*/
else {
path_to_check = opal_crs_base_get_snapshot_directory(given_filename);
}
/* Do the check */
opal_output_verbose(10, opal_restart_globals.output,
"Checking for the existence of (%s)",
path_to_check);
if (0 > (ret = access(path_to_check, F_OK)) ) {
opal_output(opal_restart_globals.output,
"Error: Unable to access the path [%s]!",
path_to_check);
exit_status = OPAL_ERROR;
goto cleanup;
}
cleanup:
if( NULL != path_to_check)
free(path_to_check);
if( NULL != argv)
opal_argv_free(argv);
return exit_status;
}
static int post_env_vars(int prev_pid, char *location)
{
int ret, exit_status = OPAL_SUCCESS;
char *command = NULL;
char *proc_file = NULL;
char **loc_touch = NULL;
char **loc_mkdir = NULL;
int argc, i;
if( 0 > prev_pid ) {
opal_output(opal_restart_globals.output,
"Invalid PID (%d)\n",
prev_pid);
exit_status = OPAL_ERROR;
goto cleanup;
}
/*
* JJH: Hardcode /tmp to match opal/runtime/opal_cr.c in the application.
* This is needed so we can pass the previous environment to the restarted
* application process.
*/
asprintf(&proc_file, "/tmp/%s-%d", OPAL_CR_BASE_ENV_NAME, prev_pid);
asprintf(&command, "env | grep OMPI_ > %s", proc_file);
opal_output_verbose(5, opal_restart_globals.output,
"post_env_vars: Execute: <%s>", command);
ret = system(command);
if( 0 > ret) {
exit_status = ret;
goto cleanup;
}
/*
* Any directories that need to be created
*/
opal_crs_base_metadata_read_token(location, CRS_METADATA_MKDIR, &loc_mkdir);
argc = opal_argv_count(loc_mkdir);
for( i = 0; i < argc; ++i ) {
if( NULL != command ) {
free(command);
command = NULL;
}
asprintf(&command, "mkdir -p %s", loc_mkdir[i]);
opal_output_verbose(5, opal_restart_globals.output,
"post_env_vars: Execute: <%s>", command);
ret = system(command);
if( 0 > ret) {
exit_status = ret;
goto cleanup;
}
}
if( 0 < argc ) {
system("sync ; sync");
}
/*
* Any files that need to exist
*/
opal_crs_base_metadata_read_token(location, CRS_METADATA_TOUCH, &loc_touch);
argc = opal_argv_count(loc_touch);
for( i = 0; i < argc; ++i ) {
if( NULL != command ) {
free(command);
command = NULL;
}
asprintf(&command, "touch %s", loc_touch[i]);
opal_output_verbose(5, opal_restart_globals.output,
"post_env_vars: Execute: <%s>", command);
ret = system(command);
if( 0 > ret) {
exit_status = ret;
goto cleanup;
}
}
if( 0 < argc ) {
system("sync ; sync");
}
cleanup:
if( NULL != command) {
free(command);
command = NULL;
}
if( NULL != proc_file) {
free(proc_file);
proc_file = NULL;
}
if( NULL != loc_mkdir ) {
opal_argv_free(loc_mkdir);
loc_mkdir = NULL;
}
if( NULL != loc_touch ) {
opal_argv_free(loc_touch);
loc_touch = NULL;
}
return exit_status;
}