42ecffb6d0
Take another shot at untangling the spaghetti orterun: fix for command line parsing orte-submit calls opal_init_util () before parsing out MCA command line options (-mca, -am, etc). This prevents mpirun from setting opal MCA variables for some frameworks as well as the MCA base. This is because when a framework is opened all of its variables are set to read-only. Eventually we want to lift this restriction on some MCA variables but since -mca is affected we must parse out the MCA command line options before opal_init_util(). This commit fixes the bug by adding a new option to opal_cmd_line_parse (ignore unknown option) so orte-submit can pre-parse the command line for MCA options. Signed-off-by: Nathan Hjelm <hjelmn@me.com> Minor cleanups to avoid releasing/recreating the cmd line
739 строки
22 KiB
C
739 строки
22 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* OPAL Restart command
|
|
*
|
|
* This command will restart a single process from
|
|
* the checkpoint generated by the opal-checkpoint
|
|
* command.
|
|
*/
|
|
#include "opal_config.h"
|
|
|
|
#include <stdio.h>
|
|
#include <errno.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#include <stdlib.h>
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif
|
|
#ifdef HAVE_FCNTL_H
|
|
#include <fcntl.h>
|
|
#endif /* HAVE_FCNTL_H */
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
#include <sys/wait.h>
|
|
#endif
|
|
#include <string.h>
|
|
|
|
#include "opal/constants.h"
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/error.h"
|
|
#include "opal/util/basename.h"
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/runtime/opal.h"
|
|
#include "opal/runtime/opal_cr.h"
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
|
|
#include "opal/mca/compress/compress.h"
|
|
#include "opal/mca/compress/base/base.h"
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
static int initialize(int argc, char *argv[]);
|
|
static int finalize(void);
|
|
static int parse_args(int argc, char *argv[]);
|
|
static int check_file(void);
|
|
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot);
|
|
|
|
/*****************************************
|
|
* Global Vars for Command line Arguments
|
|
*****************************************/
|
|
static char *expected_crs_comp = NULL;
|
|
|
|
typedef struct {
|
|
bool help;
|
|
bool verbose;
|
|
char *snapshot_ref;
|
|
char *snapshot_loc;
|
|
char *snapshot_metadata;
|
|
char *snapshot_cache;
|
|
char *snapshot_compress;
|
|
char *snapshot_compress_postfix;
|
|
int output;
|
|
} opal_restart_globals_t;
|
|
|
|
opal_restart_globals_t opal_restart_globals;
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
{ NULL,
|
|
'h', NULL, "help",
|
|
0,
|
|
&opal_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"This help message" },
|
|
|
|
{ NULL,
|
|
'v', NULL, "verbose",
|
|
0,
|
|
&opal_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Be Verbose" },
|
|
|
|
{ NULL,
|
|
'l', NULL, "location",
|
|
1,
|
|
&opal_restart_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Full path to the location of the local snapshot."},
|
|
|
|
{ NULL,
|
|
'm', NULL, "metadata",
|
|
1,
|
|
&opal_restart_globals.snapshot_metadata, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Relative path (with respect to --location) to the metadata file."},
|
|
|
|
{ NULL,
|
|
'r', NULL, "reference",
|
|
1,
|
|
&opal_restart_globals.snapshot_ref, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Local snapshot reference."},
|
|
|
|
{ NULL,
|
|
'c', NULL, "cache",
|
|
1,
|
|
&opal_restart_globals.snapshot_cache, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Possible local cache of the snapshot reference."},
|
|
|
|
{ NULL,
|
|
'd', NULL, "decompress",
|
|
1,
|
|
&opal_restart_globals.snapshot_compress, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Decompression component to use."},
|
|
|
|
{ NULL,
|
|
'p', NULL, "decompress_postfix",
|
|
1,
|
|
&opal_restart_globals.snapshot_compress_postfix, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Decompression component postfix."},
|
|
|
|
/* End of list */
|
|
{ NULL,
|
|
'\0', NULL, NULL,
|
|
0,
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
NULL }
|
|
};
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int ret, exit_status = OPAL_SUCCESS;
|
|
int child_pid;
|
|
int prev_pid = 0;
|
|
int idx;
|
|
opal_crs_base_snapshot_t *snapshot = NULL;
|
|
char * tmp_env_var = NULL;
|
|
bool select = false;
|
|
|
|
/***************
|
|
* Initialize
|
|
***************/
|
|
if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Check for existence of the file, or program in the case of self
|
|
*/
|
|
if( OPAL_SUCCESS != (ret = check_file() )) {
|
|
opal_show_help("help-opal-restart.txt", "invalid_filename", true,
|
|
opal_restart_globals.snapshot_ref);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Re-enable the selection of the CRS component, so we can choose the right one */
|
|
idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");
|
|
|
|
if (0 > idx) {
|
|
opal_output(opal_restart_globals.output,
|
|
"MCA variable opal_crs_base_do_not_select not found\n");
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
|
|
if (OPAL_SUCCESS != ret) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Make sure we are using the correct checkpointer
|
|
*/
|
|
if(NULL == expected_crs_comp) {
|
|
char * full_metadata_path = NULL;
|
|
FILE * metadata = NULL;
|
|
|
|
asprintf(&full_metadata_path, "%s/%s/%s",
|
|
opal_restart_globals.snapshot_loc,
|
|
opal_restart_globals.snapshot_ref,
|
|
opal_restart_globals.snapshot_metadata);
|
|
if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
|
|
opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
|
|
opal_restart_globals.snapshot_metadata,
|
|
full_metadata_path);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
|
|
&expected_crs_comp,
|
|
&prev_pid)) ) {
|
|
opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
|
|
opal_restart_globals.snapshot_metadata,
|
|
full_metadata_path);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
free(full_metadata_path);
|
|
full_metadata_path = NULL;
|
|
|
|
fclose(metadata);
|
|
metadata = NULL;
|
|
}
|
|
|
|
opal_output_verbose(10, opal_restart_globals.output,
|
|
"Restart Expects checkpointer: (%s)",
|
|
expected_crs_comp);
|
|
|
|
(void) mca_base_var_env_name("crs", &tmp_env_var);
|
|
opal_setenv(tmp_env_var,
|
|
expected_crs_comp,
|
|
true, &environ);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
/* Select this component or don't continue.
|
|
* If the selection of this component fails, then we can't
|
|
* restart on this node because it doesn't have the proper checkpointer
|
|
* available.
|
|
*/
|
|
if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
|
|
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
|
"crs", ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
|
|
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
|
expected_crs_comp, ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Make sure we have selected the proper component
|
|
*/
|
|
if(NULL == expected_crs_comp ||
|
|
0 != strncmp(expected_crs_comp,
|
|
opal_crs_base_selected_component.base_version.mca_component_name,
|
|
strlen(expected_crs_comp)) ) {
|
|
opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
|
|
true,
|
|
expected_crs_comp,
|
|
opal_crs_base_selected_component.base_version.mca_component_name,
|
|
ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/******************************
|
|
* Restart in this process
|
|
******************************/
|
|
opal_output_verbose(10, opal_restart_globals.output,
|
|
"Restarting from file (%s)\n",
|
|
opal_restart_globals.snapshot_ref);
|
|
|
|
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
|
snapshot->cold_start = true;
|
|
asprintf(&(snapshot->snapshot_directory), "%s/%s",
|
|
opal_restart_globals.snapshot_loc,
|
|
opal_restart_globals.snapshot_ref);
|
|
asprintf(&(snapshot->metadata_filename), "%s/%s",
|
|
snapshot->snapshot_directory,
|
|
opal_restart_globals.snapshot_metadata);
|
|
|
|
/* Since some checkpoint/restart systems don't pass along env vars to the
|
|
* restarted app, we need to take care of that.
|
|
*
|
|
* Included here is the creation of any files or directories that need to be
|
|
* created before the process is restarted.
|
|
*/
|
|
if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Do the actual restart
|
|
*/
|
|
ret = opal_crs.crs_restart(snapshot,
|
|
false,
|
|
&child_pid);
|
|
|
|
if (OPAL_SUCCESS != ret) {
|
|
opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
|
|
opal_restart_globals.snapshot_ref,
|
|
ret,
|
|
opal_crs_base_selected_component.base_version.mca_component_name);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
/* Should never get here, since crs_restart calls exec */
|
|
|
|
/***************
|
|
* Cleanup
|
|
***************/
|
|
cleanup:
|
|
if (OPAL_SUCCESS != (ret = finalize())) {
|
|
return ret;
|
|
}
|
|
|
|
if(NULL != snapshot )
|
|
OBJ_DESTRUCT(snapshot);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int initialize(int argc, char *argv[])
|
|
{
|
|
int ret, exit_status = OPAL_SUCCESS;
|
|
char * tmp_env_var = NULL;
|
|
|
|
/*
|
|
* Make sure to init util before parse_args
|
|
* to ensure installdirs is setup properly
|
|
* before calling mca_base_open();
|
|
*/
|
|
if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Parse Command line arguments
|
|
*/
|
|
if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Setup OPAL Output handle from the verbose argument
|
|
*/
|
|
if( opal_restart_globals.verbose ) {
|
|
opal_restart_globals.output = opal_output_open(NULL);
|
|
opal_output_set_verbosity(opal_restart_globals.output, 10);
|
|
} else {
|
|
opal_restart_globals.output = 0; /* Default=STDOUT */
|
|
}
|
|
|
|
/*
|
|
* Turn off the selection of the CRS component,
|
|
* we need to do that later
|
|
*/
|
|
(void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var);
|
|
opal_setenv(tmp_env_var,
|
|
"1", /* turn off the selection */
|
|
true, &environ);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
/*
|
|
* Make sure we select the proper compress component.
|
|
*/
|
|
if( NULL != opal_restart_globals.snapshot_compress ) {
|
|
(void) mca_base_var_env_name("compress", &tmp_env_var);
|
|
opal_setenv(tmp_env_var,
|
|
opal_restart_globals.snapshot_compress,
|
|
true, &environ);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
}
|
|
|
|
/*
|
|
* Initialize the OPAL layer
|
|
*/
|
|
if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* If the checkpoint was compressed, then decompress it before continuing
|
|
*/
|
|
if( NULL != opal_restart_globals.snapshot_compress ) {
|
|
char * zip_dir = NULL;
|
|
char * tmp_str = NULL;
|
|
|
|
/* Make sure to clear the selection for the restart,
|
|
* this way the user can swich compression mechanism
|
|
* across restart
|
|
*/
|
|
(void) mca_base_var_env_name("compress", &tmp_env_var);
|
|
opal_unsetenv(tmp_env_var, &environ);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
asprintf(&zip_dir, "%s/%s%s",
|
|
opal_restart_globals.snapshot_loc,
|
|
opal_restart_globals.snapshot_ref,
|
|
opal_restart_globals.snapshot_compress_postfix);
|
|
|
|
if (0 > (ret = access(zip_dir, F_OK)) ) {
|
|
opal_output(opal_restart_globals.output,
|
|
"Error: Unable to access the file [%s]!",
|
|
zip_dir);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
opal_output_verbose(10, opal_restart_globals.output,
|
|
"Decompressing (%s)",
|
|
zip_dir);
|
|
|
|
opal_compress.decompress(zip_dir, &tmp_str);
|
|
|
|
if( NULL != zip_dir ) {
|
|
free(zip_dir);
|
|
zip_dir = NULL;
|
|
}
|
|
if( NULL != tmp_str ) {
|
|
free(tmp_str);
|
|
tmp_str = NULL;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If a cache directory has been suggested, see if it exists
|
|
*/
|
|
if( NULL != opal_restart_globals.snapshot_cache ) {
|
|
if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) {
|
|
opal_output_verbose(10, opal_restart_globals.output,
|
|
"Using the cached snapshot (%s) instead of (%s)",
|
|
opal_restart_globals.snapshot_cache,
|
|
opal_restart_globals.snapshot_loc);
|
|
if( NULL != opal_restart_globals.snapshot_loc ) {
|
|
free(opal_restart_globals.snapshot_loc);
|
|
opal_restart_globals.snapshot_loc = NULL;
|
|
}
|
|
opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache);
|
|
} else {
|
|
opal_show_help("help-opal-restart.txt", "cache_not_avail", true,
|
|
opal_restart_globals.snapshot_cache,
|
|
opal_restart_globals.snapshot_loc);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Mark this process as a tool
|
|
*/
|
|
opal_cr_is_tool = true;
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int finalize(void)
|
|
{
|
|
#if 0
|
|
int ret;
|
|
|
|
/*
|
|
* JJH: Comment this out for now. It should only be called
|
|
* when exec fails, and opal-restart is shutting down.
|
|
* Currently BLCR is calling opal_even_fini() in the restart
|
|
* functionality, so calling it twice is causing a segv.
|
|
* Since we do not really need to do this, just comment it out
|
|
* for now.
|
|
*/
|
|
if (OPAL_SUCCESS != (ret = opal_finalize())) {
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int parse_args(int argc, char *argv[])
|
|
{
|
|
int i, ret, len;
|
|
opal_cmd_line_t cmd_line;
|
|
char **app_env = NULL, **global_env = NULL;
|
|
|
|
opal_restart_globals.help = false;
|
|
opal_restart_globals.verbose = false;
|
|
opal_restart_globals.snapshot_ref = NULL;
|
|
opal_restart_globals.snapshot_loc = NULL;
|
|
opal_restart_globals.snapshot_metadata = NULL;
|
|
opal_restart_globals.snapshot_cache = NULL;
|
|
opal_restart_globals.snapshot_compress = NULL;
|
|
opal_restart_globals.snapshot_compress_postfix = NULL;
|
|
opal_restart_globals.output = 0;
|
|
|
|
/* Parse the command line options */
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
mca_base_open();
|
|
mca_base_cmd_line_setup(&cmd_line);
|
|
ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
|
|
if (OPAL_SUCCESS != ret) {
|
|
if (OPAL_ERR_SILENT != ret) {
|
|
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
|
opal_strerror(ret));
|
|
}
|
|
return 1;
|
|
}
|
|
if (opal_restart_globals.help ) {
|
|
char *str, *args = NULL;
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
str = opal_show_help_string("help-opal-restart.txt", "usage", true,
|
|
args);
|
|
if (NULL != str) {
|
|
printf("%s", str);
|
|
free(str);
|
|
}
|
|
free(args);
|
|
/* If we show the help message, that should be all we do */
|
|
exit(0);
|
|
}
|
|
|
|
/**
|
|
* Put all of the MCA arguments in the environment
|
|
*/
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
len = opal_argv_count(app_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(app_env[i]);
|
|
}
|
|
|
|
len = opal_argv_count(global_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(global_env[i]);
|
|
}
|
|
|
|
/**
|
|
* Now start parsing our specific arguments
|
|
*/
|
|
/* get the remaining bits */
|
|
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
|
|
|
if ( NULL == opal_restart_globals.snapshot_ref ||
|
|
0 >= strlen(opal_restart_globals.snapshot_ref) ) {
|
|
opal_show_help("help-opal-restart.txt", "invalid_filename", true,
|
|
"<none provided>");
|
|
return OPAL_ERROR;
|
|
}
|
|
|
|
/* If we have arguments after the command, then assume they
|
|
* need to be grouped together.
|
|
* Useful in the 'mca crs self' instance.
|
|
*/
|
|
if(argc > 0) {
|
|
opal_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int check_file(void)
|
|
{
|
|
int exit_status = OPAL_SUCCESS;
|
|
int ret;
|
|
char * path_to_check = NULL;
|
|
|
|
if(NULL == opal_restart_globals.snapshot_ref) {
|
|
opal_output(opal_restart_globals.output,
|
|
"Error: No filename provided!");
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Check for the existance of the snapshot handle in the snapshot directory
|
|
*/
|
|
asprintf(&path_to_check, "%s/%s",
|
|
opal_restart_globals.snapshot_loc,
|
|
opal_restart_globals.snapshot_ref);
|
|
|
|
opal_output_verbose(10, opal_restart_globals.output,
|
|
"Checking for the existence of (%s)",
|
|
path_to_check);
|
|
|
|
if (0 > (ret = access(path_to_check, F_OK)) ) {
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
if( NULL != path_to_check) {
|
|
free(path_to_check);
|
|
path_to_check = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
|
|
{
|
|
int ret, exit_status = OPAL_SUCCESS;
|
|
char *command = NULL;
|
|
char *proc_file = NULL;
|
|
char **loc_touch = NULL;
|
|
char **loc_mkdir = NULL;
|
|
int argc, i;
|
|
|
|
if( 0 > prev_pid ) {
|
|
opal_output(opal_restart_globals.output,
|
|
"Invalid PID (%d)\n",
|
|
prev_pid);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* This is needed so we can pass the previous environment to the restarted
|
|
* application process.
|
|
*/
|
|
asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
|
|
asprintf(&command, "env | grep OMPI_ > %s", proc_file);
|
|
|
|
opal_output_verbose(5, opal_restart_globals.output,
|
|
"post_env_vars: Execute: <%s>", command);
|
|
|
|
ret = system(command);
|
|
if( 0 > ret) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Any directories that need to be created
|
|
*/
|
|
if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
|
|
opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
|
|
opal_restart_globals.snapshot_metadata,
|
|
snapshot->metadata_filename);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
|
|
argc = opal_argv_count(loc_mkdir);
|
|
for( i = 0; i < argc; ++i ) {
|
|
if( NULL != command ) {
|
|
free(command);
|
|
command = NULL;
|
|
}
|
|
asprintf(&command, "mkdir -p %s", loc_mkdir[i]);
|
|
|
|
opal_output_verbose(5, opal_restart_globals.output,
|
|
"post_env_vars: Execute: <%s>", command);
|
|
|
|
ret = system(command);
|
|
if( 0 > ret) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
if( 0 < argc ) {
|
|
system("sync ; sync");
|
|
}
|
|
|
|
/*
|
|
* Any files that need to exist
|
|
*/
|
|
opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
|
|
argc = opal_argv_count(loc_touch);
|
|
for( i = 0; i < argc; ++i ) {
|
|
if( NULL != command ) {
|
|
free(command);
|
|
command = NULL;
|
|
}
|
|
asprintf(&command, "touch %s", loc_touch[i]);
|
|
|
|
opal_output_verbose(5, opal_restart_globals.output,
|
|
"post_env_vars: Execute: <%s>", command);
|
|
|
|
ret = system(command);
|
|
if( 0 > ret) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
if( 0 < argc ) {
|
|
system("sync ; sync");
|
|
}
|
|
|
|
cleanup:
|
|
if( NULL != command) {
|
|
free(command);
|
|
command = NULL;
|
|
}
|
|
if( NULL != proc_file) {
|
|
free(proc_file);
|
|
proc_file = NULL;
|
|
}
|
|
if( NULL != loc_mkdir ) {
|
|
opal_argv_free(loc_mkdir);
|
|
loc_mkdir = NULL;
|
|
}
|
|
if( NULL != loc_touch ) {
|
|
opal_argv_free(loc_touch);
|
|
loc_touch = NULL;
|
|
}
|
|
|
|
if( NULL != snapshot->metadata ) {
|
|
fclose(snapshot->metadata);
|
|
snapshot->metadata = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|