e9e4d2a4bc
The Open MPI code base assumed that asprintf always behaved like the FreeBSD variant, where ptr is set to NULL on error. However, the C standard (and Linux) only guarantee that the return code will be -1 on error and leave ptr undefined. Rather than fix all the usage in the code, we use opal_asprintf() wrapper instead, which guarantees the BSD-like behavior of ptr always being set to NULL. In addition to being correct, this will fix many, many warnings in the Open MPI code base. Signed-off-by: Brian Barrett <bbarrett@amazon.com>
758 строки
23 KiB
C
758 строки
23 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
*
|
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include <sys/types.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#include <string.h>
|
|
#ifdef HAVE_DLFCN_H
|
|
#include <dlfcn.h>
|
|
#endif
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/printf.h"
|
|
|
|
#include "opal/constants.h"
|
|
#include "opal/mca/base/mca_base_var.h"
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
#include "opal/runtime/opal_cr.h"
|
|
|
|
#include "crs_self.h"
|
|
|
|
/*
|
|
* Self module
|
|
*/
|
|
static opal_crs_base_module_t loc_module = {
|
|
/** Initialization Function */
|
|
opal_crs_self_module_init,
|
|
/** Finalization Function */
|
|
opal_crs_self_module_finalize,
|
|
|
|
/** Checkpoint interface */
|
|
opal_crs_self_checkpoint,
|
|
|
|
/** Restart Command Access */
|
|
opal_crs_self_restart,
|
|
|
|
/** Disable checkpoints */
|
|
opal_crs_self_disable_checkpoint,
|
|
/** Enable checkpoints */
|
|
opal_crs_self_enable_checkpoint,
|
|
|
|
/** Prelaunch */
|
|
opal_crs_self_prelaunch,
|
|
|
|
/** Register Thread */
|
|
opal_crs_self_reg_thread
|
|
};
|
|
|
|
/*
|
|
* Snapshot structure
|
|
*/
|
|
OBJ_CLASS_DECLARATION(opal_crs_self_snapshot_t);
|
|
|
|
struct opal_crs_self_snapshot_t {
|
|
/** Base CRS snapshot type */
|
|
opal_crs_base_snapshot_t super;
|
|
/** Command Line used to restart the app */
|
|
char * cmd_line;
|
|
};
|
|
typedef struct opal_crs_self_snapshot_t opal_crs_self_snapshot_t;
|
|
|
|
/* Constructor / destructor registered for opal_crs_self_snapshot_t below. */
static void opal_crs_self_construct(opal_crs_self_snapshot_t *snapshot);
static void opal_crs_self_destruct(opal_crs_self_snapshot_t *snapshot);

/* Class instance: derives from the base CRS snapshot class. */
OBJ_CLASS_INSTANCE(opal_crs_self_snapshot_t,
                   opal_crs_base_snapshot_t,
                   opal_crs_self_construct,
                   opal_crs_self_destruct);
|
|
|
|
|
|
/* Generic function-pointer type that receives the result of a dlsym() lookup
 * before it is cast to the specific callback type. */
typedef void (*opal_crs_self_dlsym_dummy_fn_t)(void);
|
|
|
|
/************************************
|
|
* Locally Global vars & functions :)
|
|
************************************/
|
|
static int crs_self_find_function(char *prefix, char *suffix,
|
|
opal_crs_self_dlsym_dummy_fn_t *fn_ptr);
|
|
|
|
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot);
|
|
|
|
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd);
|
|
static int self_cold_start(opal_crs_self_snapshot_t *snapshot);
|
|
|
|
/*
 * Snapshot constructor: a new snapshot starts without a restart command.
 */
void opal_crs_self_construct(opal_crs_self_snapshot_t *snapshot)
{
    /* No command line is known until a checkpoint or cold start sets one. */
    snapshot->cmd_line = NULL;
}
|
|
|
|
/*
 * Snapshot destructor: release the restart command line owned by the
 * snapshot, if any.
 */
void opal_crs_self_destruct(opal_crs_self_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no explicit NULL guard is needed. */
    free(snapshot->cmd_line);
}
|
|
|
|
/* Resolve the user-level checkpoint/continue/restart callbacks. */
static int opal_crs_self_extract_callbacks(void);
|
|
|
|
/*
|
|
* MCA Functions
|
|
*/
|
|
/*
 * MCA component query.
 *
 * module   - out: the self module on success, NULL on failure
 * priority - out: selection priority (0 for tools, the component's
 *            configured priority for applications, -1 on failure)
 *
 * Returns OPAL_SUCCESS when the module may be selected, OPAL_ERROR otherwise.
 */
int opal_crs_self_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: component_query()");

    if (!opal_cr_is_tool) {
        /* Application process: the user-level callbacks must be present. */
        const int rc = opal_crs_self_extract_callbacks();

        if (OPAL_SUCCESS != rc || !mca_crs_self_component.can_checkpoint) {
            *priority = -1;
            *module = NULL;
            return OPAL_ERROR;
        }
        *priority = mca_crs_self_component.super.priority;
    } else {
        /*
         * Tools get the module at the lowest priority.  This lets 'mpirun'
         * pick the 'none' component (its priority is higher than 0) while
         * 'opal-restart' can still select this component, since it only
         * ever requests the specific component named in the snapshot
         * metadata file.
         */
        *priority = 0;
    }

    *module = (mca_base_module_t *) &loc_module;
    return OPAL_SUCCESS;
}
|
|
|
|
static int opal_crs_self_extract_callbacks(void)
|
|
{
|
|
opal_crs_self_dlsym_dummy_fn_t loc_fn;
|
|
|
|
/*
|
|
* Find the function names
|
|
*/
|
|
crs_self_find_function(mca_crs_self_component.prefix,
|
|
SUFFIX_CHECKPOINT,
|
|
&loc_fn);
|
|
mca_crs_self_component.ucb_checkpoint_fn = (opal_crs_self_checkpoint_callback_fn_t)loc_fn;
|
|
|
|
crs_self_find_function(mca_crs_self_component.prefix,
|
|
SUFFIX_CONTINUE,
|
|
&loc_fn);
|
|
mca_crs_self_component.ucb_continue_fn = (opal_crs_self_continue_callback_fn_t)loc_fn;
|
|
|
|
crs_self_find_function(mca_crs_self_component.prefix,
|
|
SUFFIX_RESTART,
|
|
&loc_fn);
|
|
mca_crs_self_component.ucb_restart_fn = (opal_crs_self_restart_callback_fn_t)loc_fn;
|
|
|
|
/*
|
|
* Sanity check
|
|
*/
|
|
mca_crs_self_component.can_checkpoint = true;
|
|
|
|
if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
|
|
mca_crs_self_component.can_checkpoint = false;
|
|
}
|
|
if(NULL == mca_crs_self_component.ucb_continue_fn) {
|
|
}
|
|
if(NULL == mca_crs_self_component.ucb_restart_fn) {
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_self_module_init(void)
|
|
{
|
|
bool callback_matched = true;
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: module_init()");
|
|
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Sanity check
|
|
*/
|
|
if(NULL == mca_crs_self_component.ucb_checkpoint_fn) {
|
|
callback_matched = false;
|
|
mca_crs_self_component.can_checkpoint = false;
|
|
}
|
|
if(NULL == mca_crs_self_component.ucb_continue_fn) {
|
|
callback_matched = false;
|
|
}
|
|
if(NULL == mca_crs_self_component.ucb_restart_fn) {
|
|
callback_matched = false;
|
|
}
|
|
if( !callback_matched ) {
|
|
if( 1 <= mca_crs_self_component.super.verbose ) {
|
|
opal_show_help("help-opal-crs-self.txt", "self:no_callback", false,
|
|
"checkpoint", mca_crs_self_component.prefix, SUFFIX_CHECKPOINT,
|
|
"continue ", mca_crs_self_component.prefix, SUFFIX_CONTINUE,
|
|
"restart ", mca_crs_self_component.prefix, SUFFIX_RESTART,
|
|
PREFIX_DEFAULT);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If the user requested that we do_restart, then call their callback
|
|
*/
|
|
if(mca_crs_self_component.do_restart) {
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: module_init: Call their restart function");
|
|
if( NULL != mca_crs_self_component.ucb_restart_fn)
|
|
mca_crs_self_component.ucb_restart_fn();
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_self_module_finalize(void)
|
|
{
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: module_finalize()");
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
|
|
int opal_crs_self_checkpoint(pid_t pid,
|
|
opal_crs_base_snapshot_t *base_snapshot,
|
|
opal_crs_base_ckpt_options_t *options,
|
|
opal_crs_state_type_t *state)
|
|
{
|
|
opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
|
|
int ret, exit_status = OPAL_SUCCESS;
|
|
char * restart_cmd = NULL;
|
|
|
|
/*
|
|
* This function should never be called by a tool
|
|
*/
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
if( options->stop ) {
|
|
opal_output(0,
|
|
"crs:self: checkpoint(): Error: SIGSTOP Not currently supported!");
|
|
}
|
|
|
|
/*
|
|
* Setup for snapshot directory creation
|
|
*/
|
|
snapshot->super = *base_snapshot;
|
|
#if 0
|
|
snapshot->super.snapshot_directory = strdup(base_snapshot->snapshot_directory);
|
|
snapshot->super.metadata_filename = strdup(base_snapshot->metadata_filename);
|
|
#endif
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: checkpoint(%d, ---)", pid);
|
|
|
|
if(!mca_crs_self_component.can_checkpoint) {
|
|
opal_show_help("help-opal-crs-self.txt", "self:ckpt_disabled", false);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Update the snapshot metadata
|
|
*/
|
|
snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
|
|
if( NULL == snapshot->super.metadata ) {
|
|
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: checkpoint(): Error: Unable to open the file (%s)",
|
|
snapshot->super.metadata_filename);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
|
|
|
|
/*
|
|
* Call the user callback function
|
|
*/
|
|
if(NULL != mca_crs_self_component.ucb_checkpoint_fn) {
|
|
mca_crs_self_component.ucb_checkpoint_fn(&restart_cmd);
|
|
}
|
|
|
|
/*
|
|
* Save the restart command
|
|
*/
|
|
if( NULL == restart_cmd) {
|
|
*state = OPAL_CRS_ERROR;
|
|
opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
|
|
true);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
else {
|
|
snapshot->cmd_line = strdup(restart_cmd);
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: checkpoint: Restart Command (%s)", snapshot->cmd_line);
|
|
}
|
|
|
|
/*
|
|
* The best we can do is update the metadata file with the
|
|
* application argv and argc we started with.
|
|
*/
|
|
if( OPAL_SUCCESS != (ret = self_update_snapshot_metadata(snapshot)) ) {
|
|
*state = OPAL_CRS_ERROR;
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
|
|
snapshot->super.metadata_filename);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
|
|
*state = OPAL_CRS_CONTINUE;
|
|
|
|
/*
|
|
* Call their continue routine for completeness
|
|
*/
|
|
if(NULL != mca_crs_self_component.ucb_continue_fn) {
|
|
mca_crs_self_component.ucb_continue_fn();
|
|
}
|
|
|
|
base_snapshot = &(snapshot->super);
|
|
|
|
cleanup:
|
|
if( NULL != restart_cmd) {
|
|
free(restart_cmd);
|
|
restart_cmd = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
/*
|
|
* Notice that the user restart callback is not called here, but always from
|
|
* opal_init for the self module.
|
|
*/
|
|
int opal_crs_self_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
|
|
{
|
|
opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
|
|
char **cr_argv = NULL;
|
|
char * cr_cmd = NULL;
|
|
int ret;
|
|
int exit_status = OPAL_SUCCESS;
|
|
int status;
|
|
|
|
snapshot->super = *base_snapshot;
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: restart(%d)", spawn_child);
|
|
|
|
/*
|
|
* If we need to reconstruct the snapshot
|
|
*/
|
|
if(snapshot->super.cold_start) {
|
|
if( OPAL_SUCCESS != (ret = self_cold_start(snapshot)) ){
|
|
exit_status = ret;
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* JJH: Check to make sure the application exists?
|
|
*/
|
|
|
|
/*
|
|
* Get the restart command
|
|
*/
|
|
if ( OPAL_SUCCESS != (ret = opal_crs_self_restart_cmd(snapshot, &cr_cmd)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
|
|
if (!spawn_child) {
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: SELF: exec :(%s, %s):",
|
|
strdup(cr_argv[0]),
|
|
opal_argv_join(cr_argv, ' '));
|
|
|
|
status = execvp(strdup(cr_argv[0]), cr_argv);
|
|
|
|
if(status < 0) {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: SELF: Child failed to execute :(%d):", status);
|
|
}
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: SELF: execvp returned %d", status);
|
|
exit_status = status;
|
|
goto cleanup;
|
|
}
|
|
else {
|
|
*child_pid = fork();
|
|
if( *child_pid == 0) {
|
|
/* Child Process */
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: CHILD: exec :(%s, %s):",
|
|
strdup(cr_argv[0]),
|
|
opal_argv_join(cr_argv, ' '));
|
|
|
|
status = execvp(strdup(cr_argv[0]), cr_argv);
|
|
|
|
if(status < 0) {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: CHILD: Child failed to execute :(%d):", status);
|
|
}
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: CHILD: execvp returned %d", status);
|
|
exit_status = status;
|
|
goto cleanup;
|
|
}
|
|
else if(*child_pid > 0) {
|
|
/* Parent is done once it is started. */
|
|
;
|
|
}
|
|
else {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: self_restart: CHILD: fork failed :(%d):", *child_pid);
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
if( NULL != cr_cmd)
|
|
free(cr_cmd);
|
|
if( NULL != cr_argv)
|
|
opal_argv_free(cr_argv);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
int opal_crs_self_disable_checkpoint(void)
|
|
{
|
|
/*
|
|
* This function should never be called by a tool
|
|
*/
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: disable_checkpoint()");
|
|
|
|
mca_crs_self_component.can_checkpoint = false;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_self_enable_checkpoint(void)
|
|
{
|
|
/*
|
|
* This function should never be called by a tool
|
|
*/
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: enable_checkpoint()");
|
|
|
|
mca_crs_self_component.can_checkpoint = true;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_self_prelaunch(int32_t rank,
|
|
char *base_snapshot_dir,
|
|
char **app,
|
|
char **cwd,
|
|
char ***argv,
|
|
char ***env)
|
|
{
|
|
char * tmp_env_var = NULL;
|
|
|
|
/*
|
|
* This function should never be called by a tool
|
|
*/
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
|
opal_setenv(tmp_env_var,
|
|
"0", true, env);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_self_reg_thread(void)
|
|
{
|
|
/*
|
|
* This function should never be called by a tool
|
|
*/
|
|
if( opal_cr_is_tool ) {
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/******************
|
|
* Local functions
|
|
******************/
|
|
static int crs_self_find_function(char *prefix, char *suffix,
|
|
opal_crs_self_dlsym_dummy_fn_t *fn_ptr) {
|
|
char *func_to_find = NULL;
|
|
|
|
if( NULL == prefix || 0 >= strlen(prefix) ) {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: crs_self_find_function: Error: prefix is NULL or empty string!");
|
|
*fn_ptr = NULL;
|
|
return OPAL_ERROR;
|
|
}
|
|
if( NULL == suffix || 0 >= strlen(suffix) ) {
|
|
opal_output(mca_crs_self_component.super.output_handle,
|
|
"crs:self: crs_self_find_function: Error: suffix is NULL or empty string!");
|
|
*fn_ptr = NULL;
|
|
return OPAL_ERROR;
|
|
}
|
|
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: crs_self_find_function(--, %s, %s)",
|
|
prefix, suffix);
|
|
|
|
opal_asprintf(&func_to_find, "%s_%s", prefix, suffix);
|
|
|
|
/* The RTLD_DEFAULT is a special handle that searches the default libraries
|
|
* including the current application for the indicated symbol. This allows
|
|
* us to not have to dlopen/dlclose the executable. A bit of short hand
|
|
* really.
|
|
*/
|
|
*((void**) fn_ptr) = dlsym(RTLD_DEFAULT, func_to_find);
|
|
if( NULL == fn_ptr) {
|
|
opal_output_verbose(12, mca_crs_self_component.super.output_handle,
|
|
"crs:self: crs_self_find_function: WARNING: Function \"%s\" not found",
|
|
func_to_find);
|
|
}
|
|
else {
|
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
|
"crs:self: crs_self_find_function: Found function \"%s\"",
|
|
func_to_find);
|
|
}
|
|
|
|
if( NULL == func_to_find) {
|
|
free(func_to_find);
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Self is a special case. The 'fname' here is the command line that the user
|
|
* wishes to execute. This function takes this command line and adds
|
|
* -mca crs_self_do_restart 1
|
|
* Which will trigger the restart callback once the program has been run.
|
|
*
|
|
* For example, The user starts their program with:
|
|
* $ my_prog arg1 arg2
|
|
*
|
|
* They checkpoint it:
|
|
* $ opal_checkpoint -mca crs self 1234
|
|
*
|
|
* They restart it:
|
|
* $ opal_restart -mca crs self my_prog arg1 arg2
|
|
*
|
|
* fname is then:
|
|
* fname = "my_prog arg1 arg2"
|
|
*
|
|
* This function translates that to the command:
|
|
* cmd = "my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1"
|
|
*
|
|
* Which will cause the program "my_prog" to call their restart function
|
|
* upon opal_init time.
|
|
*
|
|
* Note: The user could bypass the opal_restart routine safely by simply calling
|
|
* $ my_prog arg1 arg2 -mca crs self -mca crs_self_do_restart 1
|
|
* However, for consistency sake, we should not encourage this as it won't work for
|
|
* all of the other checkpointers.
|
|
*/
|
|
/*
 * Build the command used to restart from 'snapshot'.
 *
 * The MCA parameters that make the restarted program select this component
 * and run its restart callback (crs=self, crs_self_do_restart=1,
 * crs_self_prefix=<prefix>) are exported through this process's environment;
 * the command itself is a copy of the snapshot's saved command line.
 *
 * snapshot - snapshot whose cmd_line is to be used
 * cmd      - out: newly allocated command string (caller frees); NULL on
 *            error
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR when the snapshot has no command line
 * or the copy could not be allocated.
 */
static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **cmd)
{
    const struct {
        const char *name;
        const char *value;
    } mca_env[] = {
        { "crs",                 "self" },
        { "crs_self_do_restart", "1" },
        { "crs_self_prefix",     mca_crs_self_component.prefix }
    };
    size_t i;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: restart_cmd(%s, ---)",
                        (NULL != snapshot->cmd_line) ? snapshot->cmd_line : "(null)");

    /* Instead of adding it to the command line, we should use the environment
     * to pass the values. This allows the OPAL application to be braindead
     * WRT MCA parameters
     */
    for (i = 0; i < sizeof(mca_env) / sizeof(mca_env[0]); ++i) {
        char *env_name = NULL;

        (void) mca_base_var_env_name(mca_env[i].name, &env_name);
        if (NULL == env_name) {
            /* Best effort: skip a parameter whose name could not be built. */
            continue;
        }
        opal_setenv(env_name, mca_env[i].value, true, &environ);
        free(env_name);
    }

    *cmd = NULL;
    if (NULL == snapshot->cmd_line) {
        /* Passing NULL to a "%s" conversion is undefined behavior. */
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: restart_cmd: Error: The snapshot has no restart command line.");
        return OPAL_ERROR;
    }

    opal_asprintf(cmd, "%s", snapshot->cmd_line);
    if (NULL == *cmd) {
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
|
|
|
|
/*
 * Reconstruct a snapshot from its metadata file ("cold start").
 *
 * Opens the metadata file if it is not already open, verifies that the
 * snapshot was produced by this component, and loads the saved restart
 * command line into snapshot->cmd_line.  On success the snapshot's
 * cold_start flag is cleared.
 *
 * Returns OPAL_SUCCESS, or an error code when the file cannot be opened,
 * the metadata cannot be parsed, or the snapshot belongs to a different
 * component.
 */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file.  The stream is
     * only read from below, so it is opened for reading: a stream opened
     * with "a" cannot be used for input.
     */
    if (NULL == snapshot->super.metadata) {
        snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r");
        if (NULL == snapshot->super.metadata) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: self_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }
    if (NULL == component_name) {
        /* Reported success but produced no name: nothing to compare. */
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: No component name found in the local snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* NOTE(review): component_name is presumably allocated by the callee and
     * could be freed after this copy -- confirm its ownership. */
    snapshot->super.component_name = strdup(component_name);

    /* Compare the strings to make sure this is our snapshot before going further */
    if (0 != strncmp(mca_crs_self_component.super.base_version.mca_component_name,
                     component_name, strlen(component_name))) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name,
                    mca_crs_self_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Restart command
     * JJH: Command lines limited to 256 chars.
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata,
                                      CRS_METADATA_CONTEXT, &tmp_argv);
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_asprintf(&snapshot->cmd_line, "%s", tmp_argv[0]);
    if (NULL == snapshot->cmd_line) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    return exit_status;
}
|
|
|
|
/*
 * Record the snapshot's restart command line in its metadata file.
 *
 * Writes one line made of the CRS_METADATA_CONTEXT token followed by
 * snapshot->cmd_line to snapshot->super.metadata.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR (after showing the
 * "self:no-restart-cmd" help message) when no command line is set.
 */
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot)
{
    if (NULL != snapshot->cmd_line) {
        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: update_snapshot_metadata(%s)",
                            snapshot->super.metadata_filename);

        /*
         * Append to the metadata file the command line to restart with
         *  - How user wants us to restart
         */
        fprintf(snapshot->super.metadata, "%s%s\n",
                CRS_METADATA_CONTEXT, snapshot->cmd_line);

        return OPAL_SUCCESS;
    }

    opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
                   true);
    return OPAL_ERROR;
}
|