cf377db823
Features: - Support for an override parameter file (openmpi-mca-param-override.conf). Variable values in this file can not be overridden by any file or environment value. - Support for boolean, unsigned, and unsigned long long variables. - Support for true/false values. - Support for enumerations on integer variables. - Support for MPIT scope, verbosity, and binding. - Support for command line source. - Support for setting variable source via the environment using OMPI_MCA_SOURCE_<var name>=source (either command or file:filename) - Cleaner API. - Support for variable groups (equivalent to MPIT categories). Notes: - Variables must be created with a backing store (char **, int *, or bool *) that must live at least as long as the variable. - Creating a variable with the MCA_BASE_VAR_FLAG_SETTABLE enables the use of mca_base_var_set_value() to change the value. - String values are duplicated when the variable is registered. It is up to the caller to free the original value if necessary. The new value will be freed by the mca_base_var system and must not be freed by the user. - Variables with constant scope may not be settable. - Variable groups (and all associated variables) are deregistered when the component is closed or the component repository item is freed. This prevents a segmentation fault from accessing a variable after its component is unloaded. - After some discussion we decided we should remove the automatic registration of component priority variables. Few component actually made use of this feature. - The enumerator interface was updated to be general enough to handle future uses of the interface. - The code to generate ompi_info output has been moved into the MCA variable system. See mca_base_var_dump(). opal: update core and components to mca_base_var system orte: update core and components to mca_base_var system ompi: update core and components to mca_base_var system This commit also modifies the rmaps framework. 
The following variables were moved from ppr and lama: rmaps_base_pernode, rmaps_base_n_pernode, rmaps_base_n_persocket. Both lama and ppr create synonyms for these variables. This commit was SVN r28236.
710 строки
25 KiB
C
710 строки
25 KiB
C
/*
|
|
* Copyright (c) 2010 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
|
|
* All rights reserved.
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include <sched.h>
|
|
#include <unistd.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <sys/types.h>
|
|
#include <sys/wait.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/syscall.h>
|
|
#include <fcntl.h>
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/constants.h"
|
|
|
|
#include "opal/mca/base/mca_base_var.h"
|
|
|
|
#include "opal/threads/mutex.h"
|
|
#include "opal/threads/condition.h"
|
|
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
|
|
#include "crs_dmtcp.h"
|
|
|
|
#define MTCP_RESTART_COMMAND "mtcp_restart"
|
|
|
|
/*
|
|
* DMTCP module
|
|
*/
|
|
static opal_crs_base_module_t dmtcp_module = {
|
|
/** Initialization Function */
|
|
opal_crs_dmtcp_module_init,
|
|
/** Finalization Function */
|
|
opal_crs_dmtcp_module_finalize,
|
|
|
|
/** Checkpoint interface */
|
|
opal_crs_dmtcp_checkpoint,
|
|
|
|
/** Restart Command Access */
|
|
opal_crs_dmtcp_restart,
|
|
|
|
/** Disable checkpoints */
|
|
opal_crs_dmtcp_disable_checkpoint,
|
|
/** Enable checkpoints */
|
|
opal_crs_dmtcp_enable_checkpoint,
|
|
|
|
/** Prelaunch */
|
|
opal_crs_dmtcp_prelaunch,
|
|
|
|
/** Register Thread */
|
|
opal_crs_dmtcp_reg_thread
|
|
};
|
|
|
|
/***************************
|
|
* Snapshot Class Functions
|
|
***************************/
|
|
OBJ_CLASS_DECLARATION(opal_crs_dmtcp_snapshot_t);
|
|
|
|
struct opal_crs_dmtcp_snapshot_t {
|
|
/** Base CRS snapshot type */
|
|
opal_crs_base_snapshot_t super;
|
|
char * context_filename;
|
|
};
|
|
typedef struct opal_crs_dmtcp_snapshot_t opal_crs_dmtcp_snapshot_t;
|
|
|
|
/* Constructor / destructor pair registered with the OPAL class system. */
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *snapshot);
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *snapshot);

OBJ_CLASS_INSTANCE(opal_crs_dmtcp_snapshot_t,   /* class being defined */
                   opal_crs_base_snapshot_t,    /* parent class */
                   opal_crs_dmtcp_construct,
                   opal_crs_dmtcp_destruct);
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
/* Restart helper: rebuilds a snapshot from its on-disk metadata file. */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot);

/* Checkpoint helper: derives the context file name and its full path. */
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot);

/* Callbacks registered through mtcp_set_callbacks() in module_init(). */
static void dmtcp_sleep_between_ckpt_callback(int interval);
static void dmtcp_pre_ckpt_callback(char **ckpt_filename);
static void dmtcp_post_ckpt_callback(int is_restarting,
                                     char *mtcp_restore_argv_start_addr);
static int dmtcp_should_ckpt_fd_callback(int fd);
|
|
|
|
/*************************
|
|
* Local Global Variables
|
|
*************************/
|
|
/* Guards the hand-off between checkpoint() and the MTCP callbacks. */
static pthread_mutex_t checkpoint_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Signalled by checkpoint() to request a checkpoint; the
 * sleep_between_ckpt callback waits on it. */
static pthread_cond_t checkpoint_cond = PTHREAD_COND_INITIALIZER;

/* Signalled by the sleep_between_ckpt callback; checkpoint() waits on it. */
static pthread_cond_t checkpoint_done_cond = PTHREAD_COND_INITIALIZER;

/* Full path of the checkpoint image: written by
 * dmtcp_generate_full_ckpt_path(), handed out by the pre_ckpt callback and
 * freed by checkpoint(). */
static char *full_ckpt_path = NULL;

/* Written by the post_ckpt callback, copied into *state by checkpoint(). */
static int post_ckpt_state;
|
|
|
|
/*
 * Class constructor: tags the snapshot with a heap copy of this component's
 * name and starts out with no context file name.
 */
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    const char *own_name =
        mca_crs_dmtcp_component.super.base_version.mca_component_name;

    snapshot->super.component_name = strdup(own_name);
    snapshot->context_filename = NULL;
}
|
|
|
|
/*
 * Class destructor: releases the context file name owned by the snapshot.
 */
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no guard is required. */
    free(snapshot->context_filename);
    snapshot->context_filename = NULL;
}
|
|
|
|
/*****************
|
|
* MCA Functions
|
|
*****************/
|
|
/*
 * Component query: reports the DMTCP module together with the priority
 * stored in the component structure.
 *
 * @param module   (OUT) receives the address of the static module table
 * @param priority (OUT) receives the component priority
 * @return OPAL_SUCCESS, unconditionally
 */
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: component_query()");

    *module = (mca_base_module_t *) &dmtcp_module;
    *priority = mca_crs_dmtcp_component.super.priority;

    return OPAL_SUCCESS;
}
|
|
|
|
int opal_crs_dmtcp_module_init(void)
|
|
{
|
|
char *temp_checkpoint_name;
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: module_init()");
|
|
|
|
/*
|
|
* JJH NOTE: Call any initialization routines you require
|
|
*/
|
|
mtcp_set_callbacks(dmtcp_sleep_between_ckpt_callback, /* sleep_between_ckpt */
|
|
dmtcp_pre_ckpt_callback, /* pre_ckpt */
|
|
dmtcp_post_ckpt_callback, /* post_ckpt */
|
|
dmtcp_should_ckpt_fd_callback, /* ckpt_fd */
|
|
NULL); /* write_ckpt_header */
|
|
|
|
/* This serves to simply initialize MTCP. The checkpoint file will
|
|
* actually be set by our pre_ckpt callback (which takes it from the
|
|
* snapshot given to the CRS checkpoint function), and the interval will be
|
|
* ignored, substituted for a synchronization signal that is handled by our
|
|
* sleep_between_ckpt callback.
|
|
*/
|
|
|
|
asprintf(&temp_checkpoint_name, "checkpoint.dmtcp.%ld", syscall(SYS_getpid));
|
|
mtcp_init(temp_checkpoint_name, 0, 1);
|
|
mtcp_ok();
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: leaving module_init()");
|
|
|
|
free(temp_checkpoint_name);
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_dmtcp_module_finalize(void)
|
|
{
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: module_finalize()");
|
|
|
|
/*
|
|
* JJH NOTE: Call any finalization routines you require
|
|
*/
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_dmtcp_prelaunch(int32_t rank,
|
|
char *base_snapshot_dir,
|
|
char **app,
|
|
char **cwd,
|
|
char ***argv,
|
|
char ***env)
|
|
{
|
|
char * tmp_env_var = NULL;
|
|
|
|
/*
|
|
* The below should be left untouched for now
|
|
*/
|
|
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
|
opal_setenv(tmp_env_var,
|
|
"0", true, env);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: leaving module_prelaunch()");
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_dmtcp_reg_thread(void)
|
|
{
|
|
/*
|
|
* JJH NOTE: If you require that all threads that may call into MTCP
|
|
* explicitly register with MTCP, then place the necessary
|
|
* initialization here.
|
|
*/
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: leaving module_reg_thread()");
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_dmtcp_checkpoint(pid_t pid,
|
|
opal_crs_base_snapshot_t *base_snapshot,
|
|
opal_crs_base_ckpt_options_t *options,
|
|
opal_crs_state_type_t *state)
|
|
{
|
|
int unlock_retval, exit_status = OPAL_SUCCESS;
|
|
char buf[BUFSIZ];
|
|
opal_crs_dmtcp_snapshot_t *snapshot;
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: about to lock mutex for checkpoint()");
|
|
|
|
pthread_mutex_lock(&checkpoint_mutex);
|
|
snapshot = (opal_crs_dmtcp_snapshot_t *) base_snapshot;
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: checkpoint(%d, ---)", pid);
|
|
|
|
/* Are we checkpointing ourselves or a peer.
|
|
* JJH NOTE: This will only ever be called when pid == getpid()
|
|
* This is an old interface argument, that is no longer used.
|
|
*/
|
|
|
|
/* bricka (2010-05-14): According to crs.h, 0 also indicates checkpointing
|
|
* self.
|
|
*/
|
|
if((pid != 0) && (pid != syscall(SYS_getpid)) ) {
|
|
/* MTCP can only checkpoint a single process: we can only checkpoint
|
|
* ourself. */
|
|
*state = OPAL_CRS_ERROR;
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* the metadata file should always be NULL at this point */
|
|
if ( NULL != snapshot->super.metadata) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: checkpoint(): Error: Metadata file already open");
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Update the snapshot metadata with the component name so opal-restart can
|
|
* pick the correct CRS to restart with.
|
|
*/
|
|
snapshot->super.component_name = strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);
|
|
|
|
if( NULL == snapshot->super.metadata ) {
|
|
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: checkpoint(): Error: Unable to open the file (%s)",
|
|
snapshot->super.metadata_filename);
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/* The filename of the checkpoint will be changed by our pre_ckpt hook
|
|
* based on the options given to this function. */
|
|
if(dmtcp_generate_full_ckpt_path(snapshot) == -1) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: unable to generate context filename.");
|
|
|
|
exit_status = OPAL_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* JJH NOTE: You can write however much or little data you want to the
|
|
* metadata file. The metadata file is stored with the local
|
|
* checkpoint, and provided at restart time to help the
|
|
* CRS component deteremine how to restart from any files
|
|
* that is left in this directory during checkpoint.
|
|
* Use the command below to write key/value strings to the
|
|
* metadata file.
|
|
* (Just as we did above with the component name).
|
|
*/
|
|
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name)) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: unable to print component name to metadata");
|
|
}
|
|
|
|
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename)) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: unable to print context name to metadata");
|
|
}
|
|
|
|
fclose(snapshot->super.metadata );
|
|
snapshot->super.metadata = NULL;
|
|
|
|
/*
|
|
* JJH NOTE: Setup and request a checkpoint of this process.
|
|
*/
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: will checkpoint to file: %s",
|
|
full_ckpt_path);
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: about to signal checkpoint");
|
|
|
|
/* Now that we have set the requested filename, we simply need to start
|
|
* the checkpoint. */
|
|
pthread_cond_signal(&checkpoint_cond);
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: signalled checkpoint");
|
|
|
|
/* We want to wait for the checkpoint to finish before we continue (in
|
|
* particular, we need the post_ckpt hook to happen so that we know the
|
|
* status of the checkpoint)
|
|
*/
|
|
pthread_cond_wait(&checkpoint_done_cond, &checkpoint_mutex);
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: received checkpoint_done signal");
|
|
|
|
/* We have now been checkpointed. Note that the state of the checkpoint
|
|
* (OPAL_CRS_CONTINUE, etc.) has been recorded by the post_ckpt hook.
|
|
*/
|
|
*state = post_ckpt_state;
|
|
exit_status = OPAL_SUCCESS;
|
|
|
|
free(full_ckpt_path);
|
|
|
|
cleanup:
|
|
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
|
|
|
|
if( 0 != unlock_retval ) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: dmtcp_checkpoint: unable to unlock mutex at end of checkpoint: %s",
|
|
strerror_r(unlock_retval, buf, BUFSIZ));
|
|
|
|
exit_status = OPAL_ERROR;
|
|
}
|
|
|
|
if( NULL != snapshot->super.metadata ) {
|
|
fclose(snapshot->super.metadata );
|
|
snapshot->super.metadata = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
/*
 * Restart from a snapshot by exec'ing the MTCP restart command on the
 * snapshot's context file.
 *
 * @param base_snapshot snapshot to restart from; copied into a local DMTCP
 *                      snapshot object
 * @param spawn_child   when true, fork first: the parent returns and the
 *                      child performs the exec
 * @param child_pid     (OUT) pid of the forked child, written in the parent
 *                      when spawn_child is true and the pointer is non-NULL
 * @return OPAL_SUCCESS in the parent after a successful fork. OPAL_ERROR on
 *         any failure. Does not return when exec succeeds.
 *
 * NOTE(review): the snapshot allocated here is never released. Its 'super'
 * part is a shallow copy of *base_snapshot, so releasing it would need the
 * ownership of the shared pointers sorted out first.
 */
int opal_crs_dmtcp_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    int ret, exit_status = OPAL_SUCCESS;
    int exec_status;

    opal_crs_dmtcp_snapshot_t *snapshot = OBJ_NEW(opal_crs_dmtcp_snapshot_t);
    if (NULL == snapshot) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: Unable to allocate a snapshot object.");
        return OPAL_ERROR;
    }
    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: restart(--, %d)", spawn_child);

    /*
     * JJH NOTE: 'cold_start' indicates that this process is being restarted from
     *           opal-restart instead of from within an already running process.
     *           In the current code base, this is always set to true since it
     *           does not allow a process to request a restart of itself.
     */
    if (snapshot->super.cold_start) {
        /*
         * Read the metadata left by the checkpoint() of this process
         */
        if (OPAL_SUCCESS != (ret = dmtcp_cold_start(snapshot))) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to reconstruct the snapshot.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    /* Without a context file there is nothing to hand to the restart
     * command, and a NULL must not be passed to "%s" or used as an argv
     * entry. */
    if (NULL == snapshot->context_filename) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: No context file is known for this snapshot.");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* JJH NOTE: Nearly all of the time the 'spawn_child' argument is set to
     *           'false' indicating that the restart function is expected to
     *           call exec() directly. It is only set to 'true' if the user
     *           explicitly tells opal-restart to spawn off the child, which
     *           rarely/never happens. So I would not worry about that option.
     */
    if (spawn_child) {
        /* Use a distinct local: a local named child_pid would shadow the
         * out-parameter and the caller would never learn the pid. */
        pid_t forked_pid = fork();

        if (forked_pid > 0) {
            if (NULL != child_pid) {
                *child_pid = forked_pid;
            }
            goto cleanup;
        } else if (forked_pid < 0) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to spawn child.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    /*
     * JJH NOTE: Restart the process by replacing this process
     */
    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: About to invoke command: %s with argv: %s %s",
                        MTCP_RESTART_COMMAND,
                        MTCP_RESTART_COMMAND,
                        snapshot->context_filename);

    exec_status = execlp(MTCP_RESTART_COMMAND, MTCP_RESTART_COMMAND, snapshot->context_filename, NULL);

    /* If we get down here, something has broken. */
    if (exec_status < 0) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: error in replacing process: %s",
                    strerror(errno));
    } else {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: exec() returned!");
    }

    exit_status = OPAL_ERROR;

 cleanup:
    return exit_status;
}
|
|
|
|
int opal_crs_dmtcp_disable_checkpoint(void)
|
|
{
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: disable_checkpoint()");
|
|
|
|
/*
|
|
* JJH NOTE: Enter a critical section. This is not really used in the code
|
|
* at the moment.
|
|
*/
|
|
mtcp_no();
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_dmtcp_enable_checkpoint(void)
|
|
{
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: enable_checkpoint()");
|
|
/*
|
|
* JJH NOTE: Leave a critical section. This is not really used in the code
|
|
* at the moment.
|
|
*/
|
|
mtcp_ok();
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*****************************
|
|
* Local Function Definitions
|
|
*****************************/
|
|
/*
 * Rebuild a snapshot from the metadata file written at checkpoint time.
 *
 * Opens the metadata file if it is not already open, reads the component
 * name and previous pid, verifies that the snapshot was written by this
 * component, and stores "<snapshot_directory>/<context token>" in
 * snapshot->context_filename. The metadata stream is closed again before
 * returning, on success and on failure.
 *
 * @param snapshot snapshot whose metadata_filename and snapshot_directory
 *                 are already set
 * @return OPAL_SUCCESS, the error code reported by the metadata extraction,
 *         or OPAL_ERROR
 */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    /*
     * Find the snapshot directory, read the metadata file for
     * component name and previous pid
     */
    if (NULL == snapshot->super.metadata) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r"))) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_cold_start(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    if (OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid))) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* strdup()/strcmp() on a NULL name would be undefined behaviour. */
    if (NULL == component_name) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: No component name found in the metadata of the local snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* NOTE(review): component_name is presumably allocated by the extraction
     * helper and owned by us; it is not freed here because that ownership
     * cannot be confirmed from this file. */
    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Unable to duplicate the component name (%s).",
                    component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Compare the component strings to make sure this is our snapshot before going further.
     * JJH NOTE: This will nearly always be true since opal-restart also checks this metadata.
     *
     * A full-string comparison is required: bounding the comparison by the
     * length of the metadata name would accept any prefix of our name,
     * including an empty string.
     */
    if (0 != strcmp(mca_crs_dmtcp_component.super.base_version.mca_component_name,
                    component_name)) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_dmtcp_component.super.base_version.mca_component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Read context information from the metadata file
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (0 > asprintf(&(snapshot->context_filename), "%s/%s", snapshot->super.snapshot_directory, tmp_argv[0])) {
        /* The pointer is undefined after a failed asprintf(); reset it so the
         * destructor does not free garbage. */
        snapshot->context_filename = NULL;
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Unable to build the context filename in %s",
                    snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: cold_start(%s)", snapshot->context_filename);

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    if (NULL != snapshot->super.metadata) {
        fclose(snapshot->super.metadata);
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
|
|
|
|
/**
|
|
* Given a snapshot, generate the context filename and its full path.
|
|
*
|
|
* @param snapshot the snapshot with request information
|
|
*/
|
|
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot)
|
|
{
|
|
int retval;
|
|
retval = asprintf(&(snapshot->context_filename), "ompi_dmtcp_context.%ld", syscall(SYS_getpid));
|
|
if(retval == -1)
|
|
return -1;
|
|
|
|
return asprintf(&full_ckpt_path, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename);
|
|
}
|
|
|
|
/**
|
|
* This is a callback function to call the actual checkpointing routine.
|
|
* Instead of waiting for a specific interval as MTCP does, we will wait on a
|
|
* synchronization signal that will allow us to checkpoint on demand. The
|
|
* argument to this function will be ignored.
|
|
*/
|
|
static void dmtcp_sleep_between_ckpt_callback(int interval)
|
|
{
|
|
int signal_retval;
|
|
char buf[BUFSIZ];
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: called sleep_between_ckpt callback");
|
|
|
|
pthread_mutex_lock(&checkpoint_mutex);
|
|
|
|
/* If the MPI checkpoint thread is waiting on the checkpoint_done_cond and
|
|
* this thread is here, it means that a checkpoint has just completed.
|
|
* Let's signal the MPI checkpoint thread to resume. */
|
|
signal_retval = pthread_cond_signal(&checkpoint_done_cond);
|
|
|
|
if( 0 != signal_retval) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: post_ckpt_callback(): Unable to signal checkpoint done: %s",
|
|
strerror_r(signal_retval, buf, BUFSIZ));
|
|
}
|
|
|
|
/* now we simply wait for the signal to checkpoint */
|
|
pthread_cond_wait(&checkpoint_cond, &checkpoint_mutex);
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: received sync signal to checkpoint.");
|
|
|
|
/* We have now been instructed to checkpoint, so we return. Note that the
|
|
* mutex is still locked: the post_ckpt callback will unlock it. */
|
|
}
|
|
|
|
/**
|
|
* This is a callback function that is invoked before the checkpoint actually
|
|
* occurs. It enables us to do any logging that is necessary, as well as change
|
|
* the filename that the checkpoint will be written to. We expect that this
|
|
* filename will be pulled from the checkpoint options.
|
|
*
|
|
* @param ckpt_filename a pointer in which to store the desired checkpoint
|
|
* filename
|
|
*/
|
|
static void dmtcp_pre_ckpt_callback(char **ckpt_filename)
|
|
{
|
|
*ckpt_filename = full_ckpt_path;
|
|
}
|
|
|
|
/**
|
|
* This is a callback function that is invoked after the checkpoint has
|
|
* finished. It enables us to do any logging that is necessary, as well as
|
|
* report whether this is called from a restart or a checkpoint. We will report
|
|
* this status, signal the CRS code to continue running, and then release the
|
|
* mutex that we are holding.
|
|
*
|
|
* @param is_restarting whether or not this is being called as part of a restart
|
|
* @param mtcp_restore_argv_start_addr unused
|
|
*/
|
|
static void dmtcp_post_ckpt_callback(int is_restarting, char *mtcp_restore_argv_start_addr)
|
|
{
|
|
int unlock_retval;
|
|
char buf[BUFSIZ];
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: in post_ckpt_callback, restarting: %d", is_restarting);
|
|
if(is_restarting)
|
|
post_ckpt_state = OPAL_CRS_RESTART;
|
|
else
|
|
post_ckpt_state = OPAL_CRS_CONTINUE;
|
|
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: unlocking at end of post_ckpt_callback");
|
|
|
|
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
|
|
|
|
if( 0 != unlock_retval) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: post_ckpt_callback(): Unable to unlock mutex: %s",
|
|
strerror_r(unlock_retval, buf, BUFSIZ));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This is a callback function that is invoked by DMTCP to see if it should
|
|
* checkpoint the given file descriptor.
|
|
*
|
|
* If the file descriptor is a socket, named-pipe or pseudo-terminal, DMTCP
|
|
* should skip checkpointing them.
|
|
*
|
|
* If we can't determine the type of fd (stat and/or readlink failed), we ask
|
|
* DMTCP to try to checkpoint them anyways with the assumption that DMTCP would
|
|
* warn users of any such case.
|
|
*
|
|
* @param fd file descriptor to checkpoint
|
|
* @return: 1 if DMTCP should ckpt the file descriptor, 0 otherwise.
|
|
*/
|
|
static int dmtcp_should_ckpt_fd_callback(int fd)
|
|
{
|
|
struct stat stat_buf;
|
|
char device_name[PATH_MAX];
|
|
char proc_filename[64];
|
|
char buf[BUFSIZ];
|
|
|
|
if (fstat(fd, &stat_buf) != 0) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: should_ckpt_fd_callback(): error stat()'ing %d: %s",
|
|
fd, strerror_r(errno, buf, BUFSIZ));
|
|
return 1;
|
|
/* Don't checkpoint sockets and FIFOs */
|
|
} else if (S_ISSOCK(stat_buf.st_mode) || S_ISFIFO(stat_buf.st_mode)) {
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: skipping checkpointing socket/fifo: %d",
|
|
fd);
|
|
return 0;
|
|
}
|
|
|
|
memset(device_name, 0, sizeof device_name);
|
|
sprintf(proc_filename, "/proc/self/fd/%d", fd);
|
|
if (readlink(proc_filename, device_name, sizeof(device_name) - 1) <= 0) {
|
|
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: should_ckpt_fd_callback(): readlink(%d) failed: %s",
|
|
fd, strerror_r(errno, buf, BUFSIZ));
|
|
return 1;
|
|
}
|
|
|
|
/* Don't checkpoint ptys */
|
|
if (strstr(device_name, "/dev/pts/") == 0 ||
|
|
strstr(device_name, "/dev/pty") == 0 ||
|
|
strstr(device_name, "/dev/tty") == 0) {
|
|
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
"crs:dmtcp: skipping checkpointing %s",
|
|
device_name);
|
|
return 0;
|
|
}
|
|
|
|
/* Checkpoint fd by default */
|
|
return 1;
|
|
}
|