1
1
http://www.open-mpi.org/community/lists/devel/2011/10/9784.php

Bring support for a DMTCP CRS module into the trunk.  See
http://dmtcp.sourceforge.net/ for a description of DMTCP.  Thanks to
the contribution from Alex Brick at Northeastern University, and all
the others up there who helped shepherd this into being ready to
submit.

This commit was SVN r26176.
Этот коммит содержится в:
Jeff Squyres 2012-03-22 12:01:46 +00:00
родитель d30bbc2ef9
Коммит 3bf038bb1c
10 изменённых файлов: 1142 добавлений и 0 удалений

Просмотреть файл

@ -17,6 +17,7 @@ bbenton Brad Benton IBM
bosilca George Bosilca UTK
bouteill Aurelien Bouteiller UTK
brbarret Brian Barrett IU, LANL, SNL
bricka Alex Brick NU
casswell Laura Casswell LANL
coti Camille Coti UTK, INRIA
csbell Christian Bell QLogic
@ -99,6 +100,7 @@ LANL = Los Alamos National Laboratory
LBNL = Lawrence Berkeley National Laboratory
Mellanox = Mellanox
Myricom = Myricom, Inc.
NU = Northeastern University
Oracle = Oracle
ORNL = Oak Ridge National Laboratory
OSU = The Ohio State University

Просмотреть файл

@ -41,6 +41,7 @@ Copyright (c) 2008-2010 Oak Ridge National Labs. All rights reserved.
Copyright (c) 2006-2010 Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009 Bull SAS. All rights reserved.
Copyright (c) 2010 ARM ltd. All rights reserved.
Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>. All rights reserved.
$COPYRIGHT$

1
NEWS
Просмотреть файл

@ -70,6 +70,7 @@ Trunk (not on release branches yet)
- Added a new MCA parameter (ess_base_stream_buffering) that allows the user
to override the system default for buffering of stdout/stderr streams
(via setvbuf).
- Added support for the DMTCP checkpoint/restart system.
1.5.5

15
README
Просмотреть файл

@ -863,6 +863,21 @@ MISCELLANEOUS SUPPORT LIBRARIES
covers most cases. This option is only needed for special
configurations.
--with-dmtcp=<directory>
Specify the directory where the Distributed MultiThreaded
Checkpointing (DMTCP) libraries and header files are located. This
option is generally only necessary if the DMTCP headers and
libraries are not in default compiler/linker search paths.
This option is only meaningful if the --with-ft option is also used
to active Open MPI's fault tolerance behavior.
--with-dmtcp-libdir=<directory>
Look in directory for the DMTCP libraries. By default, Open MPI
will look in <dmtcp directory>/lib and <dmtcp directory>/lib64,
which covers most cases. This option is only needed for special
configurations.
--with-esmtp=<directory>
Specify the directory where the libESMTP libraries and header files are
located. This option is generally only necessary of the libESMTP

42
opal/mca/crs/dmtcp/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
#
# Copyright (c) 2010 The Trustees of Indiana University.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
CFLAGS = $(crs_dmtcp_CFLAGS)
AM_CPPFLAGS = $(crs_dmtcp_CPPFLAGS)
dist_pkgdata_DATA = help-opal-crs-dmtcp.txt
sources = \
crs_dmtcp.h \
crs_dmtcp_component.c \
crs_dmtcp_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_opal_crs_dmtcp_DSO
component_noinst =
component_install = mca_crs_dmtcp.la
else
component_noinst = libmca_crs_dmtcp.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_dmtcp_la_SOURCES = $(sources)
mca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
mca_crs_dmtcp_la_LIBADD = $(crs_dmtcp_LIBS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_dmtcp_la_SOURCES = $(sources)
libmca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
libmca_crs_dmtcp_la_LIBADD = $(crs_dmtcp_LIBS)

151
opal/mca/crs/dmtcp/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,151 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_opal_crs_dmtcp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_opal_crs_dmtcp_CONFIG],[
AC_CONFIG_FILES([opal/mca/crs/dmtcp/Makefile])
OPAL_VAR_SCOPE_PUSH([opal_check_crs_dmtcp_good opal_opal_check_crs_dmtcp_save_CPPFLAGS opal_opal_check_crs_dmtcp_save_LDFLAGS opal_opal_check_crs_dmtcp_save_LIBS opal_check_crs_dmtcp_dir_msg opal_check_crs_dmtcp_libdir_msg opal_check_crs_dmtcp_dir opal_check_crs_dmtcp_libdir])
opal_check_crs_dmtcp_good="no"
# Configure option to specify where to look for DMTCP headers
# --with-dmtcp(=DIR)
AC_ARG_WITH([dmtcp],
[AC_HELP_STRING([--with-dmtcp(=DIR)],
[Path to DMTCP Installation])])
OMPI_CHECK_WITHDIR([dmtcp], [$with_dmtcp], [include/mtcp.h])
# Configure option to specify where to look for DMTCP libraries
# (Default: $with_dmtcp/lib)
# --with-dmtcp-libdir=DIR
AC_ARG_WITH([dmtcp-libdir],
[AC_HELP_STRING([--with-dmtcp-libdir=DIR],
[Search for DMTCP libraries in DIR])])
OMPI_CHECK_WITHDIR([dmtcp-libdir], [$with_dmtcp_libdir], [libmtcp.so])
#
# Check if Open MPI was compiled with Checkpoint/Restart support
# If not, then we do not compile this component
#
AS_IF([test "$opal_want_ft" = "0"],
[opal_check_crs_dmtcp_good="no"],
[opal_check_crs_dmtcp_good="yes"])
#
# Check if the user explicitly requested -not- to build the DMTCP component
# If so, the we do not compile this component
#
AS_IF([test "$with_dmtcp" = "no" -o "$opal_check_crs_dmtcp_good" = "no"],
[opal_check_crs_dmtcp_good="no"],
[opal_check_crs_dmtcp_good="yes"])
# Save some flags
opal_opal_check_crs_dmtcp_save_CPPFLAGS=$CPPFLAGS
opal_opal_check_crs_dmtcp_save_LDFLAGS=$LDFLAGS
opal_opal_check_crs_dmtcp_save_LIBS=$LIBS
#
# Now to check if the library is usable
#
opal_check_crs_dmtcp_dir_msg="compiler default"
opal_check_crs_dmtcp_libdir_msg="linker default"
opal_check_crs_dmtcp_dir=""
opal_check_crs_dmtcp_libdir=""
# Determine the search paths for the headers and libraries
AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
[AS_IF([test ! -z "$with_dmtcp" -a "$with_dmtcp" != "yes"],
[opal_check_crs_dmtcp_dir="$with_dmtcp"
opal_check_crs_dmtcp_dir_msg="$with_dmtcp (from --with-dmtcp)"])
AS_IF([test ! -z "$with_dmtcp_libdir" -a "$with_dmtcp_libdir" != "yes"],
[opal_check_crs_dmtcp_libdir="$with_dmtcp_libdir"
opal_check_crs_dmtcp_libdir_msg="$with_dmtcp_libdir (from --with-dmtcp-libdir)"])
])
# Look for DMTCP.
AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
[AC_MSG_CHECKING([for DMTCP dir])
AC_MSG_RESULT([$opal_check_crs_dmtcp_dir_msg])
AC_MSG_CHECKING([for DMTCP library dir])
AC_MSG_RESULT([$opal_check_crs_dmtcp_libdir_msg])
OMPI_CHECK_PACKAGE([crs_dmtcp_check],
[mtcp.h],
[mtcp],
[mtcp_init],
[],
[$opal_check_crs_dmtcp_dir],
[$opal_check_crs_dmtcp_libdir],
[opal_check_crs_dmtcp_good="yes"],
[opal_check_crs_dmtcp_good="no"])
])
# When we restart a thread, we use execlp() to exec the "mtcp_restart"
# command. We don't care what its path is, but it does need to exist in
# the PATH.
AC_CHECK_PROG([mtcp_restart_command_exists], ["mtcp_restart"], ["yes"], ["no"])
AS_IF([test "$mtcp_restart_command_exists" = "no"],
[opal_check_crs_dmtcp_good="no"
AS_IF([test ! -z "$with_dmtcp" -a "$with_dmtcp" != "no"],
[AC_MSG_WARN([mtcp_restart not found in PATH.])
AC_MSG_ERROR([Aborting.])])])
#
# Now setup the additions to the wrapper compiler. If '-lmtcp' or
# '-I' or '-L' was needed to link to MTCP, then OMPI_CHECK_PACKAGE
# sets the crs_mtcp_check_* variables, which we use below.
#
crs_dmtcp_CFLAGS="$CFLAGS $crs_dmtcp_check_CFLAGS"
crs_dmtcp_CPPFLAGS="$CPPFLAGS $crs_dmtcp_check_CPPFLAGS"
crs_dmtcp_LDFLAGS="$LDFLAGS $crs_dmtcp_check_LDFLAGS"
crs_dmtcp_LIBS="$crs_dmtcp_check_LIBS $LIBS"
#
# If DMTCP is working at this point, then add any options necessary to
# the wrapper compiler.
#
AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
[#
# Setup wrapper options for static builds
#
crs_dmtcp_WRAPPER_EXTRA_CPPFLAGS="$crs_dmtcp_CPPFLAGS"
crs_dmtcp_WRAPPER_EXTRA_LDFLAGS="$crs_dmtcp_LDFLAGS"
crs_dmtcp_WRAPPER_EXTRA_LIBS="$crs_dmtcp_LIBS"
$1])
CPPFLAGS=$opal_opal_check_crs_dmtcp_save_CPPFLAGS
LDFLAGS="$crs_dmtcp_check_LDFLAGS $opal_opal_check_crs_dmtcp_save_LDFLAGS"
LIBS="$crs_dmtcp_LIBS $opal_opal_check_crs_dmtcp_save_LIBS"
AC_SUBST([crs_dmtcp_WRAPPER_EXTRA_LDFLAGS])
AC_SUBST([crs_dmtcp_WRAPPER_EXTRA_LIBS])
AC_SUBST([crs_dmtcp_WRAPPER_EXTRA_CPPFLAGS])
AC_SUBST([crs_dmtcp_CFLAGS])
AC_SUBST([crs_dmtcp_CPPFLAGS])
AC_SUBST([crs_dmtcp_LDFLAGS])
AC_SUBST([crs_dmtcp_LIBS])
# If all is good at this point then post any compiler options to
# the build environment. If all is not good at this point and
# DMTCP was explicitly requested, then error out.
AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
[$1],
[AS_IF([test ! -z "$with_dmtcp" -a "$with_dmtcp" != "no"],
[AC_MSG_WARN([DMTCP support requested but not found. Perhaps you need to specify the location of the DMTCP libraries.])
AC_MSG_ERROR([Aborting.])])
$2])
OPAL_VAR_SCOPE_POP
])dnl

87
opal/mca/crs/dmtcp/crs_dmtcp.h Обычный файл
Просмотреть файл

@ -0,0 +1,87 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* DMTCP CRS component
*
*/
#ifndef MCA_CRS_DMTCP_EXPORT_H
#define MCA_CRS_DMTCP_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/base/base.h"
/* JJH NOTE: Include your library header here */
/* #include <libmtcp.h> */
#include <mtcp.h>
BEGIN_C_DECLS
/*
* Local Component Structure
*/
struct opal_crs_dmtcp_component_t {
/** Base CRS component */
opal_crs_base_component_t super;
/** JJH: Add additional items here as needed internally */
};
typedef struct opal_crs_dmtcp_component_t opal_crs_dmtcp_component_t;
OPAL_MODULE_DECLSPEC extern opal_crs_dmtcp_component_t mca_crs_dmtcp_component;
/*
* Component query command
* - Called during opal_init() to determine if this component should be selected.
*/
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions
*/
int opal_crs_dmtcp_module_init(void);
int opal_crs_dmtcp_module_finalize(void);
/*
* Actual CRS funcationality
*/
int opal_crs_dmtcp_checkpoint( pid_t pid,
opal_crs_base_snapshot_t *snapshot,
opal_crs_base_ckpt_options_t *options,
opal_crs_state_type_t *state);
int opal_crs_dmtcp_restart( opal_crs_base_snapshot_t *snapshot,
bool spawn_child,
pid_t *child_pid);
int opal_crs_dmtcp_disable_checkpoint(void);
int opal_crs_dmtcp_enable_checkpoint(void);
int opal_crs_dmtcp_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
int opal_crs_dmtcp_reg_thread(void);
END_C_DECLS
#endif /* MCA_CRS_DMTCP_EXPORT_H */

121
opal/mca/crs/dmtcp/crs_dmtcp_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,121 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/util/output.h"
#include "opal/constants.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_dmtcp.h"
/*
* Local functionality
*/
static int crs_dmtcp_open(void);
static int crs_dmtcp_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
opal_crs_dmtcp_component_t mca_crs_dmtcp_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
OPAL_CRS_BASE_VERSION_2_0_0,
/* Component name and version */
"dmtcp",
OPAL_MAJOR_VERSION,
OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION,
/* Component open and close functions */
crs_dmtcp_open,
crs_dmtcp_close,
opal_crs_dmtcp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
20
}
};
static int crs_dmtcp_open(void)
{
/*
* User can adjust the relative priority of this component with respect
* to other CRS components available for selection.
*/
mca_base_param_reg_int(&mca_crs_dmtcp_component.super.base_version,
"priority",
"Priority of the CRS dmtcp component",
false, false,
mca_crs_dmtcp_component.super.priority,
&mca_crs_dmtcp_component.super.priority);
/*
* Adjust the verbosity level for this component. Default off or 0.
*/
mca_base_param_reg_int(&mca_crs_dmtcp_component.super.base_version,
"verbose",
"Verbose level for the CRS dmtcp component",
false, false,
mca_crs_dmtcp_component.super.verbose,
&mca_crs_dmtcp_component.super.verbose);
/* If there is a custom verbose level for this component than use it
* otherwise take our parents level and output channel
*/
if ( 0 != mca_crs_dmtcp_component.super.verbose) {
mca_crs_dmtcp_component.super.output_handle = opal_output_open(NULL);
opal_output_set_verbosity(mca_crs_dmtcp_component.super.output_handle,
mca_crs_dmtcp_component.super.verbose);
} else {
mca_crs_dmtcp_component.super.output_handle = opal_crs_base_output;
}
/*
* Debug output
*/
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: open()");
opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: open: priority = %d",
mca_crs_dmtcp_component.super.priority);
opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: open: verbosity = %d",
mca_crs_dmtcp_component.super.verbose);
return OPAL_SUCCESS;
}
static int crs_dmtcp_close(void)
{
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: close()");
return OPAL_SUCCESS;
}

709
opal/mca/crs/dmtcp/crs_dmtcp_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,709 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/mca/event/event.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_dmtcp.h"
#define MTCP_RESTART_COMMAND "mtcp_restart"
/*
* DMTCP module
*/
static opal_crs_base_module_t dmtcp_module = {
/** Initialization Function */
opal_crs_dmtcp_module_init,
/** Finalization Function */
opal_crs_dmtcp_module_finalize,
/** Checkpoint interface */
opal_crs_dmtcp_checkpoint,
/** Restart Command Access */
opal_crs_dmtcp_restart,
/** Disable checkpoints */
opal_crs_dmtcp_disable_checkpoint,
/** Enable checkpoints */
opal_crs_dmtcp_enable_checkpoint,
/** Prelaunch */
opal_crs_dmtcp_prelaunch,
/** Register Thread */
opal_crs_dmtcp_reg_thread
};
/***************************
* Snapshot Class Functions
***************************/
OBJ_CLASS_DECLARATION(opal_crs_dmtcp_snapshot_t);
struct opal_crs_dmtcp_snapshot_t {
/** Base CRS snapshot type */
opal_crs_base_snapshot_t super;
char * context_filename;
};
typedef struct opal_crs_dmtcp_snapshot_t opal_crs_dmtcp_snapshot_t;
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *obj);
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *obj);
OBJ_CLASS_INSTANCE(opal_crs_dmtcp_snapshot_t,
opal_crs_base_snapshot_t,
opal_crs_dmtcp_construct,
opal_crs_dmtcp_destruct);
/******************
* Local Functions
******************/
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot);
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot);
static void dmtcp_sleep_between_ckpt_callback(int interval);
static void dmtcp_pre_ckpt_callback(char **ckpt_filename);
static void dmtcp_post_ckpt_callback(int is_restarting,
char *mtcp_restore_argv_start_addr);
static int dmtcp_should_ckpt_fd_callback(int fd);
/*************************
* Local Global Variables
*************************/
static char *full_ckpt_path = NULL;
static pthread_cond_t checkpoint_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t checkpoint_done_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t checkpoint_mutex = PTHREAD_MUTEX_INITIALIZER;
static int post_ckpt_state;
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *snapshot) {
snapshot->context_filename = NULL;
snapshot->super.component_name =
strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);
}
void opal_crs_dmtcp_destruct( opal_crs_dmtcp_snapshot_t *snapshot) {
if(NULL != snapshot->context_filename) {
free(snapshot->context_filename);
snapshot->context_filename = NULL;
}
}
/*****************
* MCA Functions
*****************/
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority)
{
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: component_query()");
*priority = mca_crs_dmtcp_component.super.priority;
*module = (mca_base_module_t *)&dmtcp_module;
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_module_init(void)
{
char *temp_checkpoint_name;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: module_init()");
/*
* JJH NOTE: Call any initialization routines you require
*/
mtcp_set_callbacks(dmtcp_sleep_between_ckpt_callback, /* sleep_between_ckpt */
dmtcp_pre_ckpt_callback, /* pre_ckpt */
dmtcp_post_ckpt_callback, /* post_ckpt */
dmtcp_should_ckpt_fd_callback, /* ckpt_fd */
NULL); /* write_ckpt_header */
/* This serves to simply initialize MTCP. The checkpoint file will
* actually be set by our pre_ckpt callback (which takes it from the
* snapshot given to the CRS checkpoint function), and the interval will be
* ignored, substituted for a synchronization signal that is handled by our
* sleep_between_ckpt callback.
*/
asprintf(&temp_checkpoint_name, "checkpoint.dmtcp.%ld", syscall(SYS_getpid));
mtcp_init(temp_checkpoint_name, 0, 1);
mtcp_ok();
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: leaving module_init()");
free(temp_checkpoint_name);
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_module_finalize(void)
{
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: module_finalize()");
/*
* JJH NOTE: Call any finalization routines you require
*/
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env)
{
char * tmp_env_var = NULL;
/*
* The below should be left untouched for now
*/
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"0", true, env);
free(tmp_env_var);
tmp_env_var = NULL;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: leaving module_prelaunch()");
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_reg_thread(void)
{
/*
* JJH NOTE: If you require that all threads that may call into MTCP
* explicitly register with MTCP, then place the necessary
* initialization here.
*/
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: leaving module_reg_thread()");
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_checkpoint(pid_t pid,
opal_crs_base_snapshot_t *base_snapshot,
opal_crs_base_ckpt_options_t *options,
opal_crs_state_type_t *state)
{
int unlock_retval, exit_status = OPAL_SUCCESS;
char buf[BUFSIZ];
opal_crs_dmtcp_snapshot_t *snapshot;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: about to lock mutex for checkpoint()");
pthread_mutex_lock(&checkpoint_mutex);
snapshot = (opal_crs_dmtcp_snapshot_t *) base_snapshot;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: checkpoint(%d, ---)", pid);
/* Are we checkpointing ourselves or a peer.
* JJH NOTE: This will only ever be called when pid == getpid()
* This is an old interface argument, that is no longer used.
*/
/* bricka (2010-05-14): According to crs.h, 0 also indicates checkpointing
* self.
*/
if((pid != 0) && (pid != syscall(SYS_getpid)) ) {
/* MTCP can only checkpoint a single process: we can only checkpoint
* ourself. */
*state = OPAL_CRS_ERROR;
exit_status = OPAL_ERROR;
goto cleanup;
}
/* the metadata file should always be NULL at this point */
if ( NULL != snapshot->super.metadata) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: checkpoint(): Error: Metadata file already open");
exit_status = OPAL_ERROR;
goto cleanup;
}
/*
* Update the snapshot metadata with the component name so opal-restart can
* pick the correct CRS to restart with.
*/
snapshot->super.component_name = strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);
if( NULL == snapshot->super.metadata ) {
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: checkpoint(): Error: Unable to open the file (%s)",
snapshot->super.metadata_filename);
exit_status = OPAL_ERROR;
goto cleanup;
}
}
/* The filename of the checkpoint will be changed by our pre_ckpt hook
* based on the options given to this function. */
if(dmtcp_generate_full_ckpt_path(snapshot) == -1) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: unable to generate context filename.");
exit_status = OPAL_ERROR;
goto cleanup;
}
/*
* JJH NOTE: You can write however much or little data you want to the
* metadata file. The metadata file is stored with the local
* checkpoint, and provided at restart time to help the
* CRS component deteremine how to restart from any files
* that is left in this directory during checkpoint.
* Use the command below to write key/value strings to the
* metadata file.
* (Just as we did above with the component name).
*/
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name)) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: unable to print component name to metadata");
}
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename)) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: unable to print context name to metadata");
}
fclose(snapshot->super.metadata );
snapshot->super.metadata = NULL;
/*
* JJH NOTE: Setup and request a checkpoint of this process.
*/
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: will checkpoint to file: %s",
full_ckpt_path);
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: about to signal checkpoint");
/* Now that we have set the requested filename, we simply need to start
* the checkpoint. */
pthread_cond_signal(&checkpoint_cond);
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: signalled checkpoint");
/* We want to wait for the checkpoint to finish before we continue (in
* particular, we need the post_ckpt hook to happen so that we know the
* status of the checkpoint)
*/
pthread_cond_wait(&checkpoint_done_cond, &checkpoint_mutex);
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: received checkpoint_done signal");
/* We have now been checkpointed. Note that the state of the checkpoint
* (OPAL_CRS_CONTINUE, etc.) has been recorded by the post_ckpt hook.
*/
*state = post_ckpt_state;
exit_status = OPAL_SUCCESS;
free(full_ckpt_path);
cleanup:
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
if( 0 != unlock_retval ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_checkpoint: unable to unlock mutex at end of checkpoint: %s",
strerror_r(unlock_retval, buf, BUFSIZ));
exit_status = OPAL_ERROR;
}
if( NULL != snapshot->super.metadata ) {
fclose(snapshot->super.metadata );
snapshot->super.metadata = NULL;
}
return exit_status;
}
int opal_crs_dmtcp_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
int ret, exit_status = OPAL_SUCCESS;
int exec_status;
opal_crs_dmtcp_snapshot_t *snapshot = OBJ_NEW(opal_crs_dmtcp_snapshot_t);
snapshot->super = *base_snapshot;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: restart(--, %d)", spawn_child);
/*
* JJH NOTE: 'cold_start' indicates that this process is being restarted from
* opal-restart instead of from within an already running process.
* In the current code base, this is always set to true since it
* does not allow a process to request a restart of itself.
*/
if(snapshot->super.cold_start) {
/*
* Read the metadata left by the checkpoint() of this process
*/
if( OPAL_SUCCESS != (ret = dmtcp_cold_start(snapshot)) ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_restart: Unable to reconstruct the snapshot.");
exit_status = OPAL_ERROR;
goto cleanup;
}
}
/* JJH NOTE: Nearly all of the time the 'spawn_child' argument is set to
* 'false' indicating that the restart function is expected to
* call exec() directly. It is only set to 'true' if the user
* explicitly tells opal-restart to spawn off the child, which
* rarely/never happens. So I would not worry about that option.
*/
if( spawn_child ) {
pid_t child_pid = fork();
if(child_pid > 0)
goto cleanup;
else if(child_pid < 0) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_restart: Unable to spawn child.");
exit_status = OPAL_ERROR;
goto cleanup;
}
}
/*
* JJH NOTE: Restart the process by replacing this process
*/
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_restart: About to invoke command: %s with argv: %s %s",
MTCP_RESTART_COMMAND,
MTCP_RESTART_COMMAND,
snapshot->context_filename);
exec_status = execlp(MTCP_RESTART_COMMAND, MTCP_RESTART_COMMAND, snapshot->context_filename, NULL);
/* If we get down here, something has broken. */
if(exec_status < 0)
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_restart: error in replacing process: %s",
strerror(errno));
else
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_restart: exec() returned!");
exit_status = OPAL_ERROR;
goto cleanup;
cleanup:
return exit_status;
}
int opal_crs_dmtcp_disable_checkpoint(void)
{
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: disable_checkpoint()");
/*
* JJH NOTE: Enter a critical section. This is not really used in the code
* at the moment.
*/
mtcp_no();
return OPAL_SUCCESS;
}
int opal_crs_dmtcp_enable_checkpoint(void)
{
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: enable_checkpoint()");
/*
* JJH NOTE: Leave a critical section. This is not really used in the code
* at the moment.
*/
mtcp_ok();
return OPAL_SUCCESS;
}
/*****************************
* Local Function Definitions
*****************************/
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot) {
int ret, exit_status = OPAL_SUCCESS;
char **tmp_argv = NULL;
char * component_name = NULL;
int prev_pid;
/*
* Find the snapshot directory, read the metadata file for
* component name and previous pid
*/
if( NULL == snapshot->super.metadata ) {
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r")) ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_cold_start(): Error: Unable to open the file (%s)",
snapshot->super.metadata_filename);
exit_status = OPAL_ERROR;
goto cleanup;
}
}
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
&component_name, &prev_pid) ) ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
snapshot->super.metadata_filename, ret);
exit_status = ret;
goto cleanup;
}
snapshot->super.component_name = strdup(component_name);
/*
* Compare the component strings to make sure this is our snapshot before going further.
* JJH NOTE: This will nearly always be true since opal-restart also checks this metadata.
*/
if ( 0 != strncmp(mca_crs_dmtcp_component.super.base_version.mca_component_name,
component_name, strlen(component_name)) ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
component_name, mca_crs_dmtcp_component.super.base_version.mca_component_name);
exit_status = OPAL_ERROR;
goto cleanup;
}
/*
* Read context information from the metadata file
*/
opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
if( NULL == tmp_argv ) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: dmtcp_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
exit_status = OPAL_ERROR;
goto cleanup;
}
asprintf(&(snapshot->context_filename), "%s/%s", snapshot->super.snapshot_directory, tmp_argv[0]);
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: cold_start(%s)", snapshot->context_filename);
/*
* Reset the cold_start flag
*/
snapshot->super.cold_start = false;
cleanup:
if(NULL != tmp_argv) {
opal_argv_free(tmp_argv);
tmp_argv = NULL;
}
if( NULL != snapshot->super.metadata ) {
fclose(snapshot->super.metadata);
snapshot->super.metadata = NULL;
}
return exit_status;
}
/**
* Given a snapshot, generate the context filename and its full path.
*
* @param snapshot the snapshot with request information
*/
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot)
{
int retval;
retval = asprintf(&(snapshot->context_filename), "ompi_dmtcp_context.%ld", syscall(SYS_getpid));
if(retval == -1)
return -1;
return asprintf(&full_ckpt_path, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename);
}
/**
* This is a callback function to call the actual checkpointing routine.
* Instead of waiting for a specific interval as MTCP does, we will wait on a
* synchronization signal that will allow us to checkpoint on demand. The
* argument to this function will be ignored.
*/
static void dmtcp_sleep_between_ckpt_callback(int interval)
{
int signal_retval;
char buf[BUFSIZ];
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: called sleep_between_ckpt callback");
pthread_mutex_lock(&checkpoint_mutex);
/* If the MPI checkpoint thread is waiting on the checkpoint_done_cond and
* this thread is here, it means that a checkpoint has just completed.
* Let's signal the MPI checkpoint thread to resume. */
signal_retval = pthread_cond_signal(&checkpoint_done_cond);
if( 0 != signal_retval) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: post_ckpt_callback(): Unable to signal checkpoint done: %s",
strerror_r(signal_retval, buf, BUFSIZ));
}
/* now we simply wait for the signal to checkpoint */
pthread_cond_wait(&checkpoint_cond, &checkpoint_mutex);
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: received sync signal to checkpoint.");
/* We have now been instructed to checkpoint, so we return. Note that the
* mutex is still locked: the post_ckpt callback will unlock it. */
}
/**
* This is a callback function that is invoked before the checkpoint actually
* occurs. It enables us to do any logging that is necessary, as well as change
* the filename that the checkpoint will be written to. We expect that this
* filename will be pulled from the checkpoint options.
*
* @param ckpt_filename a pointer in which to store the desired checkpoint
* filename
*/
static void dmtcp_pre_ckpt_callback(char **ckpt_filename)
{
*ckpt_filename = full_ckpt_path;
}
/**
* This is a callback function that is invoked after the checkpoint has
* finished. It enables us to do any logging that is necessary, as well as
* report whether this is called from a restart or a checkpoint. We will report
* this status, signal the CRS code to continue running, and then release the
* mutex that we are holding.
*
* @param is_restarting whether or not this is being called as part of a restart
* @param mtcp_restore_argv_start_addr unused
*/
static void dmtcp_post_ckpt_callback(int is_restarting, char *mtcp_restore_argv_start_addr)
{
int unlock_retval;
char buf[BUFSIZ];
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: in post_ckpt_callback, restarting: %d", is_restarting);
if(is_restarting)
post_ckpt_state = OPAL_CRS_RESTART;
else
post_ckpt_state = OPAL_CRS_CONTINUE;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: unlocking at end of post_ckpt_callback");
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
if( 0 != unlock_retval) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: post_ckpt_callback(): Unable to unlock mutex: %s",
strerror_r(unlock_retval, buf, BUFSIZ));
}
}
/**
* This is a callback function that is invoked by DMTCP to see if it should
* checkpoint the given file descriptor.
*
* If the file descriptor is a socket, named-pipe or pseudo-terminal, DMTCP
* should skip checkpointing them.
*
* If we can't determine the type of fd (stat and/or readlink failed), we ask
* DMTCP to try to checkpoint them anyways with the assumption that DMTCP would
* warn users of any such case.
*
* @param fd file descriptor to checkpoint
* @return: 1 if DMTCP should ckpt the file descriptor, 0 otherwise.
*/
static int dmtcp_should_ckpt_fd_callback(int fd)
{
struct stat stat_buf;
char device_name[PATH_MAX];
char proc_filename[64];
char buf[BUFSIZ];
if (fstat(fd, &stat_buf) != 0) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: should_ckpt_fd_callback(): error stat()'ing %d: %s",
fd, strerror_r(errno, buf, BUFSIZ));
return 1;
/* Don't checkpoint sockets and FIFOs */
} else if (S_ISSOCK(stat_buf.st_mode) || S_ISFIFO(stat_buf.st_mode)) {
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: skipping checkpointing socket/fifo: %d",
fd);
return 0;
}
memset(device_name, 0, sizeof device_name);
sprintf(proc_filename, "/proc/self/fd/%d", fd);
if (readlink(proc_filename, device_name, sizeof(device_name) - 1) <= 0) {
opal_output(mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: should_ckpt_fd_callback(): readlink(%d) failed: %s",
fd, strerror_r(errno, buf, BUFSIZ));
return 1;
}
/* Don't checkpoint ptys */
if (strstr(device_name, "/dev/pts/") == 0 ||
strstr(device_name, "/dev/pty") == 0 ||
strstr(device_name, "/dev/tty") == 0) {
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: skipping checkpointing %s",
device_name);
return 0;
}
/* Checkpoint fd by default */
return 1;
}

13
opal/mca/crs/dmtcp/help-opal-crs-dmtcp.txt Обычный файл
Просмотреть файл

@ -0,0 +1,13 @@
-*- text -*-
#
# Copyright (c) 2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open PAL CRS DMTCP component.
#