1
1

remove some dead crs components

Signed-off-by: Howard Pritchard <howardp@lanl.gov>
(cherry picked from commit 6564d3d217)
Этот коммит содержится в:
Howard Pritchard 2018-10-17 10:29:00 -06:00
родитель b8e040c704
Коммит 210b4c60aa
21 изменённых файлов: 0 добавлений и 3210 удалений

Просмотреть файл

Просмотреть файл

@ -1,51 +0,0 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Build this component with the compiler and preprocessor flags that its
# configure macro computes and exports for it.
CFLAGS = $(crs_blcr_CFLAGS)
AM_CPPFLAGS = $(crs_blcr_CPPFLAGS)
# Help text shipped with the component.
dist_opaldata_DATA = help-opal-crs-blcr.txt
# Source list shared by the DSO and the static flavour of the component.
sources = \
crs_blcr.h \
crs_blcr_component.c \
crs_blcr_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_opal_crs_blcr_DSO
component_noinst =
component_install = mca_crs_blcr.la
else
component_noinst = libmca_crs_blcr.la
component_install =
endif
# DSO flavour: installed into the OPAL library directory and linked against
# the open-pal library plus the BLCR libraries.
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_blcr_la_SOURCES = $(sources)
mca_crs_blcr_la_LDFLAGS = -module -avoid-version $(crs_blcr_LDFLAGS)
mca_crs_blcr_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
$(crs_blcr_LIBS)
# Static flavour: not installed, only the BLCR libraries are added.
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_blcr_la_SOURCES = $(sources)
libmca_crs_blcr_la_LDFLAGS = -module -avoid-version $(crs_blcr_LDFLAGS)
libmca_crs_blcr_la_LIBADD = $(crs_blcr_LIBS)

Просмотреть файл

@ -1,204 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2010 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_opal_crs_blcr_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure-time probe for the BLCR checkpoint/restart library.
#
# Steps, in order:
#   1. declare the --with-blcr and --with-blcr-libdir options;
#   2. disable the component when fault tolerance was not requested, and
#      abort when BLCR was explicitly requested without it;
#   3. locate libcr.h and the cr library (symbol cr_init);
#   4. build the component flags, minus -pedantic and -Wundef;
#   5. probe for cr_request_file, cr_request_checkpoint and the
#      cr_checkpoint_info.requester member, recording each result as a
#      CRS_BLCR_HAVE_* define;
#   6. restore the global flags and export the component flags.
#
# The first macro argument (success action) and the second (failure action)
# are expanded at several intermediate checkpoints, not only at the end.
#
# NOTE(review): the final failure branch expands a third macro argument,
# while the header comment documents only two. Confirm which argument the
# MCA framework passes as the action-if-not-found.
AC_DEFUN([MCA_opal_crs_blcr_CONFIG],[
AC_CONFIG_FILES([opal/mca/crs/blcr/Makefile])
AC_ARG_WITH([blcr],
[AC_HELP_STRING([--with-blcr(=DIR)],
[Path to BLCR Installation])])
OPAL_CHECK_WITHDIR([blcr], [$with_blcr], [include/libcr.h])
AC_ARG_WITH([blcr-libdir],
[AC_HELP_STRING([--with-blcr-libdir=DIR],
[Search for BLCR libraries in DIR])])
OPAL_CHECK_WITHDIR([blcr-libdir], [$with_blcr_libdir], [libcr.*])
# Running verdict for the whole probe; every later stage is skipped once
# this is no longer "yes".
check_crs_blcr_good="no"
# If we do not want FT, don't compile this component
#
# If we wanted BLCR, but did not specify the FT option,
# error out with a warning for the user
AS_IF([test "$opal_want_ft_cr" = "0"],
[$2
check_crs_blcr_good="no"
AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "no"],
[AC_MSG_WARN([BLCR support requested, but FT support not requested. You need to specify the --with-ft=cr configure option.])
AC_MSG_ERROR([Aborting.])])
],
[check_crs_blcr_good="yes"])
# If we do not want BLCR, then do not compile it
AS_IF([test "$with_blcr" = "no" || test "$check_crs_blcr_good" = "no"],
[$2
check_crs_blcr_good="no"],
[check_crs_blcr_good="yes"])
# Defaults
check_crs_blcr_dir_msg="compiler default"
check_crs_blcr_libdir_msg="linker default"
check_crs_blcr_dir=""
check_crs_blcr_libdir=""
# Determine the search paths for the headers and libraries
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
[AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "yes"],
[check_crs_blcr_dir="$with_blcr"
check_crs_blcr_dir_msg="$with_blcr (from --with-blcr)"])
AS_IF([test ! -z "$with_blcr_libdir" && test "$with_blcr_libdir" != "yes"],
[check_crs_blcr_libdir="$with_blcr_libdir"
check_crs_blcr_libdir_msg="$with_blcr_libdir (from --with-blcr-libdir)"])
])
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
[AC_MSG_CHECKING([for BLCR dir])
AC_MSG_RESULT([$check_crs_blcr_dir_msg])
AC_MSG_CHECKING([for BLCR library dir])
AC_MSG_RESULT([$check_crs_blcr_libdir_msg])
OPAL_CHECK_PACKAGE([crs_blcr_check],
[libcr.h],
[cr],
[cr_init],
[],
[$check_crs_blcr_dir],
[$check_crs_blcr_libdir],
[check_crs_blcr_good="yes"],
[check_crs_blcr_good="no"])
])
# Remember the global flags so they can be restored after the probes, then
# derive the component flags from them plus what the package check found.
crs_blcr_save_CFLAGS="$CFLAGS"
crs_blcr_save_CPPFLAGS="$CPPFLAGS"
crs_blcr_save_LDFLAGS="$LDFLAGS"
crs_blcr_save_LIBS="$LIBS"
crs_blcr_CFLAGS="$CFLAGS $crs_blcr_check_CFLAGS"
crs_blcr_CPPFLAGS="$CPPFLAGS $crs_blcr_check_CPPFLAGS"
crs_blcr_LDFLAGS="$LDFLAGS $crs_blcr_check_LDFLAGS"
crs_blcr_LIBS="$LIBS $crs_blcr_check_LIBS"
# Check to see if we found the BLCR libcr.h library
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
[
#
# Since BLCR libraries are not fully ISO99 C compliant
# -pedantic and -Wundef raise a bunch of warnings, so
# we just strip them off for this component
AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for blcr component because libcr.h is not really ANSI C])
# Strip off problematic arguments
crs_blcr_CFLAGS="`echo $crs_blcr_CFLAGS | sed 's/-pedantic//g'`"
crs_blcr_CFLAGS="`echo $crs_blcr_CFLAGS | sed 's/-Wundef//g'`"
crs_blcr_CPPFLAGS="`echo $crs_blcr_CPPFLAGS | sed 's/-pedantic//g'`"
crs_blcr_CPPFLAGS="`echo $crs_blcr_CPPFLAGS | sed 's/-Wundef//g'`"
# The linker flags and libraries are kept as computed above; the two
# self-assignments below change nothing.
crs_blcr_LDFLAGS="$crs_blcr_LDFLAGS"
crs_blcr_LIBS="$crs_blcr_LIBS"
$1])
#
# Check for version difference which may have:
# - working cr_request_file
# - working cr_request_checkpoint (which should be used instead of cr_request_file)
# - 'requester' parameter to checkpoint_info
#
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
CFLAGS="$crs_blcr_CFLAGS"
CPPFLAGS="$crs_blcr_CPPFLAGS"
LDFLAGS="$crs_blcr_LDFLAGS"
LIBS="$crs_blcr_LIBS"
#
# First look for the cr_request_file function
#
crs_blcr_have_working_cr_request=0
AC_MSG_CHECKING(for BLCR working cr_request)
OPAL_SEARCH_LIBS_COMPONENT([crs_blcr], [cr_request_file],[cr],
[AC_TRY_COMPILE([#include <libcr.h>],
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
#error Version earlier than 0.6.0
#endif
],
[crs_blcr_have_working_cr_request=1
],
[crs_blcr_have_working_cr_request=0
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request_file])
])],
[crs_blcr_have_working_cr_request=0
AC_MSG_WARN([This BLCR version does not contain the cr_request_file function])
])
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
[BLCR cr_request_file check])
#
# Look for the cr_request_checkpoint function
#
crs_blcr_have_cr_request_checkpoint=0
AC_MSG_CHECKING(for BLCR cr_request_checkpoint)
OPAL_SEARCH_LIBS_COMPONENT([crs_blcr],
[cr_request_checkpoint],[cr],
[crs_blcr_have_cr_request_checkpoint=1
],
[crs_blcr_have_cr_request_checkpoint=0
AC_MSG_WARN([This BLCR version does not contain the cr_request_checkpoint function])
])
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT], [$crs_blcr_have_cr_request_checkpoint],
[BLCR cr_request_checkpoint check])
#
# Look for the cr_checkpoint_info.requester member
#
crs_blcr_have_info_requester=0
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
[crs_blcr_have_info_requester=1],
[AC_MSG_WARN([This BLCR version does not contain a 'requester' member of the 'cr_checkpoint_info' struct])],
[#include <libcr.h>])
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_INFO_REQUESTER], [$crs_blcr_have_info_requester],
[BLCRs cr_checkpoint_info.requester member availability])
$1])
#
# Require either a working cr_request_file() or cr_request_checkpoint() function
#
AS_IF([test "$crs_blcr_have_working_cr_request" = "0" && test "$crs_blcr_have_cr_request_checkpoint" = "0"],
[$2
check_crs_blcr_good="no"
AC_MSG_WARN([The BLCR CRS component requires either the cr_request_checkpoint() or cr_request_file() functions])])
#
# Reset the flags
#
CFLAGS="$crs_blcr_save_CFLAGS"
CPPFLAGS="$crs_blcr_save_CPPFLAGS"
LDFLAGS="$crs_blcr_save_LDFLAGS"
LIBS="$crs_blcr_save_LIBS"
#
# Final verdict: export the component flags on success; abort when BLCR was
# explicitly requested but could not be used.
AS_IF([test "$check_crs_blcr_good" = "yes"],
[ AC_SUBST([crs_blcr_CFLAGS])
AC_SUBST([crs_blcr_CPPFLAGS])
AC_SUBST([crs_blcr_LDFLAGS])
AC_SUBST([crs_blcr_LIBS])
$1],
[AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "no"],
[AC_MSG_WARN([BLCR support requested but not found. Perhaps you need to specify the location of the BLCR libraries.])
AC_MSG_ERROR([Aborting.])])
$3])
])dnl

Просмотреть файл

@ -1,84 +0,0 @@
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* BLCR CRS component
*
*/
#ifndef MCA_CRS_BLCR_EXPORT_H
#define MCA_CRS_BLCR_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/base/base.h"
#include <libcr.h>
BEGIN_C_DECLS
/*
* Local Component structures
*/
struct opal_crs_blcr_component_t {
/** Base CRS component */
opal_crs_base_component_t super;
};
typedef struct opal_crs_blcr_component_t opal_crs_blcr_component_t;
/** The single instance of the BLCR component. */
OPAL_MODULE_DECLSPEC extern opal_crs_blcr_component_t mca_crs_blcr_component;
/**
 * Component query: reports the component priority through @p priority and
 * hands back the BLCR module through @p module.
 */
int opal_crs_blcr_component_query(mca_base_module_t **module, int *priority);
/** Debugging switch: when true, checkpoints are written to /dev/null. */
extern bool opal_crs_blcr_dev_null;
/*
* Module functions
*/
/** Initializes BLCR and registers the checkpoint callbacks. */
int opal_crs_blcr_module_init(void);
/** Releases the module's resources and unregisters the callbacks. */
int opal_crs_blcr_module_finalize(void);
/*
* Actual functionality
*/
/**
 * Checkpoints the calling process. @p pid must be the caller's own pid;
 * any other value is rejected.
 */
int opal_crs_blcr_checkpoint( pid_t pid,
opal_crs_base_snapshot_t *snapshot,
opal_crs_base_ckpt_options_t *options,
opal_crs_state_type_t *state);
/**
 * Restarts from @p snapshot by executing the BLCR restart command, either
 * in place or, when @p spawn_child is true, in a forked child whose pid is
 * stored in @p child_pid.
 */
int opal_crs_blcr_restart( opal_crs_base_snapshot_t *snapshot,
bool spawn_child,
pid_t *child_pid);
/** Enters the BLCR critical section, which holds off checkpoints. */
int opal_crs_blcr_disable_checkpoint(void);
/** Leaves the BLCR critical section again. */
int opal_crs_blcr_enable_checkpoint(void);
/**
 * Prepares the environment of a process about to be launched: sets the
 * "opal_cr_is_tool" variable to "0" in @p env. The other arguments are not
 * used by this component.
 */
int opal_crs_blcr_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
/** Calls cr_init() and reports whether it succeeded. */
int opal_crs_blcr_reg_thread(void);
END_C_DECLS
#endif /* MCA_CRS_BLCR_EXPORT_H */

Просмотреть файл

@ -1,145 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/util/output.h"
#include "opal/constants.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_blcr.h"
/*
* Local functionality
*/
/* Registers the component's MCA variables (priority, verbose, dev_null). */
static int crs_blcr_register (void);
/* Selects the output channel and logs the current settings. */
static int crs_blcr_open(void);
/* Logs that the component is being closed. */
static int crs_blcr_close(void);
/* Debugging switch: when true, checkpoints are written to /dev/null. */
bool opal_crs_blcr_dev_null = false;
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * The one public instance of the BLCR component. The outer braces
 * initialize the component itself, the inner braces its base CRS
 * component member.
 */
opal_crs_blcr_component_t mca_crs_blcr_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
.base_version = {
OPAL_CRS_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "blcr",
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = crs_blcr_open,
.mca_close_component = crs_blcr_close,
.mca_query_component = opal_crs_blcr_component_query,
.mca_register_component_params = crs_blcr_register
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Placeholders: verbose is reset at registration time and the output
 * handle is chosen when the component is opened. */
.verbose = 0,
.output_handle = -1,
}
};
/**
 * Register the MCA variables of the BLCR CRS component.
 *
 * Three variables are registered against the component's base version:
 *   - "priority": selection priority, default 10
 *   - "verbose":  verbosity level, default 0
 *   - "dev_null": debugging switch that sends checkpoints to /dev/null,
 *                 default false
 *
 * @return OPAL_SUCCESS when every variable was registered, otherwise the
 *         negative value returned by mca_base_component_var_register().
 */
static int crs_blcr_register (void)
{
    int ret;

    /*
     * NOTE(review): every call below passes an explicit 0 for the "bind"
     * argument that mca_base_component_var_register() takes between the
     * enumerator and the flags. The calls were previously one argument
     * short; confirm against opal/mca/base/mca_base_var.h in this tree.
     */
    mca_crs_blcr_component.super.priority = 10;
    ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
                                           "priority", "Priority of the CRS blcr component "
                                           "(default: 10)", MCA_BASE_VAR_TYPE_INT, NULL,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                           &mca_crs_blcr_component.super.priority);
    if (0 > ret) {
        return ret;
    }

    mca_crs_blcr_component.super.verbose = 0;
    ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
                                           "verbose",
                                           "Verbose level for the CRS blcr component",
                                           MCA_BASE_VAR_TYPE_INT, NULL,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_crs_blcr_component.super.verbose);
    if (0 > ret) {
        return ret;
    }

    opal_crs_blcr_dev_null = false;
    ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
                                           "dev_null",
                                           "Not for general use! For debugging only! Save checkpoint to /dev/null. [Default = disabled]",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                           &opal_crs_blcr_dev_null);

    return (0 > ret) ? ret : OPAL_SUCCESS;
}
/**
 * Open the component: pick the output channel and log the settings.
 *
 * A non-zero component verbosity gets a dedicated output stream set to
 * that verbosity; otherwise the CRS framework's output stream is reused.
 *
 * @return OPAL_SUCCESS always.
 */
static int crs_blcr_open(void)
{
    const int verbosity = mca_crs_blcr_component.super.verbose;
    int handle;

    if (0 == verbosity) {
        /* No custom level: inherit the framework's channel. */
        handle = opal_crs_base_framework.framework_output;
        mca_crs_blcr_component.super.output_handle = handle;
    } else {
        handle = opal_output_open(NULL);
        mca_crs_blcr_component.super.output_handle = handle;
        opal_output_set_verbosity(handle, verbosity);
    }

    /* Debug trace of the effective configuration. */
    opal_output_verbose(10, handle, "crs:blcr: open()");
    opal_output_verbose(20, handle,
                        "crs:blcr: open: priority = %d",
                        mca_crs_blcr_component.super.priority);
    opal_output_verbose(20, handle,
                        "crs:blcr: open: verbosity = %d",
                        mca_crs_blcr_component.super.verbose);
    opal_output_verbose(10, handle,
                        "crs:blcr: open: dev_null = %s",
                        (opal_crs_blcr_dev_null == true ? "True" : "False"));

    return OPAL_SUCCESS;
}
/**
 * Close the component. Only emits a debug trace.
 *
 * @return OPAL_SUCCESS always.
 */
static int crs_blcr_close(void)
{
    const int handle = mca_crs_blcr_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:blcr: close()");

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,866 +0,0 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/threads/threads.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/mca/event/event.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_blcr.h"
/*
* Blcr module
*/
/*
 * Function table handed out by opal_crs_blcr_component_query(). The
 * initializer is positional, so the order of the entries must not change.
 */
static opal_crs_base_module_t blcr_module = {
/** Initialization Function */
opal_crs_blcr_module_init,
/** Finalization Function */
opal_crs_blcr_module_finalize,
/** Checkpoint interface */
opal_crs_blcr_checkpoint,
/** Restart Command Access */
opal_crs_blcr_restart,
/** Disable checkpoints */
opal_crs_blcr_disable_checkpoint,
/** Enable checkpoints */
opal_crs_blcr_enable_checkpoint,
/** Prelaunch */
opal_crs_blcr_prelaunch,
/** Register Thread */
opal_crs_blcr_reg_thread
};
/***************************
* Snapshot Class Functions
***************************/
/*
 * BLCR snapshot: the base CRS snapshot extended with the name of the BLCR
 * context file.
 */
OBJ_CLASS_DECLARATION(opal_crs_blcr_snapshot_t);
struct opal_crs_blcr_snapshot_t {
/** Base CRS snapshot type */
opal_crs_base_snapshot_t super;
/** Name of the BLCR context file; NULL until set, released by the destructor */
char * context_filename;
};
typedef struct opal_crs_blcr_snapshot_t opal_crs_blcr_snapshot_t;
/* Constructor: clears context_filename and records the component name. */
void opal_crs_blcr_construct(opal_crs_blcr_snapshot_t *obj);
/* Destructor: releases context_filename. */
void opal_crs_blcr_destruct( opal_crs_blcr_snapshot_t *obj);
OBJ_CLASS_INSTANCE(opal_crs_blcr_snapshot_t,
opal_crs_base_snapshot_t,
opal_crs_blcr_construct,
opal_crs_blcr_destruct);
/******************
* Local Functions
******************/
/* Builds the context file name "ompi_blcr_context.<pid>". */
static int blcr_get_checkpoint_filename(char **fname, pid_t pid);
/* Callback registered with BLCR for CR_THREAD_CONTEXT. */
static int opal_crs_blcr_thread_callback(void *arg);
/* Callback registered with BLCR for CR_SIGNAL_CONTEXT. */
static int opal_crs_blcr_signal_callback(void *arg);
/* Builds the "<restart command> <file>" command line. */
static int opal_crs_blcr_restart_cmd(char *fname, char **cmd);
/* Reconstructs a snapshot from its metadata file. */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot);
#if OPAL_ENABLE_CRDEBUG == 1
static void MPIR_checkpoint_debugger_crs_hook(cr_hook_event_t event);
#endif
/*************************
* Local Global Variables
*************************/
#if OPAL_ENABLE_CRDEBUG == 1
static opal_thread_t *checkpoint_thread_id = NULL;
static bool blcr_crdebug_refreshed_env = false;
#endif
/* Value returned by cr_init() in module_init; used for the critical section calls. */
static cr_client_id_t client_id;
/* Identifiers of the two callbacks registered in module_init. */
static cr_callback_id_t cr_thread_callback_id;
static cr_callback_id_t cr_signal_callback_id;
/* Current CRS state; OPAL_CRS_RUNNING between module_init and module_finalize. */
static int blcr_current_state = OPAL_CRS_NONE;
/* Heap copies of the BLCR command names, set in module_init. */
static char *blcr_restart_cmd = NULL;
static char *blcr_checkpoint_cmd = NULL;
/* Condition and lock used by the thread callback. */
static opal_condition_t blcr_cond;
static opal_mutex_t blcr_lock;
/* Pid of this process, captured in module_init. */
static pid_t my_pid = -1;
/**
 * Constructor for a BLCR snapshot object.
 *
 * Starts with no context file and stamps the snapshot with a heap copy of
 * this component's name.
 */
void opal_crs_blcr_construct(opal_crs_blcr_snapshot_t *snapshot)
{
    const char *my_name = mca_crs_blcr_component.super.base_version.mca_component_name;

    snapshot->context_filename = NULL;
    snapshot->super.component_name = strdup(my_name);
}
/**
 * Destructor for a BLCR snapshot object: releases the context file name.
 */
void opal_crs_blcr_destruct( opal_crs_blcr_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no guard is needed. */
    free(snapshot->context_filename);
    snapshot->context_filename = NULL;
}
/*****************
* MCA Functions
*****************/
/**
 * Component query entry point.
 *
 * @param module   receives a pointer to the BLCR module function table
 * @param priority receives the component's configured priority
 * @return OPAL_SUCCESS always.
 */
int opal_crs_blcr_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: component_query()");

    if (1) {
        *priority = mca_crs_blcr_component.super.priority;
        *module = (mca_base_module_t *) &blcr_module;
    }

    return OPAL_SUCCESS;
}
/**
 * Initialize the BLCR module.
 *
 * Records the BLCR command names and the pid of this process. Unless the
 * process is a tool (opal_cr_is_tool), it also constructs the lock and
 * condition variable, initializes BLCR through cr_init(), and registers
 * the thread-context and signal-context callbacks. On success the module
 * state becomes OPAL_CRS_RUNNING.
 *
 * The command names are duplicated exactly once; the earlier second
 * strdup() pair overwrote, and so leaked, the first pair.
 *
 * @return OPAL_SUCCESS, or OPAL_ERROR when cr_init() fails.
 */
int opal_crs_blcr_module_init(void)
{
    void *crs_blcr_thread_callback_arg = NULL;
    void *crs_blcr_signal_callback_arg = NULL;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init()");

    blcr_restart_cmd    = strdup("cr_restart");
    blcr_checkpoint_cmd = strdup("cr_checkpoint");

    my_pid = getpid();

    if( !opal_cr_is_tool ) {
        /* We need to make the lock and condition variable before
         * starting the thread, since the thread uses these vars.
         */
        OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t);
        OBJ_CONSTRUCT(&blcr_cond, opal_condition_t);

        /*
         * Initialize BLCR
         */
        client_id = cr_init();
        if (0 > client_id) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id);
            return OPAL_ERROR;
        }
    }

#if OPAL_ENABLE_CRDEBUG == 1
    blcr_crdebug_refreshed_env = false;
#endif

    if( !opal_cr_is_tool ) {
        /*
         * Register the thread handler
         */
        cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback,
                                                     crs_blcr_thread_callback_arg,
                                                     CR_THREAD_CONTEXT);
        /*
         * Register the signal handler
         *  - even though we do not use it
         */
        cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback,
                                                     crs_blcr_signal_callback_arg,
                                                     CR_SIGNAL_CONTEXT);

#if OPAL_ENABLE_CRDEBUG == 1
        /*
         * Checkpoint/restart enabled debugging hooks
         *  "NO_CALLBACKS"   -> non-MPI threads
         *  "SIGNAL_CONTEXT" -> MPI threads
         *  "THREAD_CONTEXT" -> BLCR threads
         */
        cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS,    MPIR_checkpoint_debugger_crs_hook);
        cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT,  MPIR_checkpoint_debugger_crs_hook);
        cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS,   MPIR_checkpoint_debugger_crs_hook);
        cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook);
#endif
    }

    /*
     * Now that we are done with init, set the state to running
     */
    blcr_current_state = OPAL_CRS_RUNNING;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init() --> Finished [%d]",
                        opal_cr_is_tool);

    return OPAL_SUCCESS;
}
/**
 * Prepare the environment of a process about to be launched.
 *
 * Looks up the environment-variable name for the "opal_cr_is_tool" MCA
 * variable and sets it to "0" in @p env, overwriting any existing value.
 *
 * @param rank              unused by this component
 * @param base_snapshot_dir unused by this component
 * @param app               unused by this component
 * @param cwd               unused by this component
 * @param argv              unused by this component
 * @param env               environment to update
 * @return OPAL_SUCCESS, or the lookup's error code when the variable name
 *         could not be built.
 */
int opal_crs_blcr_prelaunch(int32_t rank,
                            char *base_snapshot_dir,
                            char **app,
                            char **cwd,
                            char ***argv,
                            char ***env)
{
    char *tmp_env_var = NULL;
    int ret;

    /* Do not touch tmp_env_var when the lookup failed: it may not point at
     * a valid string, and was previously passed on and freed regardless.
     * NOTE(review): assumes mca_base_var_env_name() returns an OPAL status
     * code - confirm against opal/mca/base/mca_base_var.h. */
    ret = mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    if (OPAL_SUCCESS != ret) {
        return ret;
    }

    opal_setenv(tmp_env_var,
                "0", true, env);
    free(tmp_env_var);
    tmp_env_var = NULL;

    return OPAL_SUCCESS;
}
/**
 * Initialize BLCR through cr_init() and report the outcome.
 *
 * @return OPAL_SUCCESS, or OPAL_ERROR when cr_init() returns a negative
 *         client id.
 */
int opal_crs_blcr_reg_thread(void)
{
    const cr_client_id_t thread_client = cr_init();

    if (thread_client >= 0) {
        return OPAL_SUCCESS;
    }

    opal_output(mca_crs_blcr_component.super.output_handle,
                "Error: crs:blcr: reg_thread: cr_init failed (%d)\n", thread_client);
    return OPAL_ERROR;
}
/**
 * Finalize the BLCR module.
 *
 * Frees the command-name strings. For non-tool processes it also destroys
 * the lock and condition variable and, if the module is still in the
 * running state, replaces both BLCR callbacks with NULL. The module state
 * is reset to OPAL_CRS_NONE.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_blcr_module_finalize(void)
{
    const int handle = mca_crs_blcr_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:blcr: module_finalize()");

    /* Release the command names; free(NULL) is harmless. */
    free(blcr_restart_cmd);
    blcr_restart_cmd = NULL;
    free(blcr_checkpoint_cmd);
    blcr_checkpoint_cmd = NULL;

    if( !opal_cr_is_tool ) {
        OBJ_DESTRUCT(&blcr_lock);
        OBJ_DESTRUCT(&blcr_cond);

        if( OPAL_CRS_RUNNING == blcr_current_state ) {
            /* Detach the thread-context callback, then the signal-context one */
            cr_replace_callback(cr_thread_callback_id, NULL, NULL, CR_THREAD_CONTEXT);
            cr_replace_callback(cr_signal_callback_id, NULL, NULL, CR_SIGNAL_CONTEXT);
        }

#if OPAL_ENABLE_CRDEBUG == 1
        /* Clear the checkpoint/restart debugging hooks */
        cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS,    NULL);
        cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT,  NULL);
        cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS,   NULL);
        cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, NULL);
#endif
    }

    /* BLCR itself offers no finalization routine. */
    blcr_current_state = OPAL_CRS_NONE;

    return OPAL_SUCCESS;
}
/**
 * Checkpoint the calling process with BLCR.
 *
 * Appends the component name and context file name to the snapshot's
 * metadata file, then requests the checkpoint: through
 * cr_request_checkpoint() when available, otherwise through
 * cr_request_file(). With the dev_null debugging switch set, the
 * checkpoint image goes to /dev/null.
 *
 * Fixes over the previous version:
 *  - the pid-mismatch path no longer dereferences a NULL snapshot in the
 *    cleanup code;
 *  - the checkpoint file name is released on every path, not only on
 *    success;
 *  - the checkpoint file descriptor is closed when polling fails;
 *  - failures to build the file names are detected rather than used.
 *
 * NOTE(review): snapshot->super.component_name and
 * snapshot->context_filename are overwritten without releasing any earlier
 * value; whether that leaks depends on how the caller built the snapshot.
 *
 * @param pid           must equal the pid of this process
 * @param base_snapshot snapshot to fill in; treated as a BLCR snapshot
 * @param options       checkpoint options; only "stop" is consulted
 * @param state         receives the resulting CRS state
 * @return OPAL_SUCCESS, OPAL_ERROR, or the negative value reported by BLCR.
 */
int opal_crs_blcr_checkpoint(pid_t pid,
                             opal_crs_base_snapshot_t *base_snapshot,
                             opal_crs_base_ckpt_options_t *options,
                             opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = NULL;
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
    int fd = -1;
#endif
    char *loc_fname = NULL;

    if( pid != my_pid ) {
        opal_output(0, "crs:blcr: checkpoint(%d, ---): Checkpointing of peers not allowed!", pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    snapshot = (opal_crs_blcr_snapshot_t *)base_snapshot;

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    if( OPAL_SUCCESS != (ret = blcr_get_checkpoint_filename(&(snapshot->context_filename), pid)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename);
    fclose(snapshot->super.metadata );
    snapshot->super.metadata = NULL;

    /*
     * If we are checkpointing ourselves do so:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
    if( opal_crs_blcr_dev_null ) {
        loc_fname = strdup("/dev/null");
    } else if( 0 > asprintf(&loc_fname, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename) ) {
        /* The pointer is not usable after a failed asprintf() */
        loc_fname = NULL;
    }
    if( NULL == loc_fname ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to build the checkpoint file name for pid (%d)",
                    pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    /* Make sure to identify the checkpointing thread, so that it is not
     * prevented from requesting the checkpoint after the debugger detaches
     */
    opal_cr_debug_set_current_ckpt_thread_self();
    checkpoint_thread_id = opal_thread_get_self();
    blcr_crdebug_refreshed_env = false;

    /* If checkpoint/restart enabled debugging then mark detachment place */
    if( MPIR_debug_with_checkpoint ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Detaching debugger...");
        MPIR_checkpoint_debugger_detach();
    }
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint SELF <%s>",
                        loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    fd = open(loc_fname,
              O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
              S_IRUSR | S_IWUSR);
    if( fd < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                    loc_fname, pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    cr_initialize_checkpoint_args_t(&cr_args);
    cr_args.cr_scope = CR_SCOPE_PROC;
    cr_args.cr_fd    = fd;
    if( options->stop ) {
        cr_args.cr_signal = SIGSTOP;
    }

    ret = cr_request_checkpoint(&cr_args, &cr_handle);
    if( ret < 0 ) {
        /* The descriptor is closed by the cleanup code */
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                    pid, loc_fname);
        exit_status = ret;
        goto cleanup;
    }

    /* Wait for checkpoint to finish */
    do {
        ret = cr_poll_checkpoint(&cr_handle, NULL);
        if( ret < 0 ) {
            /* Check if restarting. This is not an error. */
            if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                ret = 0;
                break;
            }
            /* If Call was interrupted by a signal, retry the call */
            else if (errno == EINTR) {
                ;
            }
            /* Otherwise this is a real error that we need to deal with */
            else {
                *state = OPAL_CRS_ERROR;
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                            pid, loc_fname, ret);
                exit_status = ret;
                goto cleanup;
            }
        }
    } while( ret < 0 );

    /* Close the file */
    close(fd);
    fd = -1;
#else
    /* Request a checkpoint be taken of the current process.
     * Since we are not guaranteed to finish the checkpoint before this
     * returns, we also need to wait for it.
     */
    cr_request_file(loc_fname);

    /* Wait for checkpoint to finish */
    do {
        usleep(1000); /* JJH Do we really want to sleep? */
    } while(CR_STATE_IDLE != cr_status());
#endif
#endif

    *state = blcr_current_state;

 cleanup:
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    /* Still open only when an error path jumped here */
    if( fd >= 0 ) {
        close(fd);
    }
#endif
    free(loc_fname);

    /* snapshot is still NULL when the pid check failed */
    if( NULL != snapshot && NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata );
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
/**
 * Restart a process from a BLCR snapshot.
 *
 * Builds the restart command for the snapshot's context file, shuts down
 * the progress engine and the event framework, and then either replaces
 * this process with the restart command (spawn_child false) or forks a
 * child that does so (spawn_child true).
 *
 * Fixes over the previous version:
 *  - a failed fork() is reported as OPAL_ERROR, not OPAL_SUCCESS;
 *  - the joined command string used for logging is released.
 *
 * NOTE(review): the local snapshot object is never released. It holds a
 * shallow copy of *base_snapshot, so releasing it here could free members
 * the caller still owns; ownership should be settled before changing that.
 *
 * @param base_snapshot snapshot to restart from
 * @param spawn_child   true to restart in a forked child
 * @param child_pid     receives the fork() result when spawn_child is true
 * @return OPAL_SUCCESS for the parent of a successful fork; otherwise an
 *         error code or the execvp() result.
 */
int opal_crs_blcr_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
    char **cr_argv = NULL;
    char *cr_cmd = NULL;
    char *cr_full_cmd = NULL;
    int ret;
    int exit_status = OPAL_SUCCESS;
    int status;

    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: restart(--, %d)", spawn_child);

    /*
     * If we need to reconstruct the snapshot,
     */
    if(snapshot->super.cold_start) {
        if( OPAL_SUCCESS != (ret = blcr_cold_start(snapshot)) ) {
            exit_status = OPAL_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
            goto cleanup;
        }
    }

    /*
     * Get the restart command
     */
    if ( OPAL_SUCCESS != (ret = opal_crs_blcr_restart_cmd(snapshot->context_filename, &cr_cmd)) ) {
        exit_status = ret;
        goto cleanup;
    }
    if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Need to shutdown the event engine before this.
     * for some reason the BLCR checkpointer and our event engine don't get
     * along very well.
     */
    opal_progress_finalize();
    (void) mca_base_framework_close(&opal_event_base_framework);

    if (!spawn_child) {
        cr_full_cmd = opal_argv_join(cr_argv, ' ');
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: blcr_restart: SELF: exec :(%s, %s):",
                            blcr_restart_cmd, cr_full_cmd);

        /* Only returns when the exec failed */
        status = execvp(blcr_restart_cmd, cr_argv);

        if(status < 0) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: SELF: Child failed to execute :(%d):", status);
        }
        opal_show_help("help-opal-crs-blcr.txt", "blcr:restart_failed_exec", true,
                       status,
                       blcr_restart_cmd,
                       cr_full_cmd);

        exit_status = status;
        goto cleanup;
    }
    /*
     * Restart by starting a new process
     */
    else {
        *child_pid = fork();

        if( 0 == *child_pid) {
            /* Child Process */
            cr_full_cmd = opal_argv_join(cr_argv, ' ');
            opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                                "crs:blcr: blcr_restart: CHILD: exec :(%s, %s):",
                                blcr_restart_cmd,
                                cr_full_cmd);

            /* Only returns when the exec failed */
            status = execvp(blcr_restart_cmd, cr_argv);

            if(status < 0) {
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: blcr_restart: CHILD: Child failed to execute :(%d):", status);
            }
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: CHILD: execvp returned %d", status);

            exit_status = status;
            goto cleanup;
        }
        else if(*child_pid > 0) {
            /* Parent is done once it is started. */
            ;
        }
        else {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: CHILD: fork failed :(%d):", *child_pid);
            /* No child exists, so the restart did not happen */
            exit_status = OPAL_ERROR;
        }
    }

 cleanup:
    free(cr_full_cmd);
    free(cr_cmd);
    if(NULL != cr_argv) {
        opal_argv_free(cr_argv);
    }

    return exit_status;
}
/**
 * Hold off checkpoints by entering the BLCR critical section with the
 * client id obtained at module initialization.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_blcr_disable_checkpoint(void)
{
    const int handle = mca_crs_blcr_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:blcr: disable_checkpoint()");

    cr_enter_cs(client_id);

    return OPAL_SUCCESS;
}
/**
 * Allow checkpoints again by leaving the BLCR critical section with the
 * client id obtained at module initialization.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_blcr_enable_checkpoint(void)
{
    const int handle = mca_crs_blcr_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:blcr: enable_checkpoint()");

    cr_leave_cs(client_id);

    return OPAL_SUCCESS;
}
/*****************************
* Local Function Definitions
*****************************/
/**
 * BLCR thread-context callback.
 *
 * Takes the module lock, marks the module as checkpointing, and lets BLCR
 * take the checkpoint. When the requester is known and is not this
 * process, the checkpoint is omitted and the state returns to running.
 * Otherwise the pre- and post-checkpoint user callbacks run around
 * cr_checkpoint(), and the state becomes OPAL_CRS_RESTART when
 * cr_checkpoint() returns a positive value, OPAL_CRS_CONTINUE otherwise.
 *
 * Fix over the previous version: the omit path returned with blcr_lock
 * still held; it is now released there as well.
 *
 * @param arg unused
 * @return 0 always.
 */
static int opal_crs_blcr_thread_callback(void *arg) {
    const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
    int ret;

    (void) arg;
    (void) ckpt_info; /* only read when the requester member is available */

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: thread_callback()");

    OPAL_THREAD_LOCK(&blcr_lock);
    blcr_current_state = OPAL_CRS_CHECKPOINT;

    /*
     * Allow the checkpoint to be taken, if we requested it
     */
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    if( ckpt_info->requester != my_pid ) {
        ret = cr_checkpoint(CR_CHECKPOINT_OMIT);
        blcr_current_state = OPAL_CRS_RUNNING;
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback(); WARNING: An external agent attempted to checkpoint this process "
                            "when it did not expect to be checkpointed. Skipping this checkpoint request."
                            " [%d != %d].", ckpt_info->requester, my_pid);
        /* Release the lock taken above before bailing out */
        OPAL_THREAD_UNLOCK(&blcr_lock);
        return 0;
    }
    else
#endif
    {
        if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_CRS_PRE_CKPT,
                                                                 OPAL_CR_INC_STATE_PREPARE)) ) {
            ;
        }

        ret = cr_checkpoint(0);
    }

    /*
     * Restarting
     */
    if ( 0 < ret ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Restarting.");
        blcr_current_state = OPAL_CRS_RESTART;
    }
    /*
     * Continuing
     */
    else {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Continue.");
        blcr_current_state = OPAL_CRS_CONTINUE;
    }

    if( OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_CRS_POST_CKPT,
                                                              (blcr_current_state == OPAL_CRS_CONTINUE ?
                                                               OPAL_CR_INC_STATE_CONTINUE :
                                                               OPAL_CR_INC_STATE_RESTART))) ) {
        ;
    }

    OPAL_THREAD_UNLOCK(&blcr_lock);
    opal_condition_signal(&blcr_cond);

    return 0;
}
/**
 * BLCR signal-context checkpoint callback.
 *
 * Calls cr_checkpoint() with CR_CHECKPOINT_OMIT when requester tracking is
 * compiled in and the request did not originate from my_pid; otherwise
 * calls it with no flags.  The result of cr_checkpoint() is not used.
 *
 * @param arg  Not used by this function.
 * @return 0 always.
 */
static int opal_crs_blcr_signal_callback(void *arg) {
    const struct cr_checkpoint_info *info = cr_get_checkpoint_info();
    int flags = 0;

#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    /* Only checkpoints that we requested ourselves are actually taken */
    if( my_pid != info->requester ) {
        flags = CR_CHECKPOINT_OMIT;
    }
#endif

    (void) cr_checkpoint(flags);

    return 0;
}
static int opal_crs_blcr_restart_cmd(char *fname, char **cmd)
{
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: restart_cmd(%s, ---)", fname);
if (NULL == fname) {
opal_output_verbose(10, opal_crs_base_framework.framework_output,
"crs:blcr: restart_cmd: Error: filename is NULL!");
return OPAL_CRS_ERROR;
}
asprintf(cmd, "%s %s", blcr_restart_cmd, fname);
return OPAL_SUCCESS;
}
static int blcr_get_checkpoint_filename(char **fname, pid_t pid)
{
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: get_checkpoint_filename(--, %d)", pid);
asprintf(fname, "ompi_blcr_context.%d", pid);
return OPAL_SUCCESS;
}
/**
 * Populate a snapshot object from its on-disk metadata ("cold start").
 *
 * Opens the metadata file if it is not open yet, extracts the component
 * name recorded in it, verifies that the snapshot was written by this
 * component, reads the context-file token and builds
 * snapshot->context_filename from the snapshot directory and that token.
 * On success the cold_start flag is cleared.  The metadata file is always
 * closed before returning.
 *
 * @param snapshot  Snapshot to fill in.
 * @return OPAL_SUCCESS on success; OPAL_ERROR or the error returned by
 *         opal_crs_base_extract_expected_component() otherwise.
 */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char * component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file
     */
    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* Guard the string functions below against a missing name */
    if( NULL == component_name ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* NOTE(review): ownership of component_name and of any previous
     * snapshot->super.component_name value is not visible here - confirm
     * whether either needs to be freed. */
    snapshot->super.component_name = strdup(component_name);

    /*
     * Compare the component strings to make sure this is our snapshot before
     * going further.  The comparison is exact: bounding it by the length of
     * the snapshot's name would also accept an empty or truncated name.
     */
    if ( 0 != strcmp(mca_crs_blcr_component.super.base_version.mca_component_name,
                     component_name) ) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_blcr_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Context Filename
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if( NULL == tmp_argv || NULL == tmp_argv[0] ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* asprintf() leaves the target undefined on failure, so reset it */
    if( 0 > asprintf(&snapshot->context_filename, "%s/%s",
                     snapshot->super.snapshot_directory, tmp_argv[0]) ) {
        snapshot->context_filename = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    if( NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata);
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
#if OPAL_ENABLE_CRDEBUG == 1
/*
 * Hook invoked with a BLCR hook event; only compiled when
 * OPAL_ENABLE_CRDEBUG is 1.
 *
 * On the restart events it makes every thread wait until the environment
 * has been refreshed (blcr_crdebug_refreshed_env), emits a verbose trace
 * describing which event is being handled, and finally calls
 * MPIR_checkpoint_debugger_waitpoint().
 */
static void MPIR_checkpoint_debugger_crs_hook(cr_hook_event_t event) {
opal_thread_t *my_thread_id = NULL;
my_thread_id = opal_thread_get_self();
/* Non-MPI threads */
if(event == CR_HOOK_RSTRT_NO_CALLBACKS ) {
/* wait for the MPI thread to refresh the environment for us */
while(!blcr_crdebug_refreshed_env) {
sched_yield();
}
}
/* MPI threads */
else if(event == CR_HOOK_RSTRT_SIGNAL_CONTEXT ) {
/* The thread matching checkpoint_thread_id refreshes the environment and
 * raises the flag; every other thread yields until the flag is set. */
if( opal_thread_self_compare(checkpoint_thread_id) ) {
opal_cr_refresh_environ(my_pid);
blcr_crdebug_refreshed_env = true;
} else {
while(!blcr_crdebug_refreshed_env) {
sched_yield();
}
}
}
/*
 * Some debugging output
 */
/* Non-MPI threads */
if( event == CR_HOOK_CONT_NO_CALLBACKS ) {
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Continue (Non-MPI). (%d)",
(int)my_thread_id->t_handle);
}
else if(event == CR_HOOK_RSTRT_NO_CALLBACKS ) {
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Restart (Non-MPI). (%d)",
(int)my_thread_id->t_handle);
}
/* MPI Threads */
else if( event == CR_HOOK_CONT_SIGNAL_CONTEXT ) {
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Continue (MPI).");
}
else if(event == CR_HOOK_RSTRT_SIGNAL_CONTEXT ) {
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Restart (MPI).");
}
/*
 * Enter the breakpoint function.
 * If no debugger intends on attaching, then this function is expected to
 * return immediately.
 *
 * If this is an MPI thread then odds are that this is the checkpointing
 * thread, in which case this function will return immediately allowing
 * it to prepare the MPI library before signaling to the debugger that
 * it is safe to attach, if necessary.
 */
MPIR_checkpoint_debugger_waitpoint();
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: MPIR_checkpoint_debugger_crs_hook: Finished...");
}
#endif

Просмотреть файл

@ -1,28 +0,0 @@
-*- text -*-
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for the Open PAL CRS BLCR component.
#
[blcr:restart_failed_exec]
Error: BLCR was not able to restart the process because exec failed.
Check the installation of BLCR on all of the machines in your
system. The following information may be of help:
Return Code : %d
BLCR Restart Command : %s
Restart Command Line : %s

Просмотреть файл

Просмотреть файл

@ -1,51 +0,0 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
#
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Build this component with the CRIU-specific flags substituted by configure
# (crs_criu_CFLAGS / crs_criu_CPPFLAGS).
CFLAGS = $(crs_criu_CFLAGS)
AM_CPPFLAGS = $(crs_criu_CPPFLAGS)
# Files that make up the component; shared by the DSO and static targets below.
sources = \
crs_criu.h \
crs_criu_component.c \
crs_criu_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_opal_crs_criu_DSO
component_noinst =
component_install = mca_crs_criu.la
else
component_noinst = libmca_crs_criu.la
component_install =
endif
# DSO flavour: installed into $(opallibdir) and linked against open-pal
# plus the CRIU libraries.
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_criu_la_SOURCES = $(sources)
mca_crs_criu_la_LDFLAGS = -module -avoid-version $(crs_criu_LDFLAGS)
mca_crs_criu_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
$(crs_criu_LIBS)
# Static flavour: built as a non-installed libtool library.
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_criu_la_SOURCES = $(sources)
libmca_crs_criu_la_LDFLAGS = -module -avoid-version $(crs_criu_LDFLAGS)
libmca_crs_criu_la_LIBADD = $(crs_criu_LIBS)

Просмотреть файл

@ -1,93 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2010 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
# Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_opal_crs_criu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decide whether the CRIU CRS component can be built.  The component is
# only considered when FT checkpoint/restart support is enabled and CRIU
# was explicitly requested with --with-criu or --with-criu=DIR.
AC_DEFUN([MCA_opal_crs_criu_CONFIG],[
    OPAL_VAR_SCOPE_PUSH([check_crs_criu_good check_crs_criu_dir_msg check_crs_criu_libdir_msg check_crs_criu_dir check_crs_criu_libdir])

    AC_CONFIG_FILES([opal/mca/crs/criu/Makefile])

    AC_ARG_WITH([criu],
                [AC_HELP_STRING([--with-criu(=DIR)],
                                [Path to CRIU Installation])])
    OPAL_CHECK_WITHDIR([criu], [$with_criu], [include/criu/criu.h])

    AC_ARG_WITH([criu-libdir],
                [AC_HELP_STRING([--with-criu-libdir=DIR],
                                [Search for CRIU libraries in DIR])])
    OPAL_CHECK_WITHDIR([criu-libdir], [$with_criu_libdir], [libcriu.*])

    # If we do not want FT or CRIU, don't compile this component.
    # Any value other than empty or "no" counts as a request, so that
    # --with-criu=DIR is accepted as well as a plain --with-criu.
    AS_IF([test "$opal_want_ft_cr" = "1" && test ! -z "$with_criu" && test "$with_criu" != "no"],
          [check_crs_criu_good=yes],
          [check_crs_criu_good=no])

    # Defaults
    check_crs_criu_dir_msg="compiler default"
    check_crs_criu_libdir_msg="linker default"
    check_crs_criu_dir=""
    check_crs_criu_libdir=""

    # Determine the search paths for the headers and libraries
    AS_IF([test $check_crs_criu_good = yes],
          [AS_IF([test ! -z "$with_criu" && test "$with_criu" != "yes"],
                 [check_crs_criu_dir="$with_criu"
                  check_crs_criu_dir_msg="$with_criu (from --with-criu)"])
           AS_IF([test ! -z "$with_criu_libdir" && test "$with_criu_libdir" != "yes"],
                 [check_crs_criu_libdir="$with_criu_libdir"
                  check_crs_criu_libdir_msg="$with_criu_libdir (from --with-criu-libdir)"])
          ])

    # Look for the CRIU header and library
    AS_IF([test $check_crs_criu_good = yes],
          [AC_MSG_CHECKING([for CRIU dir])
           AC_MSG_RESULT([$check_crs_criu_dir_msg])
           AC_MSG_CHECKING([for CRIU library dir])
           AC_MSG_RESULT([$check_crs_criu_libdir_msg])
           OPAL_CHECK_PACKAGE([crs_criu_check],
                              [criu/criu.h],
                              [criu],
                              [criu_init_opts],
                              [],
                              [$check_crs_criu_dir],
                              [$check_crs_criu_libdir],
                              [check_crs_criu_good="yes"],
                              [check_crs_criu_good="no"])
          ])

    crs_criu_CFLAGS="$CFLAGS $crs_criu_check_CFLAGS"
    crs_criu_CPPFLAGS="$CPPFLAGS $crs_criu_check_CPPFLAGS"
    crs_criu_LDFLAGS="$LDFLAGS $crs_criu_check_LDFLAGS"
    crs_criu_LIBS="$LIBS $crs_criu_check_LIBS"

    # Publish the flags on success; abort if CRIU was requested but is unusable
    AS_IF([test $check_crs_criu_good = yes],
          [ AC_SUBST([crs_criu_CFLAGS])
            AC_SUBST([crs_criu_CPPFLAGS])
            AC_SUBST([crs_criu_LDFLAGS])
            AC_SUBST([crs_criu_LIBS])
            $1],
          [AS_IF([test ! -z "$with_criu" && test "$with_criu" != "no"],
                 [AC_MSG_WARN([CRIU support requested but not found.  Perhaps you need to enable FT support, or specify the location of the CRIU libraries...?])
                  AC_MSG_ERROR([Aborting.])])
           $2])

    OPAL_VAR_SCOPE_POP
])dnl

Просмотреть файл

@ -1,88 +0,0 @@
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* CRIU CRS component - support checkpoint/restart using CRIU
*/
#ifndef MCA_CRS_CRIU_EXPORT_H
#define MCA_CRS_CRIU_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/base/base.h"
#include <criu/criu.h>
BEGIN_C_DECLS
#define LOG_FILE ("criu.log")
/* Local Component structures */
/*
 * CRIU CRS component: the base CRS component followed by the CRIU
 * options that the component keeps as configuration.
 */
struct opal_crs_criu_component_t {
/* Base CRS component */
opal_crs_base_component_t super;
/* criu log file */
char *log_file;
/* criu log level */
int log_level;
/* criu tcp established */
bool tcp_established;
/* criu shell job */
bool shell_job;
/* criu external unix sockets */
bool ext_unix_sk;
/* criu leave tasks in running state after checkpoint */
bool leave_running;
};
typedef struct opal_crs_criu_component_t opal_crs_criu_component_t;
/* The single component instance exported by this component */
OPAL_MODULE_DECLSPEC extern opal_crs_criu_component_t mca_crs_criu_component;
/*
 * Component query: reports the component priority and hands back its module
 */
int opal_crs_criu_component_query(mca_base_module_t **module, int *priority);
/*
 * Module functions
 */
int opal_crs_criu_module_init(void);
int opal_crs_criu_module_finalize(void);
/* Checkpoint process pid into snapshot; the outcome is reported via state */
int opal_crs_criu_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot,
opal_crs_base_ckpt_options_t *options,
opal_crs_state_type_t *state);
/* Restart from a snapshot */
int opal_crs_criu_restart(opal_crs_base_snapshot_t *snapshot,
bool spawn_child, pid_t *child_pid);
/* Disable / enable checkpointing */
int opal_crs_criu_disable_checkpoint(void);
int opal_crs_criu_enable_checkpoint(void);
/* Pre-launch hook */
int opal_crs_criu_prelaunch(int32_t rank, char *base_snapshot_dir, char **app,
char **cwd, char ***argv, char ***env);
/* Thread registration hook */
int opal_crs_criu_reg_thread(void);
END_C_DECLS
#endif /* MCA_CRS_CRIU_EXPORT_H */

Просмотреть файл

@ -1,213 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/util/output.h"
#include "opal/constants.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_criu.h"
/* Local functionality */
static int crs_criu_register(void);
static int crs_criu_open(void);
static int crs_criu_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
opal_crs_criu_component_t mca_crs_criu_component = {
    /* Base CRS component part */
    .super = {
        /* General mca_component_t information describing the
         * component itself
         */
        .base_version = {
            OPAL_CRS_BASE_VERSION_2_0_0,

            /* Name and version of the component */
            .mca_component_name = "criu",
            MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
                                  OPAL_RELEASE_VERSION),

            /* Entry points: open, close, query, parameter registration */
            .mca_open_component = crs_criu_open,
            .mca_close_component = crs_criu_close,
            .mca_query_component = opal_crs_criu_component_query,
            .mca_register_component_params = crs_criu_register,
        },
        .base_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        .verbose = 0,
        .output_handle = -1,
    },

    /* CRIU-specific defaults */
    .log_file        = LOG_FILE,  /* criu log file */
    .log_level       = 0,         /* criu log level */
    .tcp_established = true,      /* criu tcp established */
    .shell_job       = true,      /* criu shell job */
    .ext_unix_sk     = true,      /* criu external unix sockets */
    .leave_running   = true       /* leave tasks running after checkpoint */
};
/**
 * Register the MCA variables of the CRIU CRS component.
 *
 * Sets the default priority (10) and verbosity (0), then registers
 * priority, verbose, log, log_level, tcp_established, shell_job,
 * ext_unix_sk and leave_running.  Registration stops at the first failure.
 *
 * @return OPAL_SUCCESS on success, otherwise the negative value returned by
 *         mca_base_component_var_register().
 */
static int crs_criu_register(void)
{
    int ret;
    mca_base_component_t *component = &mca_crs_criu_component.super.base_version;

    mca_crs_criu_component.super.priority = 10;
    ret = mca_base_component_var_register(component, "priority",
                                          "Priority of the CRS criu component (default: 10)",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                          &mca_crs_criu_component.super.priority);
    if (0 > ret) {
        return ret;
    }

    mca_crs_criu_component.super.verbose = 0;
    ret = mca_base_component_var_register(component, "verbose",
                                          "Verbose level for the CRS criu component",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.super.verbose);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_component_var_register(component, "log", "Name of CRIU logfile (default: criu.log)",
                                          MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.log_file);
    if (0 > ret) {
        return ret;
    }

    /* log_level is CRIU's own log level, distinct from the component's
     * "verbose" variable registered above, so describe it as such. */
    ret = mca_base_component_var_register(component, "log_level",
                                          "Log level passed to CRIU (default: 0)",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.log_level);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_component_var_register(component, "tcp_established",
                                          "Checkpoint/restore established TCP connections (default: true)",
                                          MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.tcp_established);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_component_var_register(component, "shell_job",
                                          "Allow to dump and restore shell jobs (default: true)",
                                          MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.shell_job);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_component_var_register(component, "ext_unix_sk",
                                          "Allow external unix connections (default: true)",
                                          MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.ext_unix_sk);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_component_var_register(component, "leave_running",
                                          "Leave tasks in running state after checkpoint (default: true)",
                                          MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                          &mca_crs_criu_component.leave_running);

    return (0 > ret) ? ret : OPAL_SUCCESS;
}
static int crs_criu_open(void)
{
int oh;
/* If there is a custom verbose level for this component than use it
* otherwise take our parents level and output channel
*/
if (0 != mca_crs_criu_component.super.verbose) {
mca_crs_criu_component.super.output_handle = opal_output_open(NULL);
opal_output_set_verbosity(mca_crs_criu_component.super.output_handle,
mca_crs_criu_component.super.verbose);
} else {
mca_crs_criu_component.super.output_handle = opal_crs_base_framework.framework_output;
}
oh = mca_crs_criu_component.super.output_handle;
/*
* Debug output
*/
opal_output_verbose(10, oh, "crs:criu: open()");
opal_output_verbose(20, oh, "crs:criu: open: priority = %d",
mca_crs_criu_component.super.priority);
opal_output_verbose(20, oh, "crs:criu: open: verbosity = %d",
mca_crs_criu_component.super.verbose);
opal_output_verbose(20, oh, "crs:criu: open: log_file = %s",
mca_crs_criu_component.log_file);
opal_output_verbose(20, oh, "crs:criu: open: log_level = %d",
mca_crs_criu_component.log_level);
opal_output_verbose(20, oh, "crs:criu: open: tcp_established = %d",
mca_crs_criu_component.tcp_established);
opal_output_verbose(20, oh, "crs:criu: open: shell_job = %d",
mca_crs_criu_component.shell_job);
opal_output_verbose(20, oh, "crs:criu: open: ext_unix_sk = %d",
mca_crs_criu_component.ext_unix_sk);
opal_output_verbose(20, oh, "crs:criu: open: leave_running = %d",
mca_crs_criu_component.leave_running);
return OPAL_SUCCESS;
}
/**
 * Close the CRIU CRS component; only emits a verbose trace.
 *
 * @return OPAL_SUCCESS always.
 */
static int crs_criu_close(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: close()");

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,261 +0,0 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_criu.h"
/* CRIU module */
/*
 * Table of CRS entry points implemented by the CRIU component.  Its address
 * is what opal_crs_criu_component_query() returns as the module.
 */
static opal_crs_base_module_t criu_module = {
/* Initialization Function */
opal_crs_criu_module_init,
/* Finalization Function */
opal_crs_criu_module_finalize,
/* Checkpoint interface */
opal_crs_criu_checkpoint,
/* Restart interface */
opal_crs_criu_restart,
/* Disable checkpoints */
opal_crs_criu_disable_checkpoint,
/* Enable checkpoints */
opal_crs_criu_enable_checkpoint,
/* Prelaunch */
opal_crs_criu_prelaunch,
/* Register Thread */
opal_crs_criu_reg_thread
};
/* Snapshot Class Functions */
OBJ_CLASS_DECLARATION(opal_crs_criu_snapshot_t);
/* CRIU snapshot object: holds nothing beyond the base CRS snapshot */
struct opal_crs_criu_snapshot_t {
/* Base CRS snapshot type */
opal_crs_base_snapshot_t super;
};
typedef struct opal_crs_criu_snapshot_t opal_crs_criu_snapshot_t;
/* Constructor and destructor passed to OBJ_CLASS_INSTANCE below */
void opal_crs_criu_construct(opal_crs_criu_snapshot_t *obj);
void opal_crs_criu_destruct(opal_crs_criu_snapshot_t *obj);
/* Instantiate the class with opal_crs_base_snapshot_t as its parent */
OBJ_CLASS_INSTANCE(opal_crs_criu_snapshot_t,
opal_crs_base_snapshot_t,
opal_crs_criu_construct,
opal_crs_criu_destruct);
/**
 * Snapshot constructor: stores a copy of this component's name in the
 * snapshot's component_name field.
 */
void opal_crs_criu_construct(opal_crs_criu_snapshot_t *snapshot)
{
    const char *name = mca_crs_criu_component.super.base_version.mca_component_name;

    snapshot->super.component_name = strdup(name);
}
/*
 * Snapshot destructor: intentionally empty, nothing is released here.
 * NOTE(review): component_name is strdup()ed by the constructor; presumably
 * the parent class releases it - confirm.
 */
void opal_crs_criu_destruct(opal_crs_criu_snapshot_t *snapshot)
{
}
/**
 * Component query.
 *
 * @param module    Receives the address of the CRIU module table.
 * @param priority  Receives the component's configured priority.
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_component_query(mca_base_module_t **module, int *priority)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: component_query()");

    *module = (mca_base_module_t *)&criu_module;
    *priority = mca_crs_criu_component.super.priority;

    return OPAL_SUCCESS;
}
/**
 * Module initialization; only emits a verbose trace.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_module_init(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: module_init()");

    return OPAL_SUCCESS;
}
/**
 * Module finalization; only emits a verbose trace.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_module_finalize(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: module_finalize()");

    return OPAL_SUCCESS;
}
/**
 * Report a libcriu failure on output stream 0.
 *
 * @param ret  Negative errno-style value returned by a libcriu call.
 * @param pid  Process the failed request referred to; printed in the message.
 */
static void criu_error(int ret, pid_t pid)
{
    /* pid_t is cast because its width is not guaranteed to match %d */
    const int p = (int) pid;

    switch (ret) {
    case -EBADE:
        opal_output(0, "crs:criu:(PID:%d):RPC has returned fail", p);
        break;
    case -ECONNREFUSED:
        opal_output(0, "crs:criu:(PID:%d):Unable to connect to CRIU", p);
        break;
    case -ECOMM:
        opal_output(0, "crs:criu:(PID:%d):Unable to send/recv msg to/from CRIU", p);
        break;
    case -EINVAL:
        /* The two literals are concatenated, hence the leading space */
        opal_output(0, "crs:criu:(PID:%d):CRIU doesn't support this type of request."
                    " You should probably update CRIU", p);
        break;
    case -EBADMSG:
        opal_output(0, "crs:criu:(PID:%d):Unexpected response from CRIU."
                    " You should probably update CRIU", p);
        break;
    default:
        opal_output(0, "crs:criu:(PID:%d):Unknown error type code."
                    " You should probably update CRIU", p);
        break;
    }
}
/**
 * Checkpoint a process with CRIU.
 *
 * Appends the component name to the snapshot metadata file, initialises the
 * libcriu options, opens the snapshot directory as the image directory,
 * applies the component's CRIU settings and calls criu_dump().
 *
 * @param pid            Process to dump.
 * @param base_snapshot  Snapshot describing the metadata file and the
 *                       snapshot directory.
 * @param options        Not used by this function.
 * @param state          Set to OPAL_CRS_CONTINUE on success and to
 *                       OPAL_CRS_ERROR on any failure.
 *
 * @return OPAL_SUCCESS on success, OPAL_ERROR otherwise.
 */
int opal_crs_criu_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
                             opal_crs_base_ckpt_options_t *options,
                             opal_crs_state_type_t *state)
{
    int ret;
    /* -1 means "not opened"; 0 is a valid descriptor and must be closed too */
    int fd = -1;
    int oh = mca_crs_criu_component.super.output_handle;
    opal_crs_criu_snapshot_t *snapshot = NULL;
    char *dest = NULL;

    opal_output_verbose(10, oh, "crs:criu: checkpoint(%d, ---)", pid);

    snapshot = (opal_crs_criu_snapshot_t *)base_snapshot;
    /* NOTE(review): the snapshot constructor in this file already stores a
     * strdup()ed name; the previous value is overwritten here without being
     * released - confirm ownership before adding a free(). */
    snapshot->super.component_name = strdup(mca_crs_criu_component.super.base_version.mca_component_name);

    if (NULL == snapshot->super.metadata) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a"))) {
            opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            *state = OPAL_CRS_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);

    fclose(snapshot->super.metadata);
    snapshot->super.metadata = NULL;

    ret = criu_init_opts();

    if (ret < 0) {
        criu_error(ret, pid);
        *state = OPAL_CRS_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, oh, "crs:criu: criu_init_opts() returned %d", ret);

    dest = snapshot->super.snapshot_directory;
    opal_output_verbose(10, oh, "crs:criu: opening snapshot directory %s", dest);
    fd = open(dest, O_DIRECTORY);

    if (fd < 0) {
        *state = OPAL_CRS_ERROR;
        opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open checkpoint "
                    "directory (%s) for pid (%d)", dest, pid);
        goto cleanup;
    }

    /* http://criu.org/C_API */
    criu_set_images_dir_fd(fd);
    criu_set_pid(pid);

    criu_set_log_file(mca_crs_criu_component.log_file);
    criu_set_log_level(mca_crs_criu_component.log_level);
    criu_set_tcp_established(mca_crs_criu_component.tcp_established);
    criu_set_shell_job(mca_crs_criu_component.shell_job);
    criu_set_ext_unix_sk(mca_crs_criu_component.ext_unix_sk);
    criu_set_leave_running(mca_crs_criu_component.leave_running);

    ret = criu_dump();

    if (ret < 0) {
        criu_error(ret, pid);
        *state = OPAL_CRS_ERROR;
        goto cleanup;
    }

    *state = OPAL_CRS_CONTINUE;

 cleanup:

    if (fd >= 0) {
        close(fd);
    }

    if (OPAL_CRS_ERROR == *state) {
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/**
 * Restart entry point.
 *
 * Placeholder: only traces its own name; none of the arguments are used and
 * child_pid is left untouched.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_restart(opal_crs_base_snapshot_t *snapshot,
                          bool spawn_child, pid_t *child_pid)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}
/**
 * Disable-checkpoint entry point.  Placeholder: only traces its own name.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_disable_checkpoint(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}
/**
 * Enable-checkpoint entry point.  Placeholder: only traces its own name.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_enable_checkpoint(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}
/**
 * Pre-launch entry point.
 *
 * Placeholder: only traces its own name; app, cwd, argv and env are not
 * modified.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_prelaunch(int32_t rank, char *base_snapshot_dir,
                            char **app, char **cwd, char ***argv,
                            char ***env)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}
/**
 * Thread-registration entry point.  Placeholder: only traces its own name.
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_criu_reg_thread(void)
{
    const int handle = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: CISCO
status: maintenance

Просмотреть файл

Просмотреть файл

@ -1,43 +0,0 @@
#
# Copyright (c) 2010 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Build this component with the DMTCP-specific flags substituted by configure
# (crs_dmtcp_CFLAGS / crs_dmtcp_CPPFLAGS).
CFLAGS = $(crs_dmtcp_CFLAGS)
AM_CPPFLAGS = $(crs_dmtcp_CPPFLAGS)
# Files that make up the component; shared by the DSO and static targets below.
sources = \
crs_dmtcp.h \
crs_dmtcp_component.c \
crs_dmtcp_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_opal_crs_dmtcp_DSO
component_noinst =
component_install = mca_crs_dmtcp.la
else
component_noinst = libmca_crs_dmtcp.la
component_install =
endif
# DSO flavour: installed into $(opallibdir) and linked against open-pal
# plus the DMTCP libraries.
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_dmtcp_la_SOURCES = $(sources)
mca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
mca_crs_dmtcp_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
$(crs_dmtcp_LIBS)
# Static flavour: built as a non-installed libtool library.
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_dmtcp_la_SOURCES = $(sources)
libmca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
libmca_crs_dmtcp_la_LIBADD = $(crs_dmtcp_LIBS)

Просмотреть файл

@ -1,140 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_opal_crs_dmtcp_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_opal_crs_dmtcp_CONFIG],[
    AC_CONFIG_FILES([opal/mca/crs/dmtcp/Makefile])

    OPAL_VAR_SCOPE_PUSH([opal_check_crs_dmtcp_good opal_opal_check_crs_dmtcp_save_CPPFLAGS opal_opal_check_crs_dmtcp_save_LDFLAGS opal_opal_check_crs_dmtcp_save_LIBS opal_check_crs_dmtcp_dir_msg opal_check_crs_dmtcp_libdir_msg opal_check_crs_dmtcp_dir opal_check_crs_dmtcp_libdir])

    opal_check_crs_dmtcp_good="no"

    # Configure option to specify where to look for DMTCP headers
    #   --with-dmtcp(=DIR)
    AC_ARG_WITH([dmtcp],
                [AC_HELP_STRING([--with-dmtcp(=DIR)],
                                [Path to DMTCP Installation])])
    OPAL_CHECK_WITHDIR([dmtcp], [$with_dmtcp], [include/mtcp.h])

    # Configure option to specify where to look for DMTCP libraries
    # (Default: $with_dmtcp/lib)
    #   --with-dmtcp-libdir=DIR
    AC_ARG_WITH([dmtcp-libdir],
                [AC_HELP_STRING([--with-dmtcp-libdir=DIR],
                                [Search for DMTCP libraries in DIR])])
    OPAL_CHECK_WITHDIR([dmtcp-libdir], [$with_dmtcp_libdir], [libmtcp.so])

    #
    # Check if Open MPI was compiled with Checkpoint/Restart support
    # If not, then we do not compile this component
    #
    AS_IF([test "$opal_want_ft" = "0"],
          [opal_check_crs_dmtcp_good="no"],
          [opal_check_crs_dmtcp_good="yes"])

    #
    # Check if the user explicitly requested -not- to build the DMTCP component
    # If so, then we do not compile this component
    #
    AS_IF([test "$with_dmtcp" = "no" || test "$opal_check_crs_dmtcp_good" = "no"],
          [opal_check_crs_dmtcp_good="no"],
          [opal_check_crs_dmtcp_good="yes"])

    # Save some flags
    opal_opal_check_crs_dmtcp_save_CPPFLAGS=$CPPFLAGS
    opal_opal_check_crs_dmtcp_save_LDFLAGS=$LDFLAGS
    opal_opal_check_crs_dmtcp_save_LIBS=$LIBS

    #
    # Now to check if the library is usable
    #
    opal_check_crs_dmtcp_dir_msg="compiler default"
    opal_check_crs_dmtcp_libdir_msg="linker default"
    opal_check_crs_dmtcp_dir=""
    opal_check_crs_dmtcp_libdir=""

    # Determine the search paths for the headers and libraries
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "yes"],
                 [opal_check_crs_dmtcp_dir="$with_dmtcp"
                  opal_check_crs_dmtcp_dir_msg="$with_dmtcp (from --with-dmtcp)"])
           AS_IF([test ! -z "$with_dmtcp_libdir" && test "$with_dmtcp_libdir" != "yes"],
                 [opal_check_crs_dmtcp_libdir="$with_dmtcp_libdir"
                  opal_check_crs_dmtcp_libdir_msg="$with_dmtcp_libdir (from --with-dmtcp-libdir)"])
          ])

    # Look for DMTCP.
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [AC_MSG_CHECKING([for DMTCP dir])
           AC_MSG_RESULT([$opal_check_crs_dmtcp_dir_msg])
           AC_MSG_CHECKING([for DMTCP library dir])
           AC_MSG_RESULT([$opal_check_crs_dmtcp_libdir_msg])
           OPAL_CHECK_PACKAGE([crs_dmtcp_check],
                              [mtcp.h],
                              [mtcp],
                              [mtcp_init],
                              [],
                              [$opal_check_crs_dmtcp_dir],
                              [$opal_check_crs_dmtcp_libdir],
                              [opal_check_crs_dmtcp_good="yes"],
                              [opal_check_crs_dmtcp_good="no"])
          ])

    # When we restart a thread, we use execlp() to exec the "mtcp_restart"
    # command.  We don't care what its path is, but it does need to exist in
    # the PATH.
    AC_CHECK_PROG([mtcp_restart_command_exists], ["mtcp_restart"], ["yes"], ["no"])
    AS_IF([test "$mtcp_restart_command_exists" = "no"],
          [opal_check_crs_dmtcp_good="no"
           AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "no"],
                 [AC_MSG_WARN([mtcp_restart not found in PATH.])
                  AC_MSG_ERROR([Aborting.])])])

    #
    # If '-lmtcp' or
    # '-I' or '-L' was needed to link to MTCP, then OPAL_CHECK_PACKAGE
    # sets the crs_dmtcp_check_* variables, which we use below.
    #
    crs_dmtcp_CFLAGS="$CFLAGS $crs_dmtcp_check_CFLAGS"
    crs_dmtcp_CPPFLAGS="$CPPFLAGS $crs_dmtcp_check_CPPFLAGS"
    crs_dmtcp_LDFLAGS="$LDFLAGS $crs_dmtcp_check_LDFLAGS"
    crs_dmtcp_LIBS="$crs_dmtcp_check_LIBS $LIBS"

    # The action-if-found argument is expanded exactly once, in the final
    # decision at the bottom of this macro.

    # NOTE(review): LDFLAGS and LIBS are not restored to the saved values
    # alone; the DMTCP additions are kept in front of them.  Confirm that
    # this is intended.
    CPPFLAGS=$opal_opal_check_crs_dmtcp_save_CPPFLAGS
    LDFLAGS="$crs_dmtcp_check_LDFLAGS $opal_opal_check_crs_dmtcp_save_LDFLAGS"
    LIBS="$crs_dmtcp_LIBS $opal_opal_check_crs_dmtcp_save_LIBS"

    AC_SUBST([crs_dmtcp_CFLAGS])
    AC_SUBST([crs_dmtcp_CPPFLAGS])
    AC_SUBST([crs_dmtcp_LDFLAGS])
    AC_SUBST([crs_dmtcp_LIBS])

    # If all is good at this point then post any compiler options to
    # the build environment.  If all is not good at this point and
    # DMTCP was explicitly requested, then error out.
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [$1],
          [AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "no"],
                 [AC_MSG_WARN([DMTCP support requested but not found.  Perhaps you need to specify the location of the DMTCP libraries.])
                  AC_MSG_ERROR([Aborting.])])
           $2])

    OPAL_VAR_SCOPE_POP
])dnl

Просмотреть файл

@ -1,87 +0,0 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* DMTCP CRS component
*
*/
#ifndef MCA_CRS_DMTCP_EXPORT_H
#define MCA_CRS_DMTCP_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/base/base.h"
/* JJH NOTE: Include your library header here */
/* #include <libmtcp.h> */
#include <mtcp.h>
BEGIN_C_DECLS
/*
* Local Component Structure
*
* DMTCP-specific component type. It currently holds nothing beyond the
* embedded base CRS component.
*/
struct opal_crs_dmtcp_component_t {
/** Base CRS component */
opal_crs_base_component_t super;
/** JJH: Add additional items here as needed internally */
};
typedef struct opal_crs_dmtcp_component_t opal_crs_dmtcp_component_t;
/* The component instance exported by this component. */
OPAL_MODULE_DECLSPEC extern opal_crs_dmtcp_component_t mca_crs_dmtcp_component;
/*
* Component query command
* - Called during opal_init() to determine if this component should be selected.
* - Stores the component priority in *priority and the module in *module.
*/
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority);
/*
* Module functions
*/
int opal_crs_dmtcp_module_init(void);
int opal_crs_dmtcp_module_finalize(void);
/*
* Actual CRS functionality
*/
/* Checkpoint the calling process; the resulting CRS state is stored in *state. */
int opal_crs_dmtcp_checkpoint( pid_t pid,
opal_crs_base_snapshot_t *snapshot,
opal_crs_base_ckpt_options_t *options,
opal_crs_state_type_t *state);
/* Restart from a snapshot, optionally forking a child to do so. */
int opal_crs_dmtcp_restart( opal_crs_base_snapshot_t *snapshot,
bool spawn_child,
pid_t *child_pid);
/* Enter / leave a section in which checkpoints are not allowed. */
int opal_crs_dmtcp_disable_checkpoint(void);
int opal_crs_dmtcp_enable_checkpoint(void);
/* Adjust the launch environment of an application before it is started. */
int opal_crs_dmtcp_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
/* Per-thread registration hook. */
int opal_crs_dmtcp_reg_thread(void);
END_C_DECLS
#endif /* MCA_CRS_DMTCP_EXPORT_H */

Просмотреть файл

@ -1,133 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/util/output.h"
#include "opal/constants.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_dmtcp.h"
/*
* Local functionality
*
* Component hooks referenced from the mca_crs_dmtcp_component initializer
* below: variable registration, open and close.
*/
static int crs_dmtcp_register (void);
static int crs_dmtcp_open(void);
static int crs_dmtcp_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
opal_crs_dmtcp_component_t mca_crs_dmtcp_component = {
/* First do the base component stuff (the embedded "super" member) */
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
.base_version = {
OPAL_CRS_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "dmtcp",
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = crs_dmtcp_open,
.mca_close_component = crs_dmtcp_close,
.mca_query_component = opal_crs_dmtcp_component_query,
.mca_register_component_params = crs_dmtcp_register,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity off by default; crs_dmtcp_register() resets it before registering the variable */
.verbose = 0,
/* No output stream yet; crs_dmtcp_open() assigns the handle */
.output_handle = -1
}
};
static int crs_dmtcp_register (void)
{
int ret;
/*
* User can adjust the relative priority of this component with respect
* to other CRS components available for selection.
*/
mca_crs_dmtcp_component.super.priority = 20
ret = mca_base_component_var_register (&mca_crs_dmtcp_component.super.base_version,
"priority", "Priority of the CRS dmtcp component "
"(default: 20)", MCA_BASE_VAR_TYPE_INT, NULL,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_crs_dmtcp_component.super.priority);
if (0 > ret) {
return ret;
}
/*
* Adjust the verbosity level for this component. Default off or 0.
*/
mca_crs_dmtcp_component.super.verbose = 0;
ret = mca_base_component_var_register (&mca_crs_dmtcp_component.super.base_version,
"verbose",
"Verbose level for the CRS dmtcp component",
MCA_BASE_VAR_TYPE_INT, NULL,MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_crs_dmtcp_component.super.verbose);
return (0 > ret) ? ret : OPAL_SUCCESS;
}
/*
 * Component open hook: choose the output stream and emit debug output.
 *
 * Always returns OPAL_SUCCESS.
 */
static int crs_dmtcp_open(void)
{
    if (0 == mca_crs_dmtcp_component.super.verbose) {
        /* No custom verbosity requested: share the framework's stream. */
        mca_crs_dmtcp_component.super.output_handle =
            opal_crs_base_framework.framework_output;
    } else {
        /* A custom verbosity was requested: open a private stream for it. */
        mca_crs_dmtcp_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_crs_dmtcp_component.super.output_handle,
                                  mca_crs_dmtcp_component.super.verbose);
    }

    /* Report the effective settings at debug verbosity. */
    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: open()");
    opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: open: priority = %d",
                        mca_crs_dmtcp_component.super.priority);
    opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: open: verbosity = %d",
                        mca_crs_dmtcp_component.super.verbose);

    return OPAL_SUCCESS;
}
/*
 * Component close hook: only logs that it was called.
 *
 * Always returns OPAL_SUCCESS.
 */
static int crs_dmtcp_close(void)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: close()");

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -1,709 +0,0 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/mca/event/event.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "crs_dmtcp.h"
#define MTCP_RESTART_COMMAND "mtcp_restart"
/*
* DMTCP module
*
* Function table handed out by opal_crs_dmtcp_component_query(). The
* initializers are positional, so their order must not be changed.
*/
static opal_crs_base_module_t dmtcp_module = {
/** Initialization Function */
opal_crs_dmtcp_module_init,
/** Finalization Function */
opal_crs_dmtcp_module_finalize,
/** Checkpoint interface */
opal_crs_dmtcp_checkpoint,
/** Restart Command Access */
opal_crs_dmtcp_restart,
/** Disable checkpoints */
opal_crs_dmtcp_disable_checkpoint,
/** Enable checkpoints */
opal_crs_dmtcp_enable_checkpoint,
/** Prelaunch */
opal_crs_dmtcp_prelaunch,
/** Register Thread */
opal_crs_dmtcp_reg_thread
};
/***************************
* Snapshot Class Functions
***************************/
OBJ_CLASS_DECLARATION(opal_crs_dmtcp_snapshot_t);
/* DMTCP snapshot: the base CRS snapshot plus the checkpoint image filename. */
struct opal_crs_dmtcp_snapshot_t {
/** Base CRS snapshot type */
opal_crs_base_snapshot_t super;
/** Heap-allocated name of the context (checkpoint image) file; freed by the destructor */
char * context_filename;
};
typedef struct opal_crs_dmtcp_snapshot_t opal_crs_dmtcp_snapshot_t;
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *obj);
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *obj);
/* Class instance: derives from opal_crs_base_snapshot_t with the constructor/destructor above. */
OBJ_CLASS_INSTANCE(opal_crs_dmtcp_snapshot_t,
opal_crs_base_snapshot_t,
opal_crs_dmtcp_construct,
opal_crs_dmtcp_destruct);
/******************
* Local Functions
******************/
/* Restart helpers */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot);
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot);
/* Callbacks registered with MTCP through mtcp_set_callbacks() in module_init */
static void dmtcp_sleep_between_ckpt_callback(int interval);
static void dmtcp_pre_ckpt_callback(char **ckpt_filename);
static void dmtcp_post_ckpt_callback(int is_restarting,
char *mtcp_restore_argv_start_addr);
static int dmtcp_should_ckpt_fd_callback(int fd);
/*************************
* Local Global Variables
*************************/
/* Full path of the checkpoint file; allocated by dmtcp_generate_full_ckpt_path()
* and handed to MTCP by dmtcp_pre_ckpt_callback(). */
static char *full_ckpt_path = NULL;
/* Signalled by opal_crs_dmtcp_checkpoint() to request a checkpoint. */
static pthread_cond_t checkpoint_cond = PTHREAD_COND_INITIALIZER;
/* Signalled by dmtcp_sleep_between_ckpt_callback(); waited on by opal_crs_dmtcp_checkpoint(). */
static pthread_cond_t checkpoint_done_cond = PTHREAD_COND_INITIALIZER;
/* Mutex used with both condition variables above. */
static pthread_mutex_t checkpoint_mutex = PTHREAD_MUTEX_INITIALIZER;
/* CRS state recorded by dmtcp_post_ckpt_callback() and reported by opal_crs_dmtcp_checkpoint(). */
static int post_ckpt_state;
/*
 * Snapshot constructor: record this component's name in the base snapshot
 * and start without a context filename.
 */
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    snapshot->super.component_name =
        strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);
    snapshot->context_filename = NULL;
}
/*
 * Snapshot destructor: release the context filename owned by the snapshot.
 */
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no guard is required. */
    free(snapshot->context_filename);
    snapshot->context_filename = NULL;
}
/*****************
* MCA Functions
*****************/
/*
 * Component query: report the DMTCP module and its configured priority.
 *
 * @param module   receives a pointer to the DMTCP module function table
 * @param priority receives the component priority
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: component_query()");

    *module = (mca_base_module_t *)&dmtcp_module;
    *priority = mca_crs_dmtcp_component.super.priority;

    return OPAL_SUCCESS;
}
int opal_crs_dmtcp_module_init(void)
{
char *temp_checkpoint_name;
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: module_init()");
/*
* JJH NOTE: Call any initialization routines you require
*/
mtcp_set_callbacks(dmtcp_sleep_between_ckpt_callback, /* sleep_between_ckpt */
dmtcp_pre_ckpt_callback, /* pre_ckpt */
dmtcp_post_ckpt_callback, /* post_ckpt */
dmtcp_should_ckpt_fd_callback, /* ckpt_fd */
NULL); /* write_ckpt_header */
/* This serves to simply initialize MTCP. The checkpoint file will
* actually be set by our pre_ckpt callback (which takes it from the
* snapshot given to the CRS checkpoint function), and the interval will be
* ignored, substituted for a synchronization signal that is handled by our
* sleep_between_ckpt callback.
*/
asprintf(&temp_checkpoint_name, "checkpoint.dmtcp.%ld", syscall(SYS_getpid));
mtcp_init(temp_checkpoint_name, 0, 1);
mtcp_ok();
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
"crs:dmtcp: leaving module_init()");
free(temp_checkpoint_name);
return OPAL_SUCCESS;
}
/*
 * Module finalization: currently only logs that it was called.
 *
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_module_finalize(void)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: module_finalize()");

    /* JJH NOTE: Call any finalization routines you require */

    return OPAL_SUCCESS;
}
/*
 * Prelaunch hook: set the "opal_cr_is_tool" MCA variable to "0" in the
 * environment that is passed in.
 *
 * Only the env argument is used; the remaining arguments are ignored.
 *
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_prelaunch(int32_t rank,
                             char *base_snapshot_dir,
                             char **app,
                             char **cwd,
                             char ***argv,
                             char ***env)
{
    char *env_name = NULL;

    /* The below should be left untouched for now */
    (void) mca_base_var_env_name("opal_cr_is_tool", &env_name);
    opal_setenv(env_name, "0", true, env);
    free(env_name);

    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: leaving module_prelaunch()");

    return OPAL_SUCCESS;
}
/*
 * Thread registration hook: currently only logs that it was called.
 *
 * JJH NOTE: If all threads that may call into MTCP must explicitly register
 *           with MTCP, the necessary initialization belongs here.
 *
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_reg_thread(void)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: leaving module_reg_thread()");

    return OPAL_SUCCESS;
}
/*
 * Checkpoint the calling process through MTCP.
 *
 * The function writes the component name and the context filename to the
 * snapshot's metadata file, signals checkpoint_cond to request the
 * checkpoint and then waits on checkpoint_done_cond.  The resulting CRS
 * state is the value recorded in post_ckpt_state by the post_ckpt callback.
 *
 * @param pid            must be 0 or the pid of the calling process
 * @param base_snapshot  snapshot to fill in; treated as an
 *                       opal_crs_dmtcp_snapshot_t
 * @param options        unused
 * @param state          receives the CRS state after the checkpoint, or
 *                       OPAL_CRS_ERROR when pid names another process
 * @return OPAL_SUCCESS on success, OPAL_ERROR otherwise
 */
int opal_crs_dmtcp_checkpoint(pid_t pid,
                              opal_crs_base_snapshot_t *base_snapshot,
                              opal_crs_base_ckpt_options_t *options,
                              opal_crs_state_type_t *state)
{
    int unlock_retval, exit_status = OPAL_SUCCESS;
    char buf[BUFSIZ];
    opal_crs_dmtcp_snapshot_t *snapshot;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: about to lock mutex for checkpoint()");

    pthread_mutex_lock(&checkpoint_mutex);

    snapshot = (opal_crs_dmtcp_snapshot_t *) base_snapshot;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: checkpoint(%d, ---)", pid);

    /* Are we checkpointing ourselves or a peer.
     * JJH NOTE: This will only ever be called when pid == getpid()
     *           This is an old interface argument, that is no longer used.
     */
    /* bricka (2010-05-14): According to crs.h, 0 also indicates checkpointing
     *                      self.
     */
    if((pid != 0) && (pid != syscall(SYS_getpid)) ) {
        /* MTCP can only checkpoint a single process: we can only checkpoint
         * ourself. */
        *state = OPAL_CRS_ERROR;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* the metadata file should always be NULL at this point */
    if ( NULL != snapshot->super.metadata) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: checkpoint(): Error: Metadata file already open");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Update the snapshot metadata with the component name so opal-restart can
     * pick the correct CRS to restart with.
     */
    snapshot->super.component_name = strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);

    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    /* The filename of the checkpoint will be changed by our pre_ckpt hook
     * based on the options given to this function. */
    if(dmtcp_generate_full_ckpt_path(snapshot) == -1) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_checkpoint: unable to generate context filename.");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * JJH NOTE: You can write however much or little data you want to the
     *           metadata file. The metadata file is stored with the local
     *           checkpoint, and provided at restart time to help the
     *           CRS component determine how to restart from any files
     *           that is left in this directory during checkpoint.
     *           Use the command below to write key/value strings to the
     *           metadata file.
     *           (Just as we did above with the component name).
     */
    /* Failures to write the metadata are logged but do not abort the checkpoint. */
    if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name)) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_checkpoint: unable to print component name to metadata");
    }
    if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename)) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_checkpoint: unable to print context name to metadata");
    }

    fclose(snapshot->super.metadata );
    snapshot->super.metadata = NULL;

    /*
     * JJH NOTE: Setup and request a checkpoint of this process.
     */
    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_checkpoint: will checkpoint to file: %s",
                        full_ckpt_path);

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_checkpoint: about to signal checkpoint");

    /* Now that we have set the requested filename, we simply need to start
     * the checkpoint. */
    pthread_cond_signal(&checkpoint_cond);

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_checkpoint: signalled checkpoint");

    /* We want to wait for the checkpoint to finish before we continue (in
     * particular, we need the post_ckpt hook to happen so that we know the
     * status of the checkpoint)
     */
    /* NOTE(review): this wait has no predicate loop, so a spurious wakeup
     * would be taken for completion -- confirm whether that matters here. */
    pthread_cond_wait(&checkpoint_done_cond, &checkpoint_mutex);

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_checkpoint: received checkpoint_done signal");

    /* We have now been checkpointed.  Note that the state of the checkpoint
     * (OPAL_CRS_CONTINUE, etc.) has been recorded by the post_ckpt hook.
     */
    *state = post_ckpt_state;
    exit_status = OPAL_SUCCESS;

    /* Clear the global after freeing it so the pre_ckpt callback can never
     * hand a dangling pointer to MTCP. */
    free(full_ckpt_path);
    full_ckpt_path = NULL;

 cleanup:
    unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
    if( 0 != unlock_retval ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_checkpoint: unable to unlock mutex at end of checkpoint: %s",
                    strerror_r(unlock_retval, buf, BUFSIZ));
        exit_status = OPAL_ERROR;
    }

    if( NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata );
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
/*
 * Restart a process from a snapshot by exec'ing the MTCP restart command.
 *
 * @param base_snapshot  snapshot describing the checkpoint; copied by value
 *                       into a freshly created DMTCP snapshot object
 * @param spawn_child    when true, fork first: the child performs the exec
 *                       and the parent returns
 * @param child_pid      when spawn_child is true and the fork succeeds,
 *                       receives the child's pid in the parent (may be NULL)
 * @return OPAL_SUCCESS in the parent after a successful fork; OPAL_ERROR on
 *         any failure.  Does not return when the exec succeeds.
 */
int opal_crs_dmtcp_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    int ret, exit_status = OPAL_SUCCESS;
    int exec_status;
    /* NOTE(review): this object is never released on the paths that return;
     * because the base part is overwritten by a struct copy below, releasing
     * it here may not be safe -- confirm ownership before adding a release. */
    opal_crs_dmtcp_snapshot_t *snapshot = OBJ_NEW(opal_crs_dmtcp_snapshot_t);

    if (NULL == snapshot) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: Unable to allocate the snapshot object.");
        return OPAL_ERROR;
    }
    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: restart(--, %d)", spawn_child);

    /*
     * JJH NOTE: 'cold_start' indicates that this process is being restarted from
     *           opal-restart instead of from within an already running process.
     *           In the current code base, this is always set to true since it
     *           does not allow a process to request a restart of itself.
     */
    if(snapshot->super.cold_start) {
        /*
         * Read the metadata left by the checkpoint() of this process
         */
        if( OPAL_SUCCESS != (ret = dmtcp_cold_start(snapshot)) ) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to reconstruct the snapshot.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    /* JJH NOTE: Nearly all of the time the 'spawn_child' argument is set to
     *           'false' indicating that the restart function is expected to
     *           call exec() directly. It is only set to 'true' if the user
     *           explicitly tells opal-restart to spawn off the child, which
     *           rarely/never happens. So I would not worry about that option.
     */
    if( spawn_child ) {
        /* Use a distinct local name: shadowing the child_pid parameter would
         * keep the pid from ever reaching the caller. */
        pid_t forked_pid = fork();

        if(forked_pid < 0) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to spawn child.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if(forked_pid > 0) {
            /* Parent: report the child and return. */
            if (NULL != child_pid) {
                *child_pid = forked_pid;
            }
            goto cleanup;
        }
        /* Child: fall through and exec the restart command. */
    }

    /*
     * JJH NOTE: Restart the process by replacing this process
     */
    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: About to invoke command: %s with argv: %s %s",
                        MTCP_RESTART_COMMAND,
                        MTCP_RESTART_COMMAND,
                        snapshot->context_filename);

    /* The terminating argument of a variadic exec call must be a null
     * pointer of type char *. */
    exec_status = execlp(MTCP_RESTART_COMMAND, MTCP_RESTART_COMMAND,
                         snapshot->context_filename, (char *) NULL);

    /* If we get down here, something has broken. */
    if(exec_status < 0) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: error in replacing process: %s",
                    strerror(errno));
    } else {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: exec() returned!");
    }
    exit_status = OPAL_ERROR;

 cleanup:
    return exit_status;
}
/*
 * Disable checkpoints by calling mtcp_no().
 *
 * JJH NOTE: Enter a critical section. This is not really used in the code
 *           at the moment.
 *
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_disable_checkpoint(void)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: disable_checkpoint()");

    mtcp_no();

    return OPAL_SUCCESS;
}
/*
 * Enable checkpoints by calling mtcp_ok().
 *
 * JJH NOTE: Leave a critical section. This is not really used in the code
 *           at the moment.
 *
 * @return OPAL_SUCCESS always
 */
int opal_crs_dmtcp_enable_checkpoint(void)
{
    opal_output_verbose(10,
                        mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: enable_checkpoint()");

    mtcp_ok();

    return OPAL_SUCCESS;
}
/*****************************
* Local Function Definitions
*****************************/
/*
 * Rebuild a snapshot from the metadata file written at checkpoint time.
 *
 * Reads the component name and the context token from the metadata file,
 * verifies that the snapshot belongs to this component, stores the full
 * context path in snapshot->context_filename and clears the cold_start
 * flag.  The metadata file is closed before returning.
 *
 * @param snapshot snapshot to fill in
 * @return OPAL_SUCCESS on success; OPAL_ERROR, or the error returned by the
 *         metadata extraction, on failure
 */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char * component_name = NULL;
    int prev_pid;

    /*
     * Find the snapshot directory, read the metadata file for
     * component name and previous pid
     */
    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r")) ) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_cold_start(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* NOTE(review): component_name is not freed here; confirm who owns the
     * string returned by the extraction call. */
    snapshot->super.component_name = strdup(component_name);

    /*
     * Compare the component strings to make sure this is our snapshot before going further.
     * JJH NOTE: This will nearly always be true since opal-restart also checks this metadata.
     */
    if ( 0 != strncmp(mca_crs_dmtcp_component.super.base_version.mca_component_name,
                      component_name, strlen(component_name)) ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_dmtcp_component.super.base_version.mca_component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Read context information from the metadata file
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if( NULL == tmp_argv ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* On failure asprintf() leaves the target pointer undefined; reset it so
     * that nothing later uses or frees a garbage pointer. */
    if( 0 > asprintf(&(snapshot->context_filename), "%s/%s",
                     snapshot->super.snapshot_directory, tmp_argv[0]) ) {
        snapshot->context_filename = NULL;
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Unable to allocate the context filename");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: cold_start(%s)", snapshot->context_filename);

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    if( NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata);
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
/**
 * Build the context filename for a snapshot and the full checkpoint path.
 *
 * The context filename ("ompi_dmtcp_context.<pid>") is stored in the
 * snapshot; the full path (snapshot directory + context filename) is stored
 * in the file-scope full_ckpt_path.
 *
 * @param snapshot the snapshot with request information
 * @return -1 if the context filename cannot be allocated, otherwise the
 *         return value of the asprintf() call that builds the full path
 */
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot)
{
    if (-1 == asprintf(&(snapshot->context_filename),
                       "ompi_dmtcp_context.%ld", syscall(SYS_getpid))) {
        return -1;
    }

    return asprintf(&full_ckpt_path, "%s/%s",
                    snapshot->super.snapshot_directory,
                    snapshot->context_filename);
}
/**
 * This is a callback function to call the actual checkpointing routine.
 * Instead of waiting for a specific interval as MTCP does, we will wait on a
 * synchronization signal that will allow us to checkpoint on demand.
 *
 * The callback locks checkpoint_mutex, signals checkpoint_done_cond and then
 * waits on checkpoint_cond.  It returns with the mutex still locked.
 *
 * @param interval ignored
 */
static void dmtcp_sleep_between_ckpt_callback(int interval)
{
    int signal_retval;
    char buf[BUFSIZ];

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: called sleep_between_ckpt callback");

    pthread_mutex_lock(&checkpoint_mutex);

    /* If the MPI checkpoint thread is waiting on the checkpoint_done_cond and
     * this thread is here, it means that a checkpoint has just completed.
     * Let's signal the MPI checkpoint thread to resume. */
    signal_retval = pthread_cond_signal(&checkpoint_done_cond);
    if( 0 != signal_retval) {
        /* Name this function in the message so the log points at the call
         * that actually failed. */
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: sleep_between_ckpt_callback(): Unable to signal checkpoint done: %s",
                    strerror_r(signal_retval, buf, BUFSIZ));
    }

    /* now we simply wait for the signal to checkpoint */
    pthread_cond_wait(&checkpoint_cond, &checkpoint_mutex);

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: received sync signal to checkpoint.");

    /* We have now been instructed to checkpoint, so we return.  Note that the
     * mutex is still locked: the post_ckpt callback will unlock it. */
}
/**
 * Callback invoked before the checkpoint is taken.  It tells MTCP which file
 * to write by handing over the path stored in full_ckpt_path.
 *
 * @param ckpt_filename receives the desired checkpoint filename
 */
static void dmtcp_pre_ckpt_callback(char **ckpt_filename)
{
    /* The path is prepared by dmtcp_generate_full_ckpt_path(). */
    ckpt_filename[0] = full_ckpt_path;
}
/**
 * Callback invoked after the checkpoint has finished.  It records in
 * post_ckpt_state whether we are continuing or restarting, then releases
 * checkpoint_mutex.
 *
 * @param is_restarting whether or not this is being called as part of a restart
 * @param mtcp_restore_argv_start_addr unused
 */
static void dmtcp_post_ckpt_callback(int is_restarting, char *mtcp_restore_argv_start_addr)
{
    char errbuf[BUFSIZ];
    int rc;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: in post_ckpt_callback, restarting: %d", is_restarting);

    post_ckpt_state = is_restarting ? OPAL_CRS_RESTART : OPAL_CRS_CONTINUE;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: unlocking at end of post_ckpt_callback");

    rc = pthread_mutex_unlock(&checkpoint_mutex);
    if (0 == rc) {
        return;
    }

    opal_output(mca_crs_dmtcp_component.super.output_handle,
                "crs:dmtcp: post_ckpt_callback(): Unable to unlock mutex: %s",
                strerror_r(rc, errbuf, BUFSIZ));
}
/**
 * This is a callback function that is invoked by DMTCP to see if it should
 * checkpoint the given file descriptor.
 *
 * If the file descriptor is a socket, named-pipe or pseudo-terminal, DMTCP
 * should skip checkpointing them.
 *
 * If we can't determine the type of fd (stat and/or readlink failed), we ask
 * DMTCP to try to checkpoint them anyways with the assumption that DMTCP would
 * warn users of any such case.
 *
 * @param fd file descriptor to checkpoint
 * @return: 1 if DMTCP should ckpt the file descriptor, 0 otherwise.
 */
static int dmtcp_should_ckpt_fd_callback(int fd)
{
    /* Path prefixes that identify terminal devices. */
    static const char *const tty_prefixes[] = {
        "/dev/pts/",
        "/dev/pty",
        "/dev/tty"
    };
    struct stat stat_buf;
    char device_name[PATH_MAX];
    char proc_filename[64];
    char buf[BUFSIZ];
    size_t i;

    if (fstat(fd, &stat_buf) != 0) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: should_ckpt_fd_callback(): error stat()'ing %d: %s",
                    fd, strerror_r(errno, buf, BUFSIZ));
        return 1;
    /* Don't checkpoint sockets and FIFOs */
    } else if (S_ISSOCK(stat_buf.st_mode) || S_ISFIFO(stat_buf.st_mode)) {
        opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                            "crs:dmtcp: skipping checkpointing socket/fifo: %d",
                            fd);
        return 0;
    }

    /* Zero the buffer first: readlink() does not NUL-terminate its result. */
    memset(device_name, 0, sizeof device_name);
    snprintf(proc_filename, sizeof proc_filename, "/proc/self/fd/%d", fd);
    if (readlink(proc_filename, device_name, sizeof(device_name) - 1) <= 0) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: should_ckpt_fd_callback(): readlink(%d) failed: %s",
                    fd, strerror_r(errno, buf, BUFSIZ));
        return 1;
    }

    /* Don't checkpoint ptys.  The descriptor is skipped only when its target
     * path actually starts with one of the terminal prefixes; testing
     * strstr() against 0 would skip every descriptor that is NOT a terminal. */
    for (i = 0; i < sizeof(tty_prefixes) / sizeof(tty_prefixes[0]); ++i) {
        if (0 == strncmp(device_name, tty_prefixes[i], strlen(tty_prefixes[i]))) {
            opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                                "crs:dmtcp: skipping checkpointing %s",
                                device_name);
            return 0;
        }
    }

    /* Checkpoint fd by default */
    return 1;
}

Просмотреть файл

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: U Brit.Columbia
status: unmaintained