remove some dead crs components
Signed-off-by: Howard Pritchard <howardp@lanl.gov>
(cherry picked from commit 6564d3d217)
Этот коммит содержится в:
родитель
b8e040c704
Коммит
210b4c60aa
@ -1,51 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2004-2007 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
CFLAGS = $(crs_blcr_CFLAGS)
|
|
||||||
AM_CPPFLAGS = $(crs_blcr_CPPFLAGS)
|
|
||||||
|
|
||||||
dist_opaldata_DATA = help-opal-crs-blcr.txt
|
|
||||||
|
|
||||||
sources = \
|
|
||||||
crs_blcr.h \
|
|
||||||
crs_blcr_component.c \
|
|
||||||
crs_blcr_module.c
|
|
||||||
|
|
||||||
# Make the output library in this directory, and name it either
|
|
||||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
|
||||||
# (for static builds).
|
|
||||||
|
|
||||||
if MCA_BUILD_opal_crs_blcr_DSO
|
|
||||||
component_noinst =
|
|
||||||
component_install = mca_crs_blcr.la
|
|
||||||
else
|
|
||||||
component_noinst = libmca_crs_blcr.la
|
|
||||||
component_install =
|
|
||||||
endif
|
|
||||||
|
|
||||||
mcacomponentdir = $(opallibdir)
|
|
||||||
mcacomponent_LTLIBRARIES = $(component_install)
|
|
||||||
mca_crs_blcr_la_SOURCES = $(sources)
|
|
||||||
mca_crs_blcr_la_LDFLAGS = -module -avoid-version $(crs_blcr_LDFLAGS)
|
|
||||||
mca_crs_blcr_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
|
|
||||||
$(crs_blcr_LIBS)
|
|
||||||
|
|
||||||
noinst_LTLIBRARIES = $(component_noinst)
|
|
||||||
libmca_crs_blcr_la_SOURCES = $(sources)
|
|
||||||
libmca_crs_blcr_la_LDFLAGS = -module -avoid-version $(crs_blcr_LDFLAGS)
|
|
||||||
libmca_crs_blcr_la_LIBADD = $(crs_blcr_LIBS)
|
|
@ -1,204 +0,0 @@
|
|||||||
# -*- shell-script -*-
|
|
||||||
#
|
|
||||||
# Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
||||||
# Copyright (c) 2015 Research Organization for Information Science
|
|
||||||
# and Technology (RIST). All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# MCA_opal_crs_blcr_CONFIG([action-if-found], [action-if-not-found])
|
|
||||||
# -----------------------------------------------------------
|
|
||||||
AC_DEFUN([MCA_opal_crs_blcr_CONFIG],[
|
|
||||||
AC_CONFIG_FILES([opal/mca/crs/blcr/Makefile])
|
|
||||||
|
|
||||||
AC_ARG_WITH([blcr],
|
|
||||||
[AC_HELP_STRING([--with-blcr(=DIR)],
|
|
||||||
[Path to BLCR Installation])])
|
|
||||||
OPAL_CHECK_WITHDIR([blcr], [$with_blcr], [include/libcr.h])
|
|
||||||
AC_ARG_WITH([blcr-libdir],
|
|
||||||
[AC_HELP_STRING([--with-blcr-libdir=DIR],
|
|
||||||
[Search for BLCR libraries in DIR])])
|
|
||||||
OPAL_CHECK_WITHDIR([blcr-libdir], [$with_blcr_libdir], [libcr.*])
|
|
||||||
|
|
||||||
check_crs_blcr_good="no"
|
|
||||||
|
|
||||||
# If we do not want FT, don't compile this component
|
|
||||||
#
|
|
||||||
# If we wanted BLCR, but did not specify the FT option,
|
|
||||||
# error out with a warning for the user
|
|
||||||
AS_IF([test "$opal_want_ft_cr" = "0"],
|
|
||||||
[$2
|
|
||||||
check_crs_blcr_good="no"
|
|
||||||
AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "no"],
|
|
||||||
[AC_MSG_WARN([BLCR support requested, but FT support not requested. You need to specify the --with-ft=cr configure option.])
|
|
||||||
AC_MSG_ERROR([Aborting.])])
|
|
||||||
],
|
|
||||||
[check_crs_blcr_good="yes"])
|
|
||||||
|
|
||||||
# If we do not want BLCR, then do not compile it
|
|
||||||
AS_IF([test "$with_blcr" = "no" || test "$check_crs_blcr_good" = "no"],
|
|
||||||
[$2
|
|
||||||
check_crs_blcr_good="no"],
|
|
||||||
[check_crs_blcr_good="yes"])
|
|
||||||
|
|
||||||
# Defaults
|
|
||||||
check_crs_blcr_dir_msg="compiler default"
|
|
||||||
check_crs_blcr_libdir_msg="linker default"
|
|
||||||
check_crs_blcr_dir=""
|
|
||||||
check_crs_blcr_libdir=""
|
|
||||||
|
|
||||||
# Determine the search paths for the headers and libraries
|
|
||||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
|
|
||||||
[AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "yes"],
|
|
||||||
[check_crs_blcr_dir="$with_blcr"
|
|
||||||
check_crs_blcr_dir_msg="$with_blcr (from --with-blcr)"])
|
|
||||||
AS_IF([test ! -z "$with_blcr_libdir" && test "$with_blcr_libdir" != "yes"],
|
|
||||||
[check_crs_blcr_libdir="$with_blcr_libdir"
|
|
||||||
check_crs_blcr_libdir_msg="$with_blcr_libdir (from --with-blcr-libdir)"])
|
|
||||||
])
|
|
||||||
|
|
||||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
|
|
||||||
[AC_MSG_CHECKING([for BLCR dir])
|
|
||||||
AC_MSG_RESULT([$check_crs_blcr_dir_msg])
|
|
||||||
AC_MSG_CHECKING([for BLCR library dir])
|
|
||||||
AC_MSG_RESULT([$check_crs_blcr_libdir_msg])
|
|
||||||
OPAL_CHECK_PACKAGE([crs_blcr_check],
|
|
||||||
[libcr.h],
|
|
||||||
[cr],
|
|
||||||
[cr_init],
|
|
||||||
[],
|
|
||||||
[$check_crs_blcr_dir],
|
|
||||||
[$check_crs_blcr_libdir],
|
|
||||||
[check_crs_blcr_good="yes"],
|
|
||||||
[check_crs_blcr_good="no"])
|
|
||||||
])
|
|
||||||
|
|
||||||
crs_blcr_save_CFLAGS="$CFLAGS"
|
|
||||||
crs_blcr_save_CPPFLAGS="$CPPFLAGS"
|
|
||||||
crs_blcr_save_LDFLAGS="$LDFLAGS"
|
|
||||||
crs_blcr_save_LIBS="$LIBS"
|
|
||||||
|
|
||||||
crs_blcr_CFLAGS="$CFLAGS $crs_blcr_check_CFLAGS"
|
|
||||||
crs_blcr_CPPFLAGS="$CPPFLAGS $crs_blcr_check_CPPFLAGS"
|
|
||||||
crs_blcr_LDFLAGS="$LDFLAGS $crs_blcr_check_LDFLAGS"
|
|
||||||
crs_blcr_LIBS="$LIBS $crs_blcr_check_LIBS"
|
|
||||||
|
|
||||||
# Check to see if we found the BLCR header (libcr.h) and library
|
|
||||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2],
|
|
||||||
[
|
|
||||||
#
|
|
||||||
# Since BLCR libraries are not fully ISO C99 compliant
|
|
||||||
# -pedantic and -Wundef raise a bunch of warnings, so
|
|
||||||
# we just strip them off for this component
|
|
||||||
AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for blcr component because libcr.h is not really ANSI C])
|
|
||||||
# Strip off problematic arguments
|
|
||||||
crs_blcr_CFLAGS="`echo $crs_blcr_CFLAGS | sed 's/-pedantic//g'`"
|
|
||||||
crs_blcr_CFLAGS="`echo $crs_blcr_CFLAGS | sed 's/-Wundef//g'`"
|
|
||||||
crs_blcr_CPPFLAGS="`echo $crs_blcr_CPPFLAGS | sed 's/-pedantic//g'`"
|
|
||||||
crs_blcr_CPPFLAGS="`echo $crs_blcr_CPPFLAGS | sed 's/-Wundef//g'`"
|
|
||||||
crs_blcr_LDFLAGS="$crs_blcr_LDFLAGS"
|
|
||||||
crs_blcr_LIBS="$crs_blcr_LIBS"
|
|
||||||
$1])
|
|
||||||
|
|
||||||
#
|
|
||||||
# Check for version difference which may have:
|
|
||||||
# - working cr_request_file
|
|
||||||
# - working cr_request_checkpoint (which should be used instead of cr_request_file)
|
|
||||||
# - 'requester' parameter to checkpoint_info
|
|
||||||
#
|
|
||||||
AS_IF([test "$check_crs_blcr_good" != "yes"], [$2], [
|
|
||||||
CFLAGS="$crs_blcr_CFLAGS"
|
|
||||||
CPPFLAGS="$crs_blcr_CPPFLAGS"
|
|
||||||
LDFLAGS="$crs_blcr_LDFLAGS"
|
|
||||||
LIBS="$crs_blcr_LIBS"
|
|
||||||
#
|
|
||||||
# First look for the cr_request_file function
|
|
||||||
#
|
|
||||||
crs_blcr_have_working_cr_request=0
|
|
||||||
AC_MSG_CHECKING(for BLCR working cr_request)
|
|
||||||
OPAL_SEARCH_LIBS_COMPONENT([crs_blcr], [cr_request_file],[cr],
|
|
||||||
[AC_TRY_COMPILE([#include <libcr.h>],
|
|
||||||
[#if CR_RELEASE_MAJOR <= 0 && CR_RELEASE_MINOR < 6
|
|
||||||
#error Version earlier than 0.6.0
|
|
||||||
#endif
|
|
||||||
],
|
|
||||||
[crs_blcr_have_working_cr_request=1
|
|
||||||
],
|
|
||||||
[crs_blcr_have_working_cr_request=0
|
|
||||||
AC_MSG_WARN([This BLCR version does not contain a known working version of cr_request_file])
|
|
||||||
])],
|
|
||||||
[crs_blcr_have_working_cr_request=0
|
|
||||||
AC_MSG_WARN([This BLCR version does not contain the cr_request_file function])
|
|
||||||
])
|
|
||||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST], [$crs_blcr_have_working_cr_request],
|
|
||||||
[BLCR cr_request_file check])
|
|
||||||
|
|
||||||
#
|
|
||||||
# Look for the cr_request_checkpoint function
|
|
||||||
#
|
|
||||||
crs_blcr_have_cr_request_checkpoint=0
|
|
||||||
AC_MSG_CHECKING(for BLCR cr_request_checkpoint)
|
|
||||||
OPAL_SEARCH_LIBS_COMPONENT([crs_blcr],
|
|
||||||
[cr_request_checkpoint],[cr],
|
|
||||||
[crs_blcr_have_cr_request_checkpoint=1
|
|
||||||
],
|
|
||||||
[crs_blcr_have_cr_request_checkpoint=0
|
|
||||||
AC_MSG_WARN([This BLCR version does not contain the cr_request_checkpoint function])
|
|
||||||
])
|
|
||||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT], [$crs_blcr_have_cr_request_checkpoint],
|
|
||||||
[BLCR cr_request_checkpoint check])
|
|
||||||
|
|
||||||
#
|
|
||||||
# Look for the cr_checkpoint_info.requester member
|
|
||||||
#
|
|
||||||
crs_blcr_have_info_requester=0
|
|
||||||
AC_CHECK_MEMBER([struct cr_checkpoint_info.requester],
|
|
||||||
[crs_blcr_have_info_requester=1],
|
|
||||||
[AC_MSG_WARN([This BLCR version does not contain a 'requester' member of the 'cr_checkpoint_info' struct])],
|
|
||||||
[#include <libcr.h>])
|
|
||||||
AC_DEFINE_UNQUOTED([CRS_BLCR_HAVE_INFO_REQUESTER], [$crs_blcr_have_info_requester],
|
|
||||||
[BLCRs cr_checkpoint_info.requester member availability])
|
|
||||||
$1])
|
|
||||||
|
|
||||||
#
|
|
||||||
# Require either a working cr_request_file() or cr_request_checkpoint() function
|
|
||||||
#
|
|
||||||
AS_IF([test "$crs_blcr_have_working_cr_request" = "0" && test "$crs_blcr_have_cr_request_checkpoint" = "0"],
|
|
||||||
[$2
|
|
||||||
check_crs_blcr_good="no"
|
|
||||||
AC_MSG_WARN([The BLCR CRS component requires either the cr_request_checkpoint() or cr_request_file() functions])])
|
|
||||||
|
|
||||||
#
|
|
||||||
# Reset the flags
|
|
||||||
#
|
|
||||||
CFLAGS="$crs_blcr_save_CFLAGS"
|
|
||||||
CPPFLAGS="$crs_blcr_save_CPPFLAGS"
|
|
||||||
LDFLAGS="$crs_blcr_save_LDFLAGS"
|
|
||||||
LIBS="$crs_blcr_save_LIBS"
|
|
||||||
|
|
||||||
#
|
|
||||||
AS_IF([test "$check_crs_blcr_good" = "yes"],
|
|
||||||
[ AC_SUBST([crs_blcr_CFLAGS])
|
|
||||||
AC_SUBST([crs_blcr_CPPFLAGS])
|
|
||||||
AC_SUBST([crs_blcr_LDFLAGS])
|
|
||||||
AC_SUBST([crs_blcr_LIBS])
|
|
||||||
$1],
|
|
||||||
[AS_IF([test ! -z "$with_blcr" && test "$with_blcr" != "no"],
|
|
||||||
[AC_MSG_WARN([BLCR support requested but not found. Perhaps you need to specify the location of the BLCR libraries.])
|
|
||||||
AC_MSG_ERROR([Aborting.])])
|
|
||||||
$3])
|
|
||||||
|
|
||||||
])dnl
|
|
@ -1,84 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
*
|
|
||||||
* BLCR CRS component
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_CRS_BLCR_EXPORT_H
|
|
||||||
#define MCA_CRS_BLCR_EXPORT_H
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
|
|
||||||
#include <libcr.h>
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local Component structures
|
|
||||||
*/
|
|
||||||
struct opal_crs_blcr_component_t {
|
|
||||||
/** Base CRS component */
|
|
||||||
opal_crs_base_component_t super;
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_blcr_component_t opal_crs_blcr_component_t;
|
|
||||||
OPAL_MODULE_DECLSPEC extern opal_crs_blcr_component_t mca_crs_blcr_component;
|
|
||||||
|
|
||||||
int opal_crs_blcr_component_query(mca_base_module_t **module, int *priority);
|
|
||||||
|
|
||||||
extern bool opal_crs_blcr_dev_null;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Module functions
|
|
||||||
*/
|
|
||||||
int opal_crs_blcr_module_init(void);
|
|
||||||
int opal_crs_blcr_module_finalize(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Actual functionality
|
|
||||||
*/
|
|
||||||
int opal_crs_blcr_checkpoint( pid_t pid,
|
|
||||||
opal_crs_base_snapshot_t *snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state);
|
|
||||||
|
|
||||||
int opal_crs_blcr_restart( opal_crs_base_snapshot_t *snapshot,
|
|
||||||
bool spawn_child,
|
|
||||||
pid_t *child_pid);
|
|
||||||
|
|
||||||
int opal_crs_blcr_disable_checkpoint(void);
|
|
||||||
int opal_crs_blcr_enable_checkpoint(void);
|
|
||||||
|
|
||||||
int opal_crs_blcr_prelaunch(int32_t rank,
|
|
||||||
char *base_snapshot_dir,
|
|
||||||
char **app,
|
|
||||||
char **cwd,
|
|
||||||
char ***argv,
|
|
||||||
char ***env);
|
|
||||||
|
|
||||||
int opal_crs_blcr_reg_thread(void);
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* MCA_CRS_BLCR_EXPORT_H */
|
|
@ -1,145 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
|
|
||||||
#include "opal/constants.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
#include "crs_blcr.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local functionality
|
|
||||||
*/
|
|
||||||
static int crs_blcr_register (void);
|
|
||||||
static int crs_blcr_open(void);
|
|
||||||
static int crs_blcr_close(void);
|
|
||||||
|
|
||||||
bool opal_crs_blcr_dev_null = false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Instantiate the public struct with all of our public information
|
|
||||||
* and pointer to our public functions in it
|
|
||||||
*/
|
|
||||||
opal_crs_blcr_component_t mca_crs_blcr_component = {
|
|
||||||
/* First do the base component stuff */
|
|
||||||
{
|
|
||||||
/* Handle the general mca_component_t struct containing
|
|
||||||
* meta information about the component itself
|
|
||||||
*/
|
|
||||||
.base_version = {
|
|
||||||
OPAL_CRS_BASE_VERSION_2_0_0,
|
|
||||||
|
|
||||||
/* Component name and version */
|
|
||||||
.mca_component_name = "blcr",
|
|
||||||
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
|
||||||
OPAL_RELEASE_VERSION),
|
|
||||||
|
|
||||||
/* Component open and close functions */
|
|
||||||
.mca_open_component = crs_blcr_open,
|
|
||||||
.mca_close_component = crs_blcr_close,
|
|
||||||
.mca_query_component = opal_crs_blcr_component_query,
|
|
||||||
.mca_register_component_params = crs_blcr_register
|
|
||||||
},
|
|
||||||
.base_data = {
|
|
||||||
/* The component is checkpoint ready */
|
|
||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
||||||
},
|
|
||||||
|
|
||||||
.verbose = 0,
|
|
||||||
.output_handle = -1,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static int crs_blcr_register (void)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
mca_crs_blcr_component.super.priority = 10;
|
|
||||||
ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
|
|
||||||
"priority", "Priority of the CRS blcr component "
|
|
||||||
"(default: 10)". MCA_BASE_VAR_TYPE_INT, NULL,
|
|
||||||
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
|
|
||||||
&mca_crs_blcr_component.super.priority);
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
mca_crs_blcr_component.super.verbose = 0;
|
|
||||||
ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
|
|
||||||
"verbose",
|
|
||||||
"Verbose level for the CRS blcr component",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_blcr_component.super.verbose);
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_crs_blcr_dev_null = false;
|
|
||||||
ret = mca_base_component_var_register (&mca_crs_blcr_component.super.base_version,
|
|
||||||
"dev_null",
|
|
||||||
"Not for general use! For debugging only! Save checkpoint to /dev/null. [Default = disabled]",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
|
|
||||||
&opal_crs_blcr_dev_null);
|
|
||||||
return (0 > ret) ? ret : OPAL_SUCCESS
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_blcr_open(void)
|
|
||||||
{
|
|
||||||
/* If there is a custom verbose level for this component than use it
|
|
||||||
* otherwise take our parents level and output channel
|
|
||||||
*/
|
|
||||||
if ( 0 != mca_crs_blcr_component.super.verbose) {
|
|
||||||
mca_crs_blcr_component.super.output_handle = opal_output_open(NULL);
|
|
||||||
opal_output_set_verbosity(mca_crs_blcr_component.super.output_handle,
|
|
||||||
mca_crs_blcr_component.super.verbose);
|
|
||||||
} else {
|
|
||||||
mca_crs_blcr_component.super.output_handle = opal_crs_base_framework.framework_output;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Debug output
|
|
||||||
*/
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: open()");
|
|
||||||
opal_output_verbose(20, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: open: priority = %d",
|
|
||||||
mca_crs_blcr_component.super.priority);
|
|
||||||
opal_output_verbose(20, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: open: verbosity = %d",
|
|
||||||
mca_crs_blcr_component.super.verbose);
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: open: dev_null = %s",
|
|
||||||
(opal_crs_blcr_dev_null == true ? "True" : "False"));
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_blcr_close(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: close()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
@ -1,866 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
||||||
*
|
|
||||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include <sched.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/wait.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
|
|
||||||
#include "opal/util/show_help.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "opal/constants.h"
|
|
||||||
|
|
||||||
#include "opal/mca/base/mca_base_var.h"
|
|
||||||
|
|
||||||
#include "opal/threads/threads.h"
|
|
||||||
#include "opal/threads/mutex.h"
|
|
||||||
#include "opal/threads/condition.h"
|
|
||||||
|
|
||||||
#include "opal/mca/event/event.h"
|
|
||||||
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
|
|
||||||
#include "crs_blcr.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Blcr module
|
|
||||||
*/
|
|
||||||
static opal_crs_base_module_t blcr_module = {
|
|
||||||
/** Initialization Function */
|
|
||||||
opal_crs_blcr_module_init,
|
|
||||||
/** Finalization Function */
|
|
||||||
opal_crs_blcr_module_finalize,
|
|
||||||
|
|
||||||
/** Checkpoint interface */
|
|
||||||
opal_crs_blcr_checkpoint,
|
|
||||||
|
|
||||||
/** Restart Command Access */
|
|
||||||
opal_crs_blcr_restart,
|
|
||||||
|
|
||||||
/** Disable checkpoints */
|
|
||||||
opal_crs_blcr_disable_checkpoint,
|
|
||||||
/** Enable checkpoints */
|
|
||||||
opal_crs_blcr_enable_checkpoint,
|
|
||||||
|
|
||||||
/** Prelaunch */
|
|
||||||
opal_crs_blcr_prelaunch,
|
|
||||||
|
|
||||||
/** Register Thread */
|
|
||||||
opal_crs_blcr_reg_thread
|
|
||||||
};
|
|
||||||
|
|
||||||
/***************************
|
|
||||||
* Snapshot Class Functions
|
|
||||||
***************************/
|
|
||||||
OBJ_CLASS_DECLARATION(opal_crs_blcr_snapshot_t);
|
|
||||||
|
|
||||||
struct opal_crs_blcr_snapshot_t {
|
|
||||||
/** Base CRS snapshot type */
|
|
||||||
opal_crs_base_snapshot_t super;
|
|
||||||
char * context_filename;
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_blcr_snapshot_t opal_crs_blcr_snapshot_t;
|
|
||||||
|
|
||||||
void opal_crs_blcr_construct(opal_crs_blcr_snapshot_t *obj);
|
|
||||||
void opal_crs_blcr_destruct( opal_crs_blcr_snapshot_t *obj);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(opal_crs_blcr_snapshot_t,
|
|
||||||
opal_crs_base_snapshot_t,
|
|
||||||
opal_crs_blcr_construct,
|
|
||||||
opal_crs_blcr_destruct);
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* Local Functions
|
|
||||||
******************/
|
|
||||||
static int blcr_get_checkpoint_filename(char **fname, pid_t pid);
|
|
||||||
static int opal_crs_blcr_thread_callback(void *arg);
|
|
||||||
static int opal_crs_blcr_signal_callback(void *arg);
|
|
||||||
|
|
||||||
static int opal_crs_blcr_restart_cmd(char *fname, char **cmd);
|
|
||||||
|
|
||||||
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot);
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
static void MPIR_checkpoint_debugger_crs_hook(cr_hook_event_t event);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*************************
|
|
||||||
* Local Global Variables
|
|
||||||
*************************/
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
static opal_thread_t *checkpoint_thread_id = NULL;
|
|
||||||
static bool blcr_crdebug_refreshed_env = false;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static cr_client_id_t client_id;
|
|
||||||
static cr_callback_id_t cr_thread_callback_id;
|
|
||||||
static cr_callback_id_t cr_signal_callback_id;
|
|
||||||
static int blcr_current_state = OPAL_CRS_NONE;
|
|
||||||
|
|
||||||
static char *blcr_restart_cmd = NULL;
|
|
||||||
static char *blcr_checkpoint_cmd = NULL;
|
|
||||||
|
|
||||||
static opal_condition_t blcr_cond;
|
|
||||||
static opal_mutex_t blcr_lock;
|
|
||||||
|
|
||||||
static pid_t my_pid = -1;
|
|
||||||
|
|
||||||
/**
 * Constructor of opal_crs_blcr_snapshot_t.
 *
 * Stamps the snapshot with a heap copy of this component's name and leaves
 * the context file name unset.
 */
void opal_crs_blcr_construct(opal_crs_blcr_snapshot_t *snapshot)
{
    snapshot->super.component_name =
        strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    snapshot->context_filename = NULL;
}
|
|
||||||
|
|
||||||
/**
 * Destructor of opal_crs_blcr_snapshot_t.
 *
 * Releases the context file name and resets the pointer.
 */
void opal_crs_blcr_destruct(opal_crs_blcr_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(snapshot->context_filename);
    snapshot->context_filename = NULL;
}
|
|
||||||
|
|
||||||
/*****************
|
|
||||||
* MCA Functions
|
|
||||||
*****************/
|
|
||||||
/**
 * MCA query entry point of the BLCR CRS component.
 *
 * @param[out] module    receives the address of the static BLCR module table
 * @param[out] priority  receives the currently configured component priority
 *
 * @return OPAL_SUCCESS always.
 */
int opal_crs_blcr_component_query(mca_base_module_t **module, int *priority)
{
    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: component_query()");

    *module = (mca_base_module_t *) &blcr_module;
    *priority = mca_crs_blcr_component.super.priority;

    return OPAL_SUCCESS;
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_module_init(void)
|
|
||||||
{
|
|
||||||
void *crs_blcr_thread_callback_arg = NULL;
|
|
||||||
void *crs_blcr_signal_callback_arg = NULL;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: module_init()");
|
|
||||||
|
|
||||||
blcr_restart_cmd = strdup("cr_restart");
|
|
||||||
blcr_checkpoint_cmd = strdup("cr_checkpoint");
|
|
||||||
|
|
||||||
my_pid = getpid();
|
|
||||||
|
|
||||||
if( !opal_cr_is_tool ) {
|
|
||||||
/* We need to make the lock and condition variable before
|
|
||||||
* starting the thread, since the thread uses these vars.
|
|
||||||
*/
|
|
||||||
OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t);
|
|
||||||
OBJ_CONSTRUCT(&blcr_cond, opal_condition_t);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Initialize BLCR
|
|
||||||
*/
|
|
||||||
client_id = cr_init();
|
|
||||||
if (0 > client_id) {
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
blcr_crdebug_refreshed_env = false;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
blcr_restart_cmd = strdup("cr_restart");
|
|
||||||
blcr_checkpoint_cmd = strdup("cr_checkpoint");
|
|
||||||
|
|
||||||
if( !opal_cr_is_tool ) {
|
|
||||||
/*
|
|
||||||
* Register the thread handler
|
|
||||||
*/
|
|
||||||
cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback,
|
|
||||||
crs_blcr_thread_callback_arg,
|
|
||||||
CR_THREAD_CONTEXT);
|
|
||||||
/*
|
|
||||||
* Register the signal handler
|
|
||||||
* - even though we do not use it
|
|
||||||
*/
|
|
||||||
cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback,
|
|
||||||
crs_blcr_signal_callback_arg,
|
|
||||||
CR_SIGNAL_CONTEXT);
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
/*
|
|
||||||
* Checkpoint/restart enabled debugging hooks
|
|
||||||
* "NO_CALLBACKS" -> non-MPI threads
|
|
||||||
* "SIGNAL_CONTEXT" -> MPI threads
|
|
||||||
* "THREAD_CONTEXT" -> BLCR threads
|
|
||||||
*/
|
|
||||||
cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS, MPIR_checkpoint_debugger_crs_hook);
|
|
||||||
cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook);
|
|
||||||
|
|
||||||
cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS, MPIR_checkpoint_debugger_crs_hook);
|
|
||||||
cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Now that we are done with init, set the state to running
|
|
||||||
*/
|
|
||||||
blcr_current_state = OPAL_CRS_RUNNING;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: module_init() --> Finished [%d]",
|
|
||||||
opal_cr_is_tool);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_prelaunch(int32_t rank,
|
|
||||||
char *base_snapshot_dir,
|
|
||||||
char **app,
|
|
||||||
char **cwd,
|
|
||||||
char ***argv,
|
|
||||||
char ***env)
|
|
||||||
{
|
|
||||||
char * tmp_env_var = NULL;
|
|
||||||
|
|
||||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
|
||||||
opal_setenv(tmp_env_var,
|
|
||||||
"0", true, env);
|
|
||||||
free(tmp_env_var);
|
|
||||||
tmp_env_var = NULL;
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_reg_thread(void)
|
|
||||||
{
|
|
||||||
cr_client_id_t loc_client_id;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Initialize BLCR
|
|
||||||
*/
|
|
||||||
loc_client_id = cr_init();
|
|
||||||
if (0 > loc_client_id) {
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"Error: crs:blcr: reg_thread: cr_init failed (%d)\n", loc_client_id);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_module_finalize(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: module_finalize()");
|
|
||||||
|
|
||||||
/* Cleanup some memory */
|
|
||||||
if( NULL != blcr_restart_cmd ) {
|
|
||||||
free(blcr_restart_cmd);
|
|
||||||
blcr_restart_cmd = NULL;
|
|
||||||
}
|
|
||||||
if( NULL != blcr_checkpoint_cmd ) {
|
|
||||||
free(blcr_checkpoint_cmd);
|
|
||||||
blcr_checkpoint_cmd = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( !opal_cr_is_tool ) {
|
|
||||||
OBJ_DESTRUCT(&blcr_lock);
|
|
||||||
OBJ_DESTRUCT(&blcr_cond);
|
|
||||||
|
|
||||||
if( OPAL_CRS_RUNNING == blcr_current_state ) {
|
|
||||||
/* Unload the thread callback */
|
|
||||||
cr_replace_callback(cr_thread_callback_id, NULL, NULL, CR_THREAD_CONTEXT);
|
|
||||||
/* Unload the signal callback */
|
|
||||||
cr_replace_callback(cr_signal_callback_id, NULL, NULL, CR_SIGNAL_CONTEXT);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
/*
|
|
||||||
* Checkpoint/restart enabled debugging hooks
|
|
||||||
*/
|
|
||||||
cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS, NULL);
|
|
||||||
cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT, NULL);
|
|
||||||
|
|
||||||
cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS, NULL);
|
|
||||||
cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, NULL);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* BLCR does not have a finalization routine */
|
|
||||||
blcr_current_state = OPAL_CRS_NONE;
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_checkpoint(pid_t pid,
|
|
||||||
opal_crs_base_snapshot_t *base_snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state)
|
|
||||||
{
|
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
|
||||||
opal_crs_blcr_snapshot_t *snapshot = NULL;
|
|
||||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
|
||||||
cr_checkpoint_args_t cr_args;
|
|
||||||
static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
|
|
||||||
#endif
|
|
||||||
int fd = 0;
|
|
||||||
char *loc_fname = NULL;
|
|
||||||
|
|
||||||
if( pid != my_pid ) {
|
|
||||||
opal_output(0, "crs:blcr: checkpoint(%d, ---): Checkpointing of peers not allowed!", pid);
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(%d, ---)", pid);
|
|
||||||
|
|
||||||
snapshot = (opal_crs_blcr_snapshot_t *)base_snapshot;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Update the snapshot metadata
|
|
||||||
*/
|
|
||||||
snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
|
|
||||||
blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);
|
|
||||||
|
|
||||||
if( NULL == snapshot->super.metadata ) {
|
|
||||||
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(): Error: Unable to open the file (%s)",
|
|
||||||
snapshot->super.metadata_filename);
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
|
|
||||||
fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename);
|
|
||||||
|
|
||||||
fclose(snapshot->super.metadata );
|
|
||||||
snapshot->super.metadata = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we can checkpointing ourselves do so:
|
|
||||||
* use cr_request_checkpoint() if available, and cr_request_file() if not
|
|
||||||
*/
|
|
||||||
if( opal_crs_blcr_dev_null ) {
|
|
||||||
loc_fname = strdup("/dev/null");
|
|
||||||
} else {
|
|
||||||
asprintf(&loc_fname, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
|
|
||||||
/* Make sure to identify the checkpointing thread, so that it is not
|
|
||||||
* prevented from requesting the checkpoint after the debugger detaches
|
|
||||||
*/
|
|
||||||
opal_cr_debug_set_current_ckpt_thread_self();
|
|
||||||
checkpoint_thread_id = opal_thread_get_self();
|
|
||||||
blcr_crdebug_refreshed_env = false;
|
|
||||||
|
|
||||||
/* If checkpoint/restart enabled debugging then mark detachment place */
|
|
||||||
if( MPIR_debug_with_checkpoint ) {
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(): Detaching debugger...");
|
|
||||||
MPIR_checkpoint_debugger_detach();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint SELF <%s>",
|
|
||||||
loc_fname);
|
|
||||||
|
|
||||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
|
|
||||||
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
|
|
||||||
fd = open(loc_fname,
|
|
||||||
O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
|
|
||||||
S_IRUSR | S_IWUSR);
|
|
||||||
if( fd < 0 ) {
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
|
|
||||||
loc_fname, pid);
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
cr_initialize_checkpoint_args_t(&cr_args);
|
|
||||||
cr_args.cr_scope = CR_SCOPE_PROC;
|
|
||||||
cr_args.cr_fd = fd;
|
|
||||||
if( options->stop ) {
|
|
||||||
cr_args.cr_signal = SIGSTOP;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = cr_request_checkpoint(&cr_args, &cr_handle);
|
|
||||||
if( ret < 0 ) {
|
|
||||||
close(cr_args.cr_fd);
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
|
|
||||||
pid, loc_fname);
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Wait for checkpoint to finish */
|
|
||||||
do {
|
|
||||||
ret = cr_poll_checkpoint(&cr_handle, NULL);
|
|
||||||
if( ret < 0 ) {
|
|
||||||
/* Check if restarting. This is not an error. */
|
|
||||||
if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
|
|
||||||
ret = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* If Call was interrupted by a signal, retry the call */
|
|
||||||
else if (errno == EINTR) {
|
|
||||||
;
|
|
||||||
}
|
|
||||||
/* Otherwise this is a real error that we need to deal with */
|
|
||||||
else {
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
|
|
||||||
pid, loc_fname, ret);
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} while( ret < 0 );
|
|
||||||
|
|
||||||
/* Close the file */
|
|
||||||
close(cr_args.cr_fd);
|
|
||||||
#else
|
|
||||||
/* Request a checkpoint be taken of the current process.
|
|
||||||
* Since we are not guaranteed to finish the checkpoint before this
|
|
||||||
* returns, we also need to wait for it.
|
|
||||||
*/
|
|
||||||
cr_request_file(loc_fname);
|
|
||||||
|
|
||||||
/* Wait for checkpoint to finish */
|
|
||||||
do {
|
|
||||||
usleep(1000); /* JJH Do we really want to sleep? */
|
|
||||||
} while(CR_STATE_IDLE != cr_status());
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
*state = blcr_current_state;
|
|
||||||
free(loc_fname);
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
if( NULL != snapshot->super.metadata ) {
|
|
||||||
fclose(snapshot->super.metadata );
|
|
||||||
snapshot->super.metadata = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Restart a process from a BLCR snapshot by exec'ing the BLCR restart command.
 *
 * @param base_snapshot  Snapshot to restart from; its contents are copied into
 *                       a local opal_crs_blcr_snapshot_t.
 * @param spawn_child    When false the calling process is replaced via
 *                       execvp(); when true a child is forked and the child
 *                       performs the execvp().
 * @param child_pid      Receives the return value of fork() when
 *                       @a spawn_child is true; not written otherwise.
 *
 * @return OPAL_SUCCESS in the parent once the child was forked; an error code
 *         when the snapshot cannot be prepared, fork() fails, or execvp()
 *         returns.
 */
int opal_crs_blcr_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    /* NOTE(review): this temporary object is never released.  Its `super` is a
     * shallow copy of *base_snapshot, so releasing it here might tear down
     * state the caller still owns - confirm ownership before adding a release.
     */
    opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
    char **cr_argv = NULL;
    char *cr_cmd = NULL;
    char *cr_full_cmd = NULL;
    int ret;
    int exit_status = OPAL_SUCCESS;
    int status;

    /* Presumably OBJ_NEW yields NULL when allocation fails; do not dereference it */
    if( NULL == snapshot ) {
        return OPAL_ERROR;
    }

    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: restart(--, %d)", spawn_child);

    /*
     * If we need to reconstruct the snapshot,
     */
    if(snapshot->super.cold_start) {
        if( OPAL_SUCCESS != (ret = blcr_cold_start(snapshot)) ) {
            exit_status = OPAL_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: Unable to reconstruct the snapshot.");
            goto cleanup;
        }
    }

    /*
     * Get the restart command
     */
    if ( OPAL_SUCCESS != (ret = opal_crs_blcr_restart_cmd(snapshot->context_filename, &cr_cmd)) ) {
        exit_status = ret;
        goto cleanup;
    }
    if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Need to shutdown the event engine before this.
     * for some reason the BLCR checkpointer and our event engine don't get
     * along very well.
     */
    opal_progress_finalize();
    (void) mca_base_framework_close(&opal_event_base_framework);

    if (!spawn_child) {
        cr_full_cmd = opal_argv_join(cr_argv, ' ');
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: blcr_restart: SELF: exec :(%s, %s):",
                            blcr_restart_cmd, cr_full_cmd);

        /* execvp() only returns when it failed */
        status = execvp(blcr_restart_cmd, cr_argv);

        if(status < 0) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: SELF: Child failed to execute :(%d):", status);
        }
        opal_show_help("help-opal-crs-blcr.txt", "blcr:restart_failed_exec", true,
                       status,
                       blcr_restart_cmd,
                       cr_full_cmd);

        exit_status = status;
        goto cleanup;
    }
    /*
     * Restart by starting a new process
     */
    else {
        *child_pid = fork();

        if( 0 == *child_pid) {
            /* Child Process */
            /* Keep the joined string in cr_full_cmd so cleanup can free it */
            cr_full_cmd = opal_argv_join(cr_argv, ' ');
            opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                                "crs:blcr: blcr_restart: CHILD: exec :(%s, %s):",
                                blcr_restart_cmd,
                                cr_full_cmd);

            status = execvp(blcr_restart_cmd, cr_argv);

            if(status < 0) {
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: blcr_restart: CHILD: Child failed to execute :(%d):", status);
            }
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: CHILD: execvp returned %d", status);

            exit_status = status;
            goto cleanup;
        }
        else if(*child_pid > 0) {
            /* Parent is done once it is started. */
            ;
        }
        else {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_restart: CHILD: fork failed :(%d):", *child_pid);
            /* A failed fork must not be reported to the caller as success */
            exit_status = OPAL_ERROR;
        }
    }

 cleanup:
    free(cr_cmd);
    free(cr_full_cmd);
    if(NULL != cr_argv)
        opal_argv_free(cr_argv);

    return exit_status;
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_disable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: disable_checkpoint()");
|
|
||||||
/*
|
|
||||||
* Enter the BLCR Critical Section
|
|
||||||
*/
|
|
||||||
cr_enter_cs(client_id);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_blcr_enable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: enable_checkpoint()");
|
|
||||||
/*
|
|
||||||
* Leave the BLCR Critical Section
|
|
||||||
*/
|
|
||||||
cr_leave_cs(client_id);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************************
|
|
||||||
* Local Function Definitions
|
|
||||||
*****************************/
|
|
||||||
static int opal_crs_blcr_thread_callback(void *arg) {
|
|
||||||
const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: thread_callback()");
|
|
||||||
|
|
||||||
OPAL_THREAD_LOCK(&blcr_lock);
|
|
||||||
blcr_current_state = OPAL_CRS_CHECKPOINT;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Allow the checkpoint to be taken, if we requested it
|
|
||||||
*/
|
|
||||||
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
|
|
||||||
if( ckpt_info->requester != my_pid ) {
|
|
||||||
ret = cr_checkpoint(CR_CHECKPOINT_OMIT);
|
|
||||||
blcr_current_state = OPAL_CRS_RUNNING;
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: thread_callback(); WARNING: An external agent attempted to checkpoint this process "
|
|
||||||
"when it did not expect to be checkpointed. Skipping this checkpoint request."
|
|
||||||
" [%d != %d].", ckpt_info->requester, my_pid);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_CRS_PRE_CKPT,
|
|
||||||
OPAL_CR_INC_STATE_PREPARE)) ) {
|
|
||||||
;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = cr_checkpoint(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Restarting
|
|
||||||
*/
|
|
||||||
if ( 0 < ret ) {
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: thread_callback: Restarting.");
|
|
||||||
blcr_current_state = OPAL_CRS_RESTART;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Continuing
|
|
||||||
*/
|
|
||||||
else {
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: thread_callback: Continue.");
|
|
||||||
blcr_current_state = OPAL_CRS_CONTINUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_CRS_POST_CKPT,
|
|
||||||
(blcr_current_state == OPAL_CRS_CONTINUE ?
|
|
||||||
OPAL_CR_INC_STATE_CONTINUE :
|
|
||||||
OPAL_CR_INC_STATE_RESTART))) ) {
|
|
||||||
;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_THREAD_UNLOCK(&blcr_lock);
|
|
||||||
opal_condition_signal(&blcr_cond);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * BLCR signal-context callback.
 *
 * Lets the checkpoint proceed when this process requested it; when requester
 * information is available and names another process, the checkpoint is
 * omitted instead.
 *
 * @param arg  Unused.
 *
 * @return Always 0.
 */
static int opal_crs_blcr_signal_callback(void *arg)
{
    const struct cr_checkpoint_info *info = cr_get_checkpoint_info();
    int ckpt_flags = 0;
    int rc;

#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    /* Somebody else asked for this checkpoint: tell BLCR to leave us out. */
    if( my_pid != info->requester ) {
        ckpt_flags = CR_CHECKPOINT_OMIT;
    }
#endif

    rc = cr_checkpoint(ckpt_flags);
    (void) rc;

    return 0;
}
|
|
||||||
|
|
||||||
static int opal_crs_blcr_restart_cmd(char *fname, char **cmd)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: restart_cmd(%s, ---)", fname);
|
|
||||||
|
|
||||||
if (NULL == fname) {
|
|
||||||
opal_output_verbose(10, opal_crs_base_framework.framework_output,
|
|
||||||
"crs:blcr: restart_cmd: Error: filename is NULL!");
|
|
||||||
return OPAL_CRS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
asprintf(cmd, "%s %s", blcr_restart_cmd, fname);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int blcr_get_checkpoint_filename(char **fname, pid_t pid)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
|
||||||
"crs:blcr: get_checkpoint_filename(--, %d)", pid);
|
|
||||||
|
|
||||||
asprintf(fname, "ompi_blcr_context.%d", pid);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Rebuild a snapshot object from the metadata file stored on disk.
 *
 * Reads the expected component name and the context file token from
 * snapshot->super.metadata_filename, verifies the snapshot was written by this
 * component, fills in snapshot->context_filename and clears the cold_start
 * flag.  The metadata stream is always closed before returning.
 *
 * @param snapshot  Snapshot to reconstruct.
 *
 * @return OPAL_SUCCESS, OPAL_ERROR, or the error returned by
 *         opal_crs_base_extract_expected_component().
 */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    /* NOTE(review): component_name is not freed here; whether the extract
     * helper hands over an allocated string is not visible in this file -
     * confirm ownership.
     */
    char * component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file
     */
    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: blcr_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    snapshot->super.component_name = strdup(component_name);

    /* Compare the component strings to make sure this is our snapshot before going further */
    /* NOTE(review): the length comes from component_name, so any prefix of our
     * name (including an empty string) is accepted - confirm this is intended.
     */
    if ( 0 != strncmp(mca_crs_blcr_component.super.base_version.mca_component_name,
                      component_name, strlen(component_name)) ) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_blcr_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Context Filename
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if( NULL == tmp_argv ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    if( 0 > asprintf(&snapshot->context_filename, "%s/%s", snapshot->super.snapshot_directory, tmp_argv[0]) ) {
        /* asprintf() leaves the pointer undefined when it fails */
        snapshot->context_filename = NULL;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Unable to allocate the context file name");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    if( NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata);
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_CRDEBUG == 1
/*
 * BLCR continue/restart hook for checkpoint/restart-enabled debugging.
 *
 * On restart, the checkpointing thread refreshes the environment and raises
 * blcr_crdebug_refreshed_env; every other thread yields until that flag is
 * set.  After some verbose output, each thread enters the debugger waitpoint.
 */
static void MPIR_checkpoint_debugger_crs_hook(cr_hook_event_t event)
{
    opal_thread_t *self = opal_thread_get_self();
    int wait_for_env = 0;

    if( CR_HOOK_RSTRT_NO_CALLBACKS == event ) {
        /* Non-MPI thread: the MPI thread refreshes the environment for us */
        wait_for_env = 1;
    } else if( CR_HOOK_RSTRT_SIGNAL_CONTEXT == event ) {
        /* MPI thread: only the checkpointing thread does the refresh */
        if( opal_thread_self_compare(checkpoint_thread_id) ) {
            opal_cr_refresh_environ(my_pid);
            blcr_crdebug_refreshed_env = true;
        } else {
            wait_for_env = 1;
        }
    }

    while( wait_for_env && !blcr_crdebug_refreshed_env ) {
        sched_yield();
    }

    /* Debugging output: report which kind of thread is waiting, and where. */
    if( CR_HOOK_CONT_NO_CALLBACKS == event ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Continue (Non-MPI). (%d)",
                            (int)self->t_handle);
    } else if( CR_HOOK_RSTRT_NO_CALLBACKS == event ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Restart (Non-MPI). (%d)",
                            (int)self->t_handle);
    } else if( CR_HOOK_CONT_SIGNAL_CONTEXT == event ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Continue (MPI).");
    } else if( CR_HOOK_RSTRT_SIGNAL_CONTEXT == event ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: MPIR_checkpoint_debugger_crs_hook: Waiting in Restart (MPI).");
    }

    /*
     * Breakpoint function.  It is expected to return immediately when no
     * debugger intends to attach.  An MPI thread reaching this point is most
     * likely the checkpointing thread; for it the call returns immediately so
     * it can prepare the MPI library before telling the debugger that it is
     * safe to attach.
     */
    MPIR_checkpoint_debugger_waitpoint();

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: MPIR_checkpoint_debugger_crs_hook: Finished...");
}
#endif
|
|
@ -1,28 +0,0 @@
|
|||||||
-*- text -*-
|
|
||||||
#
|
|
||||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
||||||
# University Research and Technology
|
|
||||||
# Corporation. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
||||||
# of Tennessee Research Foundation. All rights
|
|
||||||
# reserved.
|
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
# This is the US/English general help file for Open PAL CRS framework.
|
|
||||||
#
|
|
||||||
[blcr:restart_failed_exec]
|
|
||||||
Error: BLCR was not able to restart the process because exec failed.
|
|
||||||
Check the installation of BLCR on all of the machines in your
|
|
||||||
system. The following information may be of help:
|
|
||||||
Return Code : %d
|
|
||||||
BLCR Restart Command : %s
|
|
||||||
Restart Command Line : %s
|
|
@ -1,51 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2004-2007 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
||||||
#
|
|
||||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# Compiler flags substituted by this component's configure.m4.
CFLAGS = $(crs_criu_CFLAGS)
AM_CPPFLAGS = $(crs_criu_CPPFLAGS)

# Files that make up the component, shared by both build flavours below.
sources = crs_criu.h crs_criu_component.c crs_criu_module.c

# The library is built in this directory.  A DSO build produces the
# installable module mca_<type>_<name>.la; a static build produces the
# non-installed convenience library libmca_<type>_<name>.la.

if MCA_BUILD_opal_crs_criu_DSO
component_noinst =
component_install = mca_crs_criu.la
else
component_noinst = libmca_crs_criu.la
component_install =
endif

# DSO flavour
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_criu_la_SOURCES = $(sources)
mca_crs_criu_la_LDFLAGS = -module -avoid-version $(crs_criu_LDFLAGS)
mca_crs_criu_la_LIBADD = \
    $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
    $(crs_criu_LIBS)

# Static flavour
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_criu_la_SOURCES = $(sources)
libmca_crs_criu_la_LDFLAGS = -module -avoid-version $(crs_criu_LDFLAGS)
libmca_crs_criu_la_LIBADD = $(crs_criu_LIBS)
|
|
@ -1,93 +0,0 @@
|
|||||||
# -*- shell-script -*-
|
|
||||||
#
|
|
||||||
# Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
||||||
# Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
||||||
# Copyright (c) 2015 Research Organization for Information Science
|
|
||||||
# and Technology (RIST). All rights reserved.
|
|
||||||
#
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# MCA_opal_crs_criu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Decide whether the CRIU checkpoint/restart component can be built.
# The component is considered only when opal_want_ft_cr is 1 and CRIU was
# requested with --with-criu, given either bare or with an installation
# directory.  The first argument is run when CRIU is usable, the second
# otherwise.  Configure aborts when CRIU was requested but cannot be used.
AC_DEFUN([MCA_opal_crs_criu_CONFIG],[
    OPAL_VAR_SCOPE_PUSH([check_crs_criu_good check_crs_criu_dir_msg check_crs_criu_libdir_msg check_crs_criu_dir check_crs_criu_libdir])
    AC_CONFIG_FILES([opal/mca/crs/criu/Makefile])

    AC_ARG_WITH([criu],
                [AS_HELP_STRING([--with-criu(=DIR)],
                                [Path to CRIU Installation])])
    OPAL_CHECK_WITHDIR([criu], [$with_criu], [include/criu/criu.h])
    AC_ARG_WITH([criu-libdir],
                [AS_HELP_STRING([--with-criu-libdir=DIR],
                                [Search for CRIU libraries in DIR])])
    OPAL_CHECK_WITHDIR([criu-libdir], [$with_criu_libdir], [libcriu.*])

    # If we do not want FT or CRIU, don't compile this component.
    # Both the bare option and the DIR form count as a request.  Accepting
    # only the literal value yes would make the DIR form abort below and
    # would leave the search path handling unreachable.
    AS_IF([test "$opal_want_ft_cr" = "1" && test ! -z "$with_criu" && test "$with_criu" != "no"],
          [check_crs_criu_good=yes],
          [check_crs_criu_good=no])

    # Defaults
    check_crs_criu_dir_msg="compiler default"
    check_crs_criu_libdir_msg="linker default"
    check_crs_criu_dir=""
    check_crs_criu_libdir=""

    # Determine the search paths for the headers and libraries
    AS_IF([test $check_crs_criu_good = yes],
          [AS_IF([test ! -z "$with_criu" && test "$with_criu" != "yes"],
                 [check_crs_criu_dir="$with_criu"
                  check_crs_criu_dir_msg="$with_criu (from --with-criu)"])
           AS_IF([test ! -z "$with_criu_libdir" && test "$with_criu_libdir" != "yes"],
                 [check_crs_criu_libdir="$with_criu_libdir"
                  check_crs_criu_libdir_msg="$with_criu_libdir (from --with-criu-libdir)"])
          ])

    AS_IF([test $check_crs_criu_good = yes],
          [AC_MSG_CHECKING([for CRIU dir])
           AC_MSG_RESULT([$check_crs_criu_dir_msg])
           AC_MSG_CHECKING([for CRIU library dir])
           AC_MSG_RESULT([$check_crs_criu_libdir_msg])
           OPAL_CHECK_PACKAGE([crs_criu_check],
                              [criu/criu.h],
                              [criu],
                              [criu_init_opts],
                              [],
                              [$check_crs_criu_dir],
                              [$check_crs_criu_libdir],
                              [check_crs_criu_good="yes"],
                              [check_crs_criu_good="no"])
          ])

    crs_criu_CFLAGS="$CFLAGS $crs_criu_check_CFLAGS"
    crs_criu_CPPFLAGS="$CPPFLAGS $crs_criu_check_CPPFLAGS"
    crs_criu_LDFLAGS="$LDFLAGS $crs_criu_check_LDFLAGS"
    crs_criu_LIBS="$LIBS $crs_criu_check_LIBS"

    AS_IF([test $check_crs_criu_good = yes],
          [ AC_SUBST([crs_criu_CFLAGS])
            AC_SUBST([crs_criu_CPPFLAGS])
            AC_SUBST([crs_criu_LDFLAGS])
            AC_SUBST([crs_criu_LIBS])
            $1],
          [AS_IF([test ! -z "$with_criu" && test "$with_criu" != "no"],
                 [AC_MSG_WARN([CRIU support requested but not found.  Perhaps you need to enable FT support, or specify the location of the CRIU libraries...?])
                  AC_MSG_ERROR([Aborting.])])
           $2])

    OPAL_VAR_SCOPE_POP
])dnl
|
|
@ -1,88 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
*
|
|
||||||
* CRIU CRS component - support checkpoint/restart using CRIU
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_CRS_CRIU_EXPORT_H
|
|
||||||
#define MCA_CRS_CRIU_EXPORT_H
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
|
|
||||||
#include <criu/criu.h>
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
#define LOG_FILE ("criu.log")
|
|
||||||
|
|
||||||
/* Local Component structures */
|
|
||||||
struct opal_crs_criu_component_t {
|
|
||||||
/* Base CRS component */
|
|
||||||
opal_crs_base_component_t super;
|
|
||||||
|
|
||||||
/* criu log file */
|
|
||||||
char *log_file;
|
|
||||||
/* criu log level */
|
|
||||||
int log_level;
|
|
||||||
/* criu tcp established */
|
|
||||||
bool tcp_established;
|
|
||||||
/* criu shell job */
|
|
||||||
bool shell_job;
|
|
||||||
/* criu external unix sockets */
|
|
||||||
bool ext_unix_sk;
|
|
||||||
/* criu leave tasks in running state after checkpoint */
|
|
||||||
bool leave_running;
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_criu_component_t opal_crs_criu_component_t;
|
|
||||||
|
|
||||||
OPAL_MODULE_DECLSPEC extern opal_crs_criu_component_t mca_crs_criu_component;
|
|
||||||
|
|
||||||
int opal_crs_criu_component_query(mca_base_module_t **module, int *priority);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Module functions
|
|
||||||
*/
|
|
||||||
int opal_crs_criu_module_init(void);
|
|
||||||
int opal_crs_criu_module_finalize(void);
|
|
||||||
int opal_crs_criu_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state);
|
|
||||||
|
|
||||||
int opal_crs_criu_restart(opal_crs_base_snapshot_t *snapshot,
|
|
||||||
bool spawn_child, pid_t *child_pid);
|
|
||||||
|
|
||||||
int opal_crs_criu_disable_checkpoint(void);
|
|
||||||
int opal_crs_criu_enable_checkpoint(void);
|
|
||||||
|
|
||||||
int opal_crs_criu_prelaunch(int32_t rank, char *base_snapshot_dir, char **app,
|
|
||||||
char **cwd, char ***argv, char ***env);
|
|
||||||
|
|
||||||
int opal_crs_criu_reg_thread(void);
|
|
||||||
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* MCA_CRS_CRIU_EXPORT_H */
|
|
@ -1,213 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
|
|
||||||
#include "opal/constants.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
#include "crs_criu.h"
|
|
||||||
|
|
||||||
/* Local functionality */
|
|
||||||
static int crs_criu_register(void);
|
|
||||||
static int crs_criu_open(void);
|
|
||||||
static int crs_criu_close(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Instantiate the public struct with all of our public information
|
|
||||||
* and pointer to our public functions in it
|
|
||||||
*/
|
|
||||||
opal_crs_criu_component_t mca_crs_criu_component = {
|
|
||||||
/* First do the base component stuff */
|
|
||||||
{
|
|
||||||
/* Handle the general mca_component_t struct containing
|
|
||||||
* meta information about the component itself
|
|
||||||
*/
|
|
||||||
.base_version = {
|
|
||||||
OPAL_CRS_BASE_VERSION_2_0_0,
|
|
||||||
|
|
||||||
/* Component name and version */
|
|
||||||
.mca_component_name = "criu",
|
|
||||||
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
|
||||||
OPAL_RELEASE_VERSION),
|
|
||||||
|
|
||||||
/* Component open and close functions */
|
|
||||||
.mca_open_component = crs_criu_open,
|
|
||||||
.mca_close_component = crs_criu_close,
|
|
||||||
.mca_query_component = opal_crs_criu_component_query,
|
|
||||||
.mca_register_component_params = crs_criu_register,
|
|
||||||
},
|
|
||||||
.base_data = {
|
|
||||||
/* The component is checkpoint ready */
|
|
||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
||||||
},
|
|
||||||
|
|
||||||
.verbose = 0,
|
|
||||||
.output_handle = -1,
|
|
||||||
},
|
|
||||||
/* criu log file */
|
|
||||||
LOG_FILE,
|
|
||||||
/* criu log level */
|
|
||||||
0,
|
|
||||||
/* criu tcp established */
|
|
||||||
true,
|
|
||||||
/* criu shell job */
|
|
||||||
true,
|
|
||||||
/* criu external unix sockets */
|
|
||||||
true,
|
|
||||||
/* criu leave tasks in running state after checkpoint */
|
|
||||||
true
|
|
||||||
};
|
|
||||||
|
|
||||||
static int crs_criu_register(void)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
mca_base_component_t *component = &mca_crs_criu_component.super.base_version;
|
|
||||||
|
|
||||||
mca_crs_criu_component.super.priority = 10;
|
|
||||||
ret = mca_base_component_var_register(component, "priority",
|
|
||||||
"Priority of the CRS criu component (default: 10)",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
|
|
||||||
&mca_crs_criu_component.super.priority);
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
mca_crs_criu_component.super.verbose = 0;
|
|
||||||
ret = mca_base_component_var_register(component, "verbose",
|
|
||||||
"Verbose level for the CRS criu component",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.super.verbose);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "log", "Name of CRIU logfile (default: criu.log)",
|
|
||||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.log_file);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "log_level",
|
|
||||||
"Verbose level for the CRS criu component (default: 0)",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.log_level);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "tcp_established",
|
|
||||||
"Checkpoint/restore established TCP connections (default: true)",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.tcp_established);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "shell_job",
|
|
||||||
"Allow to dump and restore shell jobs (default: true)",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.shell_job);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "ext_unix_sk",
|
|
||||||
"Allow external unix connections (default: true)",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.ext_unix_sk);
|
|
||||||
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mca_base_component_var_register(component, "leave_running",
|
|
||||||
"Leave tasks in running state after checkpoint (default: true)",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_criu_component.leave_running);
|
|
||||||
|
|
||||||
return (0 > ret) ? ret : OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_criu_open(void)
|
|
||||||
{
|
|
||||||
int oh;
|
|
||||||
|
|
||||||
/* If there is a custom verbose level for this component than use it
|
|
||||||
* otherwise take our parents level and output channel
|
|
||||||
*/
|
|
||||||
if (0 != mca_crs_criu_component.super.verbose) {
|
|
||||||
mca_crs_criu_component.super.output_handle = opal_output_open(NULL);
|
|
||||||
opal_output_set_verbosity(mca_crs_criu_component.super.output_handle,
|
|
||||||
mca_crs_criu_component.super.verbose);
|
|
||||||
} else {
|
|
||||||
mca_crs_criu_component.super.output_handle = opal_crs_base_framework.framework_output;
|
|
||||||
}
|
|
||||||
|
|
||||||
oh = mca_crs_criu_component.super.output_handle;
|
|
||||||
/*
|
|
||||||
* Debug output
|
|
||||||
*/
|
|
||||||
opal_output_verbose(10, oh, "crs:criu: open()");
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: priority = %d",
|
|
||||||
mca_crs_criu_component.super.priority);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: verbosity = %d",
|
|
||||||
mca_crs_criu_component.super.verbose);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: log_file = %s",
|
|
||||||
mca_crs_criu_component.log_file);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: log_level = %d",
|
|
||||||
mca_crs_criu_component.log_level);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: tcp_established = %d",
|
|
||||||
mca_crs_criu_component.tcp_established);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: shell_job = %d",
|
|
||||||
mca_crs_criu_component.shell_job);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: ext_unix_sk = %d",
|
|
||||||
mca_crs_criu_component.ext_unix_sk);
|
|
||||||
opal_output_verbose(20, oh, "crs:criu: open: leave_running = %d",
|
|
||||||
mca_crs_criu_component.leave_running);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_criu_close(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: close()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
@ -1,261 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
|
||||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <errno.h>
|
|
||||||
|
|
||||||
#include "opal/util/show_help.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "opal/constants.h"
|
|
||||||
|
|
||||||
#include "opal/mca/base/mca_base_var.h"
|
|
||||||
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
|
|
||||||
#include "crs_criu.h"
|
|
||||||
|
|
||||||
/* CRIU module */
|
|
||||||
static opal_crs_base_module_t criu_module = {
|
|
||||||
/* Initialization Function */
|
|
||||||
opal_crs_criu_module_init,
|
|
||||||
/* Finalization Function */
|
|
||||||
opal_crs_criu_module_finalize,
|
|
||||||
|
|
||||||
/* Checkpoint interface */
|
|
||||||
opal_crs_criu_checkpoint,
|
|
||||||
|
|
||||||
/* Restart Command Access */
|
|
||||||
opal_crs_criu_restart,
|
|
||||||
|
|
||||||
/* Disable checkpoints */
|
|
||||||
opal_crs_criu_disable_checkpoint,
|
|
||||||
/* Enable checkpoints */
|
|
||||||
opal_crs_criu_enable_checkpoint,
|
|
||||||
|
|
||||||
/* Prelaunch */
|
|
||||||
opal_crs_criu_prelaunch,
|
|
||||||
|
|
||||||
/* Register Thread */
|
|
||||||
opal_crs_criu_reg_thread
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Snapshot Class Functions */
|
|
||||||
OBJ_CLASS_DECLARATION(opal_crs_criu_snapshot_t);
|
|
||||||
|
|
||||||
struct opal_crs_criu_snapshot_t {
|
|
||||||
/* Base CRS snapshot type */
|
|
||||||
opal_crs_base_snapshot_t super;
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_criu_snapshot_t opal_crs_criu_snapshot_t;
|
|
||||||
|
|
||||||
void opal_crs_criu_construct(opal_crs_criu_snapshot_t *obj);
|
|
||||||
void opal_crs_criu_destruct(opal_crs_criu_snapshot_t *obj);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(opal_crs_criu_snapshot_t,
|
|
||||||
opal_crs_base_snapshot_t,
|
|
||||||
opal_crs_criu_construct,
|
|
||||||
opal_crs_criu_destruct);
|
|
||||||
|
|
||||||
/* Snapshot constructor: stamps the new snapshot with a heap copy of this
 * component's name. */
void opal_crs_criu_construct(opal_crs_criu_snapshot_t *snapshot)
{
    const char *name = mca_crs_criu_component.super.base_version.mca_component_name;

    snapshot->super.component_name = strdup(name);
}
|
|
||||||
|
|
||||||
/*
 * Snapshot destructor: intentionally empty.
 * NOTE(review): component_name is strdup()'ed by the constructor and not
 * released here -- presumably the base snapshot destructor frees it; confirm.
 */
void opal_crs_criu_destruct(opal_crs_criu_snapshot_t *snapshot)
{
}
|
|
||||||
|
|
||||||
/*
 * Component query: report the CRIU module and the configured priority.
 *
 * module   - out: receives the address of the static CRIU module table
 * priority - out: receives the component priority
 *
 * Always returns OPAL_SUCCESS.
 */
int opal_crs_criu_component_query(mca_base_module_t **module, int *priority)
{
    const int stream = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, stream, "crs:criu: component_query()");

    *priority = mca_crs_criu_component.super.priority;
    *module = (mca_base_module_t *) &criu_module;

    return OPAL_SUCCESS;
}
|
|
||||||
|
|
||||||
int opal_crs_criu_module_init(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: module_init()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_criu_module_finalize(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: module_finalize()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Print a message for a negative return code from the CRIU library.
 *
 * ret - the (negative errno-style) value returned by a criu_*() call
 * pid - PID being checkpointed; only used to tag the message
 *
 * Codes not listed below are reported as unknown.
 */
static void criu_error(int ret, pid_t pid)
{
    switch (ret) {
    case -EBADE:
        opal_output(0, "crs:criu:(PID:%d):RPC has returned fail", pid);
        break;
    case -ECONNREFUSED:
        opal_output(0, "crs:criu:(PID:%d):Unable to connect to CRIU", pid);
        break;
    case -ECOMM:
        opal_output(0, "crs:criu:(PID:%d):Unable to send/recv msg to/from CRIU", pid);
        break;
    case -EINVAL:
        /* Adjacent literals are concatenated verbatim, so the separating
         * space has to be part of one of them. */
        opal_output(0, "crs:criu:(PID:%d):CRIU doesn't support this type of request. "
                    "You should probably update CRIU", pid);
        break;
    case -EBADMSG:
        opal_output(0, "crs:criu:(PID:%d):Unexpected response from CRIU. "
                    "You should probably update CRIU", pid);
        break;
    default:
        opal_output(0, "crs:criu:(PID:%d):Unknown error type code. "
                    "You should probably update CRIU", pid);
        break;
    }
}
|
|
||||||
|
|
||||||
int opal_crs_criu_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
int fd = 0;
|
|
||||||
int oh = mca_crs_criu_component.super.output_handle;
|
|
||||||
opal_crs_criu_snapshot_t *snapshot = NULL;
|
|
||||||
char *dest = NULL;
|
|
||||||
|
|
||||||
opal_output_verbose(10, oh, "crs:criu: checkpoint(%d, ---)", pid);
|
|
||||||
|
|
||||||
snapshot = (opal_crs_criu_snapshot_t *)base_snapshot;
|
|
||||||
snapshot->super.component_name = strdup(mca_crs_criu_component.super.base_version.mca_component_name);
|
|
||||||
|
|
||||||
if (NULL == snapshot->super.metadata) {
|
|
||||||
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a"))) {
|
|
||||||
opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open the file (%s)",
|
|
||||||
snapshot->super.metadata_filename);
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
|
|
||||||
|
|
||||||
fclose(snapshot->super.metadata);
|
|
||||||
snapshot->super.metadata = NULL;
|
|
||||||
|
|
||||||
ret = criu_init_opts();
|
|
||||||
|
|
||||||
if (ret < 0) {
|
|
||||||
criu_error(ret, pid);
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_output_verbose(10, oh, "crs:criu: criu_init_opts() returned %d", ret);
|
|
||||||
|
|
||||||
dest = snapshot->super.snapshot_directory;
|
|
||||||
opal_output_verbose(10, oh, "crs:criu: opening snapshot directory %s", dest);
|
|
||||||
fd = open(dest, O_DIRECTORY);
|
|
||||||
|
|
||||||
if (fd < 0) {
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(oh, "crs:criu: checkpoint(): Error: Unable to open checkpoint "
|
|
||||||
"directory (%s) for pid (%d)", dest, pid);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* http://criu.org/C_API */
|
|
||||||
criu_set_images_dir_fd(fd);
|
|
||||||
criu_set_pid(pid);
|
|
||||||
|
|
||||||
criu_set_log_file(mca_crs_criu_component.log_file);
|
|
||||||
criu_set_log_level(mca_crs_criu_component.log_level);
|
|
||||||
criu_set_tcp_established(mca_crs_criu_component.tcp_established);
|
|
||||||
criu_set_shell_job(mca_crs_criu_component.shell_job);
|
|
||||||
criu_set_ext_unix_sk(mca_crs_criu_component.ext_unix_sk);
|
|
||||||
criu_set_leave_running(mca_crs_criu_component.leave_running);
|
|
||||||
ret = criu_dump();
|
|
||||||
|
|
||||||
if (ret < 0) {
|
|
||||||
criu_error(ret, pid);
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
*state = OPAL_CRS_CONTINUE;
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
|
|
||||||
if (fd > 0) {
|
|
||||||
close(fd);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (OPAL_CRS_ERROR == *state) {
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Restart entry point: a stub that only traces its own name; none of the
 * arguments are used.  Always returns OPAL_SUCCESS. */
int opal_crs_criu_restart(opal_crs_base_snapshot_t *snapshot,
                          bool spawn_child, pid_t *child_pid)
{
    const int stream = mca_crs_criu_component.super.output_handle;

    opal_output_verbose(10, stream, "crs:criu: %s", __func__);

    return OPAL_SUCCESS;
}
|
|
||||||
|
|
||||||
int opal_crs_criu_disable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: %s", __func__);
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_criu_enable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: %s", __func__);
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_criu_prelaunch(int32_t rank, char *base_snapshot_dir,
|
|
||||||
char **app, char **cwd, char ***argv,
|
|
||||||
char ***env)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: %s", __func__);
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_criu_reg_thread(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_criu_component.super.output_handle,
|
|
||||||
"crs:criu: %s", __func__);
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
@ -1,7 +0,0 @@
|
|||||||
#
|
|
||||||
# owner/status file
|
|
||||||
# owner: institution that is responsible for this package
|
|
||||||
# status: e.g. active, maintenance, unmaintained
|
|
||||||
#
|
|
||||||
owner: CISCO
|
|
||||||
status: maintenance
|
|
@ -1,43 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2010 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# Compile flags for this component; the crs_dmtcp_* values are substituted
# by the component's configure.m4.
CFLAGS = $(crs_dmtcp_CFLAGS)
AM_CPPFLAGS = $(crs_dmtcp_CPPFLAGS)

sources = \
    crs_dmtcp.h \
    crs_dmtcp_component.c \
    crs_dmtcp_module.c

# The component is built either as an installed DSO named
# mca_<type>_<name>.la or as a non-installed convenience library named
# libmca_<type>_<name>.la (static builds).  Exactly one of the two
# variables below is non-empty.

if MCA_BUILD_opal_crs_dmtcp_DSO
component_noinst =
component_install = mca_crs_dmtcp.la
else
component_noinst = libmca_crs_dmtcp.la
component_install =
endif

# DSO build
mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_crs_dmtcp_la_SOURCES = $(sources)
mca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
mca_crs_dmtcp_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
    $(crs_dmtcp_LIBS)

# Static build
noinst_LTLIBRARIES = $(component_noinst)
libmca_crs_dmtcp_la_SOURCES = $(sources)
libmca_crs_dmtcp_la_LDFLAGS = -module -avoid-version $(crs_dmtcp_LDFLAGS)
libmca_crs_dmtcp_la_LIBADD = $(crs_dmtcp_LIBS)
|
|
@ -1,140 +0,0 @@
|
|||||||
# -*- shell-script -*-
|
|
||||||
#
|
|
||||||
# Copyright (c) 2010 The Trustees of Indiana University.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2015 Research Organization for Information Science
|
|
||||||
# and Technology (RIST). All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# MCA_opal_crs_dmtcp_CONFIG([action-if-found], [action-if-not-found])
|
|
||||||
# -----------------------------------------------------------
|
|
||||||
AC_DEFUN([MCA_opal_crs_dmtcp_CONFIG],[
    AC_CONFIG_FILES([opal/mca/crs/dmtcp/Makefile])

    OPAL_VAR_SCOPE_PUSH([opal_check_crs_dmtcp_good opal_opal_check_crs_dmtcp_save_CPPFLAGS opal_opal_check_crs_dmtcp_save_LDFLAGS opal_opal_check_crs_dmtcp_save_LIBS opal_check_crs_dmtcp_dir_msg opal_check_crs_dmtcp_libdir_msg opal_check_crs_dmtcp_dir opal_check_crs_dmtcp_libdir])

    opal_check_crs_dmtcp_good="no"

    # Configure option to specify where to look for DMTCP headers
    # --with-dmtcp(=DIR)
    AC_ARG_WITH([dmtcp],
                [AS_HELP_STRING([--with-dmtcp(=DIR)],
                                [Path to DMTCP Installation])])
    OPAL_CHECK_WITHDIR([dmtcp], [$with_dmtcp], [include/mtcp.h])

    # Configure option to specify where to look for DMTCP libraries
    # (Default: $with_dmtcp/lib)
    # --with-dmtcp-libdir=DIR
    AC_ARG_WITH([dmtcp-libdir],
                [AS_HELP_STRING([--with-dmtcp-libdir=DIR],
                                [Search for DMTCP libraries in DIR])])
    OPAL_CHECK_WITHDIR([dmtcp-libdir], [$with_dmtcp_libdir], [libmtcp.so])

    #
    # Do not build this component if Open MPI was configured without
    # Checkpoint/Restart support, or if the user explicitly asked -not-
    # to build the DMTCP component.
    #
    AS_IF([test "$opal_want_ft" = "0" || test "$with_dmtcp" = "no"],
          [opal_check_crs_dmtcp_good="no"],
          [opal_check_crs_dmtcp_good="yes"])

    # Save some flags
    opal_opal_check_crs_dmtcp_save_CPPFLAGS=$CPPFLAGS
    opal_opal_check_crs_dmtcp_save_LDFLAGS=$LDFLAGS
    opal_opal_check_crs_dmtcp_save_LIBS=$LIBS

    #
    # Now to check if the library is usable
    #
    opal_check_crs_dmtcp_dir_msg="compiler default"
    opal_check_crs_dmtcp_libdir_msg="linker default"
    opal_check_crs_dmtcp_dir=""
    opal_check_crs_dmtcp_libdir=""

    # Determine the search paths for the headers and libraries
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "yes"],
                 [opal_check_crs_dmtcp_dir="$with_dmtcp"
                  opal_check_crs_dmtcp_dir_msg="$with_dmtcp (from --with-dmtcp)"])
           AS_IF([test ! -z "$with_dmtcp_libdir" && test "$with_dmtcp_libdir" != "yes"],
                 [opal_check_crs_dmtcp_libdir="$with_dmtcp_libdir"
                  opal_check_crs_dmtcp_libdir_msg="$with_dmtcp_libdir (from --with-dmtcp-libdir)"])
          ])

    # Look for DMTCP.
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [AC_MSG_CHECKING([for DMTCP dir])
           AC_MSG_RESULT([$opal_check_crs_dmtcp_dir_msg])
           AC_MSG_CHECKING([for DMTCP library dir])
           AC_MSG_RESULT([$opal_check_crs_dmtcp_libdir_msg])
           OPAL_CHECK_PACKAGE([crs_dmtcp_check],
                              [mtcp.h],
                              [mtcp],
                              [mtcp_init],
                              [],
                              [$opal_check_crs_dmtcp_dir],
                              [$opal_check_crs_dmtcp_libdir],
                              [opal_check_crs_dmtcp_good="yes"],
                              [opal_check_crs_dmtcp_good="no"])
          ])

    # When we restart a thread, we use execlp() to exec the "mtcp_restart"
    # command.  We don't care what its path is, but it does need to exist in
    # the PATH.
    AC_CHECK_PROG([mtcp_restart_command_exists], ["mtcp_restart"], ["yes"], ["no"])
    AS_IF([test "$mtcp_restart_command_exists" = "no"],
          [opal_check_crs_dmtcp_good="no"
           AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "no"],
                 [AC_MSG_WARN([mtcp_restart not found in PATH.])
                  AC_MSG_ERROR([Aborting.])])])

    #
    # If '-lmtcp' or
    # '-I' or '-L' was needed to link to MTCP, then OPAL_CHECK_PACKAGE
    # sets the crs_dmtcp_check_* variables, which we use below.
    #
    crs_dmtcp_CFLAGS="$CFLAGS $crs_dmtcp_check_CFLAGS"
    crs_dmtcp_CPPFLAGS="$CPPFLAGS $crs_dmtcp_check_CPPFLAGS"
    crs_dmtcp_LDFLAGS="$LDFLAGS $crs_dmtcp_check_LDFLAGS"
    crs_dmtcp_LIBS="$crs_dmtcp_check_LIBS $LIBS"

    # NOTE(review): LDFLAGS and LIBS are not restored to their saved values;
    # the DMTCP flags are prepended to them for everything configured after
    # this macro.  That looks deliberate, but confirm it is still wanted.
    CPPFLAGS=$opal_opal_check_crs_dmtcp_save_CPPFLAGS
    LDFLAGS="$crs_dmtcp_check_LDFLAGS $opal_opal_check_crs_dmtcp_save_LDFLAGS"
    LIBS="$crs_dmtcp_LIBS $opal_opal_check_crs_dmtcp_save_LIBS"

    AC_SUBST([crs_dmtcp_CFLAGS])
    AC_SUBST([crs_dmtcp_CPPFLAGS])
    AC_SUBST([crs_dmtcp_LDFLAGS])
    AC_SUBST([crs_dmtcp_LIBS])

    # If all is good at this point then run the caller's action-if-found
    # (exactly once).  If all is not good at this point and DMTCP was
    # explicitly requested, then error out; otherwise run the caller's
    # action-if-not-found.
    AS_IF([test "$opal_check_crs_dmtcp_good" = "yes"],
          [$1],
          [AS_IF([test ! -z "$with_dmtcp" && test "$with_dmtcp" != "no"],
                 [AC_MSG_WARN([DMTCP support requested but not found.  Perhaps you need to specify the location of the DMTCP libraries.])
                  AC_MSG_ERROR([Aborting.])])
           $2])

    OPAL_VAR_SCOPE_POP
])dnl
|
|
@ -1,87 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
*
|
|
||||||
* DMTCP CRS component
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_CRS_DMTCP_EXPORT_H
|
|
||||||
#define MCA_CRS_DMTCP_EXPORT_H
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
|
|
||||||
/* JJH NOTE: Include your library header here */
|
|
||||||
/* #include <libmtcp.h> */
|
|
||||||
#include <mtcp.h>
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local Component Structure
|
|
||||||
*/
|
|
||||||
struct opal_crs_dmtcp_component_t {
|
|
||||||
/** Base CRS component */
|
|
||||||
opal_crs_base_component_t super;
|
|
||||||
|
|
||||||
/** JJH: Add additional items here as needed internally */
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_dmtcp_component_t opal_crs_dmtcp_component_t;
|
|
||||||
OPAL_MODULE_DECLSPEC extern opal_crs_dmtcp_component_t mca_crs_dmtcp_component;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Component query command
|
|
||||||
* - Called during opal_init() to determine if this component should be selected.
|
|
||||||
*/
|
|
||||||
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Module functions
|
|
||||||
*/
|
|
||||||
int opal_crs_dmtcp_module_init(void);
|
|
||||||
int opal_crs_dmtcp_module_finalize(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Actual CRS funcationality
|
|
||||||
*/
|
|
||||||
int opal_crs_dmtcp_checkpoint( pid_t pid,
|
|
||||||
opal_crs_base_snapshot_t *snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state);
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_restart( opal_crs_base_snapshot_t *snapshot,
|
|
||||||
bool spawn_child,
|
|
||||||
pid_t *child_pid);
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_disable_checkpoint(void);
|
|
||||||
int opal_crs_dmtcp_enable_checkpoint(void);
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_prelaunch(int32_t rank,
|
|
||||||
char *base_snapshot_dir,
|
|
||||||
char **app,
|
|
||||||
char **cwd,
|
|
||||||
char ***argv,
|
|
||||||
char ***env);
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_reg_thread(void);
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* MCA_CRS_DMTCP_EXPORT_H */
|
|
@ -1,133 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
|
|
||||||
#include "opal/constants.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
#include "crs_dmtcp.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local functionality
|
|
||||||
*/
|
|
||||||
static int crs_dmtcp_register (void);
|
|
||||||
static int crs_dmtcp_open(void);
|
|
||||||
static int crs_dmtcp_close(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Instantiate the public struct with all of our public information
|
|
||||||
* and pointer to our public functions in it
|
|
||||||
*/
|
|
||||||
opal_crs_dmtcp_component_t mca_crs_dmtcp_component = {
|
|
||||||
/* First do the base component stuff */
|
|
||||||
{
|
|
||||||
/* Handle the general mca_component_t struct containing
|
|
||||||
* meta information about the component itself
|
|
||||||
*/
|
|
||||||
.base_version = {
|
|
||||||
OPAL_CRS_BASE_VERSION_2_0_0,
|
|
||||||
|
|
||||||
/* Component name and version */
|
|
||||||
.mca_component_name = "dmtcp",
|
|
||||||
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
|
||||||
OPAL_RELEASE_VERSION),
|
|
||||||
|
|
||||||
/* Component open and close functions */
|
|
||||||
.mca_open_component = crs_dmtcp_open,
|
|
||||||
.mca_close_component = crs_dmtcp_close,
|
|
||||||
.mca_query_component = opal_crs_dmtcp_component_query,
|
|
||||||
.mca_register_component_params = crs_dmtcp_register,
|
|
||||||
},
|
|
||||||
.base_data = {
|
|
||||||
/* The component is checkpoint ready */
|
|
||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
||||||
},
|
|
||||||
|
|
||||||
.verbose = 0,
|
|
||||||
.output_handle = -1
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static int crs_dmtcp_register (void)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
/*
|
|
||||||
* User can adjust the relative priority of this component with respect
|
|
||||||
* to other CRS components available for selection.
|
|
||||||
*/
|
|
||||||
mca_crs_dmtcp_component.super.priority = 20
|
|
||||||
ret = mca_base_component_var_register (&mca_crs_dmtcp_component.super.base_version,
|
|
||||||
"priority", "Priority of the CRS dmtcp component "
|
|
||||||
"(default: 20)", MCA_BASE_VAR_TYPE_INT, NULL,
|
|
||||||
MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9,
|
|
||||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
|
||||||
&mca_crs_dmtcp_component.super.priority);
|
|
||||||
if (0 > ret) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Adjust the verbosity level for this component. Default off or 0.
|
|
||||||
*/
|
|
||||||
mca_crs_dmtcp_component.super.verbose = 0;
|
|
||||||
ret = mca_base_component_var_register (&mca_crs_dmtcp_component.super.base_version,
|
|
||||||
"verbose",
|
|
||||||
"Verbose level for the CRS dmtcp component",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL,MCA_BASE_VAR_FLAG_SETTABLE,
|
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_crs_dmtcp_component.super.verbose);
|
|
||||||
return (0 > ret) ? ret : OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_dmtcp_open(void)
|
|
||||||
{
|
|
||||||
/* If there is a custom verbose level for this component than use it
|
|
||||||
* otherwise take our parents level and output channel
|
|
||||||
*/
|
|
||||||
if ( 0 != mca_crs_dmtcp_component.super.verbose) {
|
|
||||||
mca_crs_dmtcp_component.super.output_handle = opal_output_open(NULL);
|
|
||||||
opal_output_set_verbosity(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
mca_crs_dmtcp_component.super.verbose);
|
|
||||||
} else {
|
|
||||||
mca_crs_dmtcp_component.super.output_handle = opal_crs_base_framework.framework_output;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Debug output
|
|
||||||
*/
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: open()");
|
|
||||||
opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: open: priority = %d",
|
|
||||||
mca_crs_dmtcp_component.super.priority);
|
|
||||||
opal_output_verbose(20, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: open: verbosity = %d",
|
|
||||||
mca_crs_dmtcp_component.super.verbose);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int crs_dmtcp_close(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: close()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
@ -1,709 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
|
|
||||||
#include <sched.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/wait.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <sys/syscall.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "opal/constants.h"
|
|
||||||
|
|
||||||
#include "opal/mca/base/mca_base_var.h"
|
|
||||||
|
|
||||||
#include "opal/threads/mutex.h"
|
|
||||||
#include "opal/threads/condition.h"
|
|
||||||
|
|
||||||
#include "opal/mca/event/event.h"
|
|
||||||
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
|
|
||||||
#include "crs_dmtcp.h"
|
|
||||||
|
|
||||||
#define MTCP_RESTART_COMMAND "mtcp_restart"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* DMTCP module
|
|
||||||
*/
|
|
||||||
static opal_crs_base_module_t dmtcp_module = {
|
|
||||||
/** Initialization Function */
|
|
||||||
opal_crs_dmtcp_module_init,
|
|
||||||
/** Finalization Function */
|
|
||||||
opal_crs_dmtcp_module_finalize,
|
|
||||||
|
|
||||||
/** Checkpoint interface */
|
|
||||||
opal_crs_dmtcp_checkpoint,
|
|
||||||
|
|
||||||
/** Restart Command Access */
|
|
||||||
opal_crs_dmtcp_restart,
|
|
||||||
|
|
||||||
/** Disable checkpoints */
|
|
||||||
opal_crs_dmtcp_disable_checkpoint,
|
|
||||||
/** Enable checkpoints */
|
|
||||||
opal_crs_dmtcp_enable_checkpoint,
|
|
||||||
|
|
||||||
/** Prelaunch */
|
|
||||||
opal_crs_dmtcp_prelaunch,
|
|
||||||
|
|
||||||
/** Register Thread */
|
|
||||||
opal_crs_dmtcp_reg_thread
|
|
||||||
};
|
|
||||||
|
|
||||||
/***************************
|
|
||||||
* Snapshot Class Functions
|
|
||||||
***************************/
|
|
||||||
OBJ_CLASS_DECLARATION(opal_crs_dmtcp_snapshot_t);
|
|
||||||
|
|
||||||
struct opal_crs_dmtcp_snapshot_t {
|
|
||||||
/** Base CRS snapshot type */
|
|
||||||
opal_crs_base_snapshot_t super;
|
|
||||||
char * context_filename;
|
|
||||||
};
|
|
||||||
typedef struct opal_crs_dmtcp_snapshot_t opal_crs_dmtcp_snapshot_t;
|
|
||||||
|
|
||||||
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *obj);
|
|
||||||
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *obj);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(opal_crs_dmtcp_snapshot_t,
|
|
||||||
opal_crs_base_snapshot_t,
|
|
||||||
opal_crs_dmtcp_construct,
|
|
||||||
opal_crs_dmtcp_destruct);
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* Local Functions
|
|
||||||
******************/
|
|
||||||
/* Restart helper: rebuilds a snapshot from its metadata file. */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot);

/* Checkpoint helper: fills in the context filename and full_ckpt_path. */
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot);

/* Callbacks handed to mtcp_set_callbacks() in module_init. */
static void dmtcp_sleep_between_ckpt_callback(int interval);
static void dmtcp_pre_ckpt_callback(char **ckpt_filename);
static void dmtcp_post_ckpt_callback(int is_restarting,
                                     char *mtcp_restore_argv_start_addr);
static int dmtcp_should_ckpt_fd_callback(int fd);
|
|
||||||
|
|
||||||
/*************************
|
|
||||||
* Local Global Variables
|
|
||||||
*************************/
|
|
||||||
/* Path of the checkpoint file; read by the pre_ckpt callback. */
static char *full_ckpt_path = NULL;

/* Signalled by checkpoint() to ask the MTCP thread to take a checkpoint. */
static pthread_cond_t checkpoint_cond = PTHREAD_COND_INITIALIZER;

/* Signalled by the sleep_between_ckpt callback to resume checkpoint(). */
static pthread_cond_t checkpoint_done_cond = PTHREAD_COND_INITIALIZER;

/* Mutex paired with both condition variables above. */
static pthread_mutex_t checkpoint_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Outcome recorded by the post_ckpt callback and reported by checkpoint(). */
static int post_ckpt_state;
|
|
||||||
|
|
||||||
/*
 * Snapshot constructor: no context file yet, and the snapshot is tagged
 * with a heap copy of this component's name.
 */
void opal_crs_dmtcp_construct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    const char *name =
        mca_crs_dmtcp_component.super.base_version.mca_component_name;

    snapshot->super.component_name = strdup(name);
    snapshot->context_filename = NULL;
}
|
|
||||||
|
|
||||||
/*
 * Snapshot destructor: releases the context filename, if any.
 */
void opal_crs_dmtcp_destruct(opal_crs_dmtcp_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no guard is required. */
    free(snapshot->context_filename);
    snapshot->context_filename = NULL;
}
|
|
||||||
|
|
||||||
/*****************
|
|
||||||
* MCA Functions
|
|
||||||
*****************/
|
|
||||||
/*
 * Component query: hand back the DMTCP module table and the configured
 * priority. Always returns OPAL_SUCCESS.
 */
int opal_crs_dmtcp_component_query(mca_base_module_t **module, int *priority)
{
    const int handle = mca_crs_dmtcp_component.super.output_handle;

    opal_output_verbose(10, handle, "crs:dmtcp: component_query()");

    *module = (mca_base_module_t *)&dmtcp_module;
    *priority = mca_crs_dmtcp_component.super.priority;

    return OPAL_SUCCESS;
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_module_init(void)
|
|
||||||
{
|
|
||||||
char *temp_checkpoint_name;
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: module_init()");
|
|
||||||
|
|
||||||
/*
|
|
||||||
* JJH NOTE: Call any initialization routines you require
|
|
||||||
*/
|
|
||||||
mtcp_set_callbacks(dmtcp_sleep_between_ckpt_callback, /* sleep_between_ckpt */
|
|
||||||
dmtcp_pre_ckpt_callback, /* pre_ckpt */
|
|
||||||
dmtcp_post_ckpt_callback, /* post_ckpt */
|
|
||||||
dmtcp_should_ckpt_fd_callback, /* ckpt_fd */
|
|
||||||
NULL); /* write_ckpt_header */
|
|
||||||
|
|
||||||
/* This serves to simply initialize MTCP. The checkpoint file will
|
|
||||||
* actually be set by our pre_ckpt callback (which takes it from the
|
|
||||||
* snapshot given to the CRS checkpoint function), and the interval will be
|
|
||||||
* ignored, substituted for a synchronization signal that is handled by our
|
|
||||||
* sleep_between_ckpt callback.
|
|
||||||
*/
|
|
||||||
|
|
||||||
asprintf(&temp_checkpoint_name, "checkpoint.dmtcp.%ld", syscall(SYS_getpid));
|
|
||||||
mtcp_init(temp_checkpoint_name, 0, 1);
|
|
||||||
mtcp_ok();
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: leaving module_init()");
|
|
||||||
|
|
||||||
free(temp_checkpoint_name);
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_module_finalize(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: module_finalize()");
|
|
||||||
|
|
||||||
/*
|
|
||||||
* JJH NOTE: Call any finalization routines you require
|
|
||||||
*/
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_prelaunch(int32_t rank,
|
|
||||||
char *base_snapshot_dir,
|
|
||||||
char **app,
|
|
||||||
char **cwd,
|
|
||||||
char ***argv,
|
|
||||||
char ***env)
|
|
||||||
{
|
|
||||||
char * tmp_env_var = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The below should be left untouched for now
|
|
||||||
*/
|
|
||||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
|
||||||
opal_setenv(tmp_env_var,
|
|
||||||
"0", true, env);
|
|
||||||
free(tmp_env_var);
|
|
||||||
tmp_env_var = NULL;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: leaving module_prelaunch()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_reg_thread(void)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* JJH NOTE: If you require that all threads that may call into MTCP
|
|
||||||
* explicitly register with MTCP, then place the necessary
|
|
||||||
* initialization here.
|
|
||||||
*/
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: leaving module_reg_thread()");
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_checkpoint(pid_t pid,
|
|
||||||
opal_crs_base_snapshot_t *base_snapshot,
|
|
||||||
opal_crs_base_ckpt_options_t *options,
|
|
||||||
opal_crs_state_type_t *state)
|
|
||||||
{
|
|
||||||
int unlock_retval, exit_status = OPAL_SUCCESS;
|
|
||||||
char buf[BUFSIZ];
|
|
||||||
opal_crs_dmtcp_snapshot_t *snapshot;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: about to lock mutex for checkpoint()");
|
|
||||||
|
|
||||||
pthread_mutex_lock(&checkpoint_mutex);
|
|
||||||
snapshot = (opal_crs_dmtcp_snapshot_t *) base_snapshot;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: checkpoint(%d, ---)", pid);
|
|
||||||
|
|
||||||
/* Are we checkpointing ourselves or a peer.
|
|
||||||
* JJH NOTE: This will only ever be called when pid == getpid()
|
|
||||||
* This is an old interface argument, that is no longer used.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* bricka (2010-05-14): According to crs.h, 0 also indicates checkpointing
|
|
||||||
* self.
|
|
||||||
*/
|
|
||||||
if((pid != 0) && (pid != syscall(SYS_getpid)) ) {
|
|
||||||
/* MTCP can only checkpoint a single process: we can only checkpoint
|
|
||||||
* ourself. */
|
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* the metadata file should always be NULL at this point */
|
|
||||||
if ( NULL != snapshot->super.metadata) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: checkpoint(): Error: Metadata file already open");
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Update the snapshot metadata with the component name so opal-restart can
|
|
||||||
* pick the correct CRS to restart with.
|
|
||||||
*/
|
|
||||||
snapshot->super.component_name = strdup(mca_crs_dmtcp_component.super.base_version.mca_component_name);
|
|
||||||
|
|
||||||
if( NULL == snapshot->super.metadata ) {
|
|
||||||
if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: checkpoint(): Error: Unable to open the file (%s)",
|
|
||||||
snapshot->super.metadata_filename);
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* The filename of the checkpoint will be changed by our pre_ckpt hook
|
|
||||||
* based on the options given to this function. */
|
|
||||||
if(dmtcp_generate_full_ckpt_path(snapshot) == -1) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: unable to generate context filename.");
|
|
||||||
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* JJH NOTE: You can write however much or little data you want to the
|
|
||||||
* metadata file. The metadata file is stored with the local
|
|
||||||
* checkpoint, and provided at restart time to help the
|
|
||||||
* CRS component deteremine how to restart from any files
|
|
||||||
* that is left in this directory during checkpoint.
|
|
||||||
* Use the command below to write key/value strings to the
|
|
||||||
* metadata file.
|
|
||||||
* (Just as we did above with the component name).
|
|
||||||
*/
|
|
||||||
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name)) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: unable to print component name to metadata");
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( 0 > fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename)) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: unable to print context name to metadata");
|
|
||||||
}
|
|
||||||
|
|
||||||
fclose(snapshot->super.metadata );
|
|
||||||
snapshot->super.metadata = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* JJH NOTE: Setup and request a checkpoint of this process.
|
|
||||||
*/
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: will checkpoint to file: %s",
|
|
||||||
full_ckpt_path);
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: about to signal checkpoint");
|
|
||||||
|
|
||||||
/* Now that we have set the requested filename, we simply need to start
|
|
||||||
* the checkpoint. */
|
|
||||||
pthread_cond_signal(&checkpoint_cond);
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: signalled checkpoint");
|
|
||||||
|
|
||||||
/* We want to wait for the checkpoint to finish before we continue (in
|
|
||||||
* particular, we need the post_ckpt hook to happen so that we know the
|
|
||||||
* status of the checkpoint)
|
|
||||||
*/
|
|
||||||
pthread_cond_wait(&checkpoint_done_cond, &checkpoint_mutex);
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: received checkpoint_done signal");
|
|
||||||
|
|
||||||
/* We have now been checkpointed. Note that the state of the checkpoint
|
|
||||||
* (OPAL_CRS_CONTINUE, etc.) has been recorded by the post_ckpt hook.
|
|
||||||
*/
|
|
||||||
*state = post_ckpt_state;
|
|
||||||
exit_status = OPAL_SUCCESS;
|
|
||||||
|
|
||||||
free(full_ckpt_path);
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
|
|
||||||
|
|
||||||
if( 0 != unlock_retval ) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: dmtcp_checkpoint: unable to unlock mutex at end of checkpoint: %s",
|
|
||||||
strerror_r(unlock_retval, buf, BUFSIZ));
|
|
||||||
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( NULL != snapshot->super.metadata ) {
|
|
||||||
fclose(snapshot->super.metadata );
|
|
||||||
snapshot->super.metadata = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Restart a process from a snapshot by exec'ing MTCP_RESTART_COMMAND on the
 * snapshot's context file.
 *
 * @param base_snapshot snapshot describing the checkpoint to restart from
 * @param spawn_child   when true, fork first and exec in the child; the
 *                      parent returns immediately
 * @param child_pid     when spawn_child is true and this is non-NULL, receives
 *                      the pid of the forked child
 * @return OPAL_SUCCESS in the parent after a successful fork; OPAL_ERROR on
 *         any failure. A successful exec does not return.
 */
int opal_crs_dmtcp_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    int ret, exit_status = OPAL_SUCCESS;
    int exec_status;

    /* NOTE(review): this object is never released, and the struct copy below
     * overwrites the fields set by its constructor - confirm ownership before
     * adding a release. */
    opal_crs_dmtcp_snapshot_t *snapshot = OBJ_NEW(opal_crs_dmtcp_snapshot_t);
    if (NULL == snapshot) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: Unable to allocate the snapshot.");
        return OPAL_ERROR;
    }
    snapshot->super = *base_snapshot;

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: restart(--, %d)", spawn_child);

    /*
     * JJH NOTE: 'cold_start' indicates that this process is being restarted from
     *           opal-restart instead of from within an already running process.
     *           In the current code base, this is always set to true since it
     *           does not allow a process to request a restart of itself.
     */
    if(snapshot->super.cold_start) {
        /*
         * Read the metadata left by the checkpoint() of this process
         */
        if( OPAL_SUCCESS != (ret = dmtcp_cold_start(snapshot)) ) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to reconstruct the snapshot.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    /* Only dmtcp_cold_start() fills in the context filename. Without it
     * there is nothing to restart from, and a NULL here would both be passed
     * to "%s" and cut the execlp() argument list short. */
    if (NULL == snapshot->context_filename) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: No context filename available.");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* JJH NOTE: Nearly all of the time the 'spawn_child' argument is set to
     *           'false' indicating that the restart function is expected to
     *           call exec() directly. It is only set to 'true' if the user
     *           explicitly tells opal-restart to spawn off the child, which
     *           rarely/never happens. So I would not worry about that option.
     */
    if( spawn_child ) {
        /* Use a distinct local name: shadowing the 'child_pid' parameter
         * would leave the caller's out-value unset. */
        pid_t pid = fork();

        if(pid < 0) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: Unable to spawn child.");
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if(pid > 0) {
            /* Parent: report the child and return. */
            if (NULL != child_pid) {
                *child_pid = pid;
            }
            goto cleanup;
        }
        /* Child: fall through to the exec below. */
    }

    /*
     * JJH NOTE: Restart the process by replacing this process
     */

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_restart: About to invoke command: %s with argv: %s %s",
                        MTCP_RESTART_COMMAND,
                        MTCP_RESTART_COMMAND,
                        snapshot->context_filename);

    exec_status = execlp(MTCP_RESTART_COMMAND, MTCP_RESTART_COMMAND, snapshot->context_filename, NULL);

    /* If we get down here, something has broken. */

    if(exec_status < 0) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: error in replacing process: %s",
                    strerror(errno));
    } else {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_restart: exec() returned!");
    }

    exit_status = OPAL_ERROR;

 cleanup:
    return exit_status;
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_disable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: disable_checkpoint()");
|
|
||||||
|
|
||||||
/*
|
|
||||||
* JJH NOTE: Enter a critical section. This is not really used in the code
|
|
||||||
* at the moment.
|
|
||||||
*/
|
|
||||||
mtcp_no();
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_crs_dmtcp_enable_checkpoint(void)
|
|
||||||
{
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: enable_checkpoint()");
|
|
||||||
/*
|
|
||||||
* JJH NOTE: Leave a critical section. This is not really used in the code
|
|
||||||
* at the moment.
|
|
||||||
*/
|
|
||||||
mtcp_ok();
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************************
|
|
||||||
* Local Function Definitions
|
|
||||||
*****************************/
|
|
||||||
/**
 * Rebuild a snapshot from the metadata file written at checkpoint time.
 *
 * Opens the metadata file if needed, checks that the recorded component name
 * matches this component, reads the context token and stores
 * "<snapshot_directory>/<context>" in snapshot->context_filename. The
 * metadata file is closed again before returning.
 *
 * @param snapshot snapshot to complete; its cold_start flag is cleared on
 *                 success
 * @return OPAL_SUCCESS, the error returned by
 *         opal_crs_base_extract_expected_component(), or OPAL_ERROR
 */
static int dmtcp_cold_start(opal_crs_dmtcp_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char * component_name = NULL;
    int prev_pid;

    /*
     * Find the snapshot directory, read the metadata file for
     * component name and previous pid
     */
    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r")) ) {
            opal_output(mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: dmtcp_cold_start(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* NOTE(review): component_name is never freed here; presumably the
     * extract call allocates it - confirm ownership. */
    snapshot->super.component_name = strdup(component_name);

    /*
     * Compare the component strings to make sure this is our snapshot before going further.
     * JJH NOTE: This will nearly always be true since opal-restart also checks this metadata.
     */
    if ( 0 != strncmp(mca_crs_dmtcp_component.super.base_version.mca_component_name,
                      component_name, strlen(component_name)) ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_dmtcp_component.super.base_version.mca_component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Read context information from the metadata file
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    /* An empty token list is as unusable as a missing one. */
    if( NULL == tmp_argv || NULL == tmp_argv[0] ) {
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* asprintf() leaves its output pointer undefined on failure, so reset it
     * and report the error instead of carrying on with a garbage filename. */
    if( 0 > asprintf(&(snapshot->context_filename), "%s/%s", snapshot->super.snapshot_directory, tmp_argv[0]) ) {
        snapshot->context_filename = NULL;
        opal_output(mca_crs_dmtcp_component.super.output_handle,
                    "crs:dmtcp: dmtcp_cold_start: Error: Unable to allocate the context filename");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
                        "crs:dmtcp: cold_start(%s)", snapshot->context_filename);

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    if( NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata);
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a snapshot, generate the context filename and its full path.
|
|
||||||
*
|
|
||||||
* @param snapshot the snapshot with request information
|
|
||||||
*/
|
|
||||||
static int dmtcp_generate_full_ckpt_path(opal_crs_dmtcp_snapshot_t *snapshot)
|
|
||||||
{
|
|
||||||
int retval;
|
|
||||||
retval = asprintf(&(snapshot->context_filename), "ompi_dmtcp_context.%ld", syscall(SYS_getpid));
|
|
||||||
if(retval == -1)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
return asprintf(&full_ckpt_path, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a callback function to call the actual checkpointing routine.
|
|
||||||
* Instead of waiting for a specific interval as MTCP does, we will wait on a
|
|
||||||
* synchronization signal that will allow us to checkpoint on demand. The
|
|
||||||
* argument to this function will be ignored.
|
|
||||||
*/
|
|
||||||
static void dmtcp_sleep_between_ckpt_callback(int interval)
|
|
||||||
{
|
|
||||||
int signal_retval;
|
|
||||||
char buf[BUFSIZ];
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: called sleep_between_ckpt callback");
|
|
||||||
|
|
||||||
pthread_mutex_lock(&checkpoint_mutex);
|
|
||||||
|
|
||||||
/* If the MPI checkpoint thread is waiting on the checkpoint_done_cond and
|
|
||||||
* this thread is here, it means that a checkpoint has just completed.
|
|
||||||
* Let's signal the MPI checkpoint thread to resume. */
|
|
||||||
signal_retval = pthread_cond_signal(&checkpoint_done_cond);
|
|
||||||
|
|
||||||
if( 0 != signal_retval) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: post_ckpt_callback(): Unable to signal checkpoint done: %s",
|
|
||||||
strerror_r(signal_retval, buf, BUFSIZ));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* now we simply wait for the signal to checkpoint */
|
|
||||||
pthread_cond_wait(&checkpoint_cond, &checkpoint_mutex);
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: received sync signal to checkpoint.");
|
|
||||||
|
|
||||||
/* We have now been instructed to checkpoint, so we return. Note that the
|
|
||||||
* mutex is still locked: the post_ckpt callback will unlock it. */
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a callback function that is invoked before the checkpoint actually
|
|
||||||
* occurs. It enables us to do any logging that is necessary, as well as change
|
|
||||||
* the filename that the checkpoint will be written to. We expect that this
|
|
||||||
* filename will be pulled from the checkpoint options.
|
|
||||||
*
|
|
||||||
* @param ckpt_filename a pointer in which to store the desired checkpoint
|
|
||||||
* filename
|
|
||||||
*/
|
|
||||||
static void dmtcp_pre_ckpt_callback(char **ckpt_filename)
|
|
||||||
{
|
|
||||||
*ckpt_filename = full_ckpt_path;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a callback function that is invoked after the checkpoint has
|
|
||||||
* finished. It enables us to do any logging that is necessary, as well as
|
|
||||||
* report whether this is called from a restart or a checkpoint. We will report
|
|
||||||
* this status, signal the CRS code to continue running, and then release the
|
|
||||||
* mutex that we are holding.
|
|
||||||
*
|
|
||||||
* @param is_restarting whether or not this is being called as part of a restart
|
|
||||||
* @param mtcp_restore_argv_start_addr unused
|
|
||||||
*/
|
|
||||||
static void dmtcp_post_ckpt_callback(int is_restarting, char *mtcp_restore_argv_start_addr)
|
|
||||||
{
|
|
||||||
int unlock_retval;
|
|
||||||
char buf[BUFSIZ];
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: in post_ckpt_callback, restarting: %d", is_restarting);
|
|
||||||
if(is_restarting)
|
|
||||||
post_ckpt_state = OPAL_CRS_RESTART;
|
|
||||||
else
|
|
||||||
post_ckpt_state = OPAL_CRS_CONTINUE;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: unlocking at end of post_ckpt_callback");
|
|
||||||
|
|
||||||
unlock_retval = pthread_mutex_unlock(&checkpoint_mutex);
|
|
||||||
|
|
||||||
if( 0 != unlock_retval) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: post_ckpt_callback(): Unable to unlock mutex: %s",
|
|
||||||
strerror_r(unlock_retval, buf, BUFSIZ));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a callback function that is invoked by DMTCP to see if it should
|
|
||||||
* checkpoint the given file descriptor.
|
|
||||||
*
|
|
||||||
* If the file descriptor is a socket, named-pipe or pseudo-terminal, DMTCP
|
|
||||||
* should skip checkpointing them.
|
|
||||||
*
|
|
||||||
* If we can't determine the type of fd (stat and/or readlink failed), we ask
|
|
||||||
* DMTCP to try to checkpoint them anyways with the assumption that DMTCP would
|
|
||||||
* warn users of any such case.
|
|
||||||
*
|
|
||||||
* @param fd file descriptor to checkpoint
|
|
||||||
* @return: 1 if DMTCP should ckpt the file descriptor, 0 otherwise.
|
|
||||||
*/
|
|
||||||
static int dmtcp_should_ckpt_fd_callback(int fd)
|
|
||||||
{
|
|
||||||
struct stat stat_buf;
|
|
||||||
char device_name[PATH_MAX];
|
|
||||||
char proc_filename[64];
|
|
||||||
char buf[BUFSIZ];
|
|
||||||
|
|
||||||
if (fstat(fd, &stat_buf) != 0) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: should_ckpt_fd_callback(): error stat()'ing %d: %s",
|
|
||||||
fd, strerror_r(errno, buf, BUFSIZ));
|
|
||||||
return 1;
|
|
||||||
/* Don't checkpoint sockets and FIFOs */
|
|
||||||
} else if (S_ISSOCK(stat_buf.st_mode) || S_ISFIFO(stat_buf.st_mode)) {
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: skipping checkpointing socket/fifo: %d",
|
|
||||||
fd);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
memset(device_name, 0, sizeof device_name);
|
|
||||||
sprintf(proc_filename, "/proc/self/fd/%d", fd);
|
|
||||||
if (readlink(proc_filename, device_name, sizeof(device_name) - 1) <= 0) {
|
|
||||||
opal_output(mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: should_ckpt_fd_callback(): readlink(%d) failed: %s",
|
|
||||||
fd, strerror_r(errno, buf, BUFSIZ));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Don't checkpoint ptys */
|
|
||||||
if (strstr(device_name, "/dev/pts/") == 0 ||
|
|
||||||
strstr(device_name, "/dev/pty") == 0 ||
|
|
||||||
strstr(device_name, "/dev/tty") == 0) {
|
|
||||||
opal_output_verbose(10, mca_crs_dmtcp_component.super.output_handle,
|
|
||||||
"crs:dmtcp: skipping checkpointing %s",
|
|
||||||
device_name);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Checkpoint fd by default */
|
|
||||||
return 1;
|
|
||||||
}
|
|
@ -1,7 +0,0 @@
|
|||||||
#
|
|
||||||
# owner/status file
|
|
||||||
# owner: institution that is responsible for this package
|
|
||||||
# status: e.g. active, maintenance, unmaintained
|
|
||||||
#
|
|
||||||
owner: U Brit.Columbia
|
|
||||||
status: unmaintained
|
|
Загрузка…
Ссылка в новой задаче
Block a user