1
1

Merging in the jjhursey-ft-cr-stable branch (r13912 : HEAD).

This merge adds Checkpoint/Restart support to Open MPI. The initial
frameworks and components support a LAM/MPI-like implementation.

This commit follows the risk assessment presented to the Open MPI core
development group on Feb. 22, 2007.

This commit closes trac:158

More details to follow.

This commit was SVN r14051.

The following SVN revisions from the original message are invalid or
inconsistent and therefore were not cross-referenced:
  r13912

The following Trac tickets were found above:
  Ticket 158 --> https://svn.open-mpi.org/trac/ompi/ticket/158
This commit is contained in:
Josh Hursey 2007-03-16 23:11:45 +00:00
parent 924cb0af11
commit dadca7da88
691 changed files with 30217 additions and 1182 deletions

View File

@ -1,6 +1,6 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
dnl Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
@ -520,6 +520,105 @@ elif test "$with_cross" != "" ; then
. "$with_cross"
fi
#
# --with-ft=TYPE
# TYPE:
# - LAM (synonym for 'cr' currently)
# - cr
# /* General FT sections */
# #if OPAL_ENABLE_FT == 0 /* FT Disabled globally */
# #if OPAL_ENABLE_FT == 1 /* FT Enabled globally */
# /* CR Specific sections */
# #if OPAL_ENABLE_FT_CR == 0 /* FT Ckpt/Restart Disabled */
# #if OPAL_ENABLE_FT_CR == 1 /* FT Ckpt/Restart Enabled */
#
AC_MSG_CHECKING([if want fault tolerance])
AC_ARG_WITH(ft,
    [AC_HELP_STRING([--with-ft=TYPE],
                    [Specify the type of fault tolerance to enable. Options: LAM (LAM/MPI-like), cr (Checkpoint/Restart) (default: disabled)])],
    [ompi_want_ft=1],
    [ompi_want_ft=0])
# Fault tolerance is off when the option was not given at all, or when it
# was given with the value no. Two separate test commands joined by the
# shell are used instead of the -o operator, which is not portable.
if test "$with_ft" = "no" || test "$ompi_want_ft" = "0"; then
    ompi_want_ft=0
    ompi_want_ft_cr=0
    AC_MSG_RESULT([Disabled fault tolerance])
else
    ompi_want_ft=1
    ompi_want_ft_cr=0
    ompi_want_ft_type=none
    # Map the requested TYPE onto a fault tolerance type. Every recognized
    # TYPE currently enables the checkpoint/restart code paths.
    if test "$with_ft" = "" || test "$with_ft" = "yes"; then
        # Default value
        ompi_want_ft_type=cr
        ompi_want_ft_cr=1
    elif test "$with_ft" = "LAM" || test "$with_ft" = "lam"; then
        ompi_want_ft_type=lam
        ompi_want_ft_cr=1
    elif test "$with_ft" = "CR" || test "$with_ft" = "cr"; then
        ompi_want_ft_type=cr
        ompi_want_ft_cr=1
    else
        AC_MSG_RESULT([Unrecognized FT TYPE: $with_ft])
        AC_MSG_ERROR([Cannot continue])
    fi
    AC_MSG_RESULT([Enabled $with_ft ($ompi_want_ft_type)])
    AC_MSG_WARN([**************************************************])
    AC_MSG_WARN([*** Fault Tolerance Integration into Open MPI is *])
    AC_MSG_WARN([*** a research quality implementation, and care *])
    AC_MSG_WARN([*** should be used when choosing to enable it. *])
    AC_MSG_WARN([**************************************************])
fi
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT], [$ompi_want_ft],
                   [Enable fault tolerance general components and logic])
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_CR], [$ompi_want_ft_cr],
                   [Enable fault tolerance checkpoint/restart components and logic])
AM_CONDITIONAL(WANT_FT, test "$ompi_want_ft" = "1")
#
# Fault Tolerance Components and Logic
#
# --enable-ft-thread
# #if OPAL_ENABLE_FT_THREAD == 0 /* Disabled */
# #if OPAL_ENABLE_FT_THREAD == 1 /* Enabled */
#
AC_MSG_CHECKING([if want fault tolerance thread])
AC_ARG_ENABLE([ft_thread],
    [AC_HELP_STRING([--enable-ft-thread],
                    [Enable fault tolerance thread running inside all processes. Requires progress threads (default: disabled)])])
# The fault tolerance thread is only meaningful when fault tolerance itself
# was enabled by the --with-ft handling, which sets ompi_want_ft.
if test "$ompi_want_ft" = "0"; then
    ompi_want_ft_thread=0
    AC_MSG_RESULT([Disabled (fault tolerance disabled --without-ft)])
elif test "$enable_ft_thread" = "yes"; then
    # This check may not fire since progress threads are checked after this section :/
    if test "$OMPI_ENABLE_PROGRESS_THREADS" = "0"; then
        AC_MSG_RESULT([Must enable progress threads to use this option])
        AC_MSG_ERROR([Cannot continue])
    else
        AC_MSG_RESULT([yes])
        ompi_want_ft_thread=1
        AC_MSG_WARN([**************************************************])
        AC_MSG_WARN([*** Fault Tolerance with a thread in Open MPI *])
        AC_MSG_WARN([*** is an experimental, research quality option. *])
        AC_MSG_WARN([*** It requires progress threads to be used, and *])
        AC_MSG_WARN([*** care should be used when enabling these *])
        AC_MSG_WARN([*** options. *])
        AC_MSG_WARN([**************************************************])
    fi
else
    ompi_want_ft_thread=0
    AC_MSG_RESULT([Disabled])
fi
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_THREAD], [$ompi_want_ft_thread],
                   [Enable fault tolerance thread in Open PAL])
AM_CONDITIONAL(WANT_FT_THREAD, test "$ompi_want_ft_thread" = "1")
#
# Do we want to install binaries?
#

View File

@ -1,6 +1,6 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
@ -1155,6 +1155,8 @@ AC_CONFIG_FILES([
opal/tools/wrappers/Makefile
opal/tools/wrappers/opalcc-wrapper-data.txt
opal/tools/wrappers/opalc++-wrapper-data.txt
opal/tools/opal-checkpoint/Makefile
opal/tools/opal-restart/Makefile
orte/Makefile
orte/include/Makefile
@ -1170,6 +1172,8 @@ AC_CONFIG_FILES([
orte/tools/wrappers/Makefile
orte/tools/wrappers/ortecc-wrapper-data.txt
orte/tools/wrappers/ortec++-wrapper-data.txt
orte/tools/orte-checkpoint/Makefile
orte/tools/orte-restart/Makefile
orte/tools/orte-ps/Makefile
orte/tools/orte-clean/Makefile

View File

@ -19,6 +19,10 @@
amca_paramdir = $(AMCA_PARAM_SETS_DIR)
dist_amca_param_DATA = amca-param-sets/example.conf
if WANT_FT
dist_amca_param_DATA += amca-param-sets/ft-enable-cr
endif
EXTRA_DIST = \
dist/make_dist_tarball \
dist/linux/openmpi.spec \

View File

@ -0,0 +1,34 @@
#
# An Aggregate MCA Parameter Set to enable checkpoint/restart capabilities
# for a job.
#
# Usage:
# shell$ mpirun -am ft-enable-cr ./app
#
#
# OPAL Parameters
# - Select only checkpoint ready components
# - Enable Additional FT infrastructure
# - Auto-select OPAL CRS component
#
mca_base_component_distill_checkpoint_ready=1
ft_cr_enabled=1
crs=
#
# ORTE Parameters
# - Wrap the RML
# - Use the 'full' Snapshot Coordinator
#
rml_wrapper=ftrm
snapc=full
#filem=rsh
#
# OMPI Parameters
# - Wrap the PML
# - Use the LAM/MPI-like Coordinated Checkpoint/Restart Coordination Protocol
#
pml_wrapper=crcpw
crcp=coord

View File

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -437,6 +437,8 @@ int ompi_ddt_get_pack_description( ompi_datatype_t* datatype,
if( NULL == datatype->packed_description ) {
if( datatype->flags & DT_FLAG_PREDEFINED ) {
datatype->packed_description = malloc( 2 * sizeof(int) );
} else if( NULL == args ) {
return OMPI_ERROR;
} else {
datatype->packed_description = malloc( args->total_pack_size );
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -46,8 +46,8 @@ mca_allocator_base_component_t mca_allocator_basic_component = {
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_allocator_basic_component_init
};

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -125,8 +125,8 @@ mca_allocator_base_component_t mca_allocator_bucket_component = {
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_allocator_bucket_module_init
};

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
@ -22,10 +22,11 @@ headers += \
base/bml_base_endpoint.h
libmca_bml_la_SOURCES += \
base/bml_base_btl.c \
base/bml_base_btl.c \
base/bml_base_btl.h \
base/bml_base_endpoint.h \
base/bml_base_endpoint.c \
base/bml_base_endpoint.c \
base/bml_base_init.c \
base/bml_base_close.c \
base/bml_base_open.c
base/bml_base_open.c \
base/bml_base_ft.c

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -53,6 +53,7 @@ OMPI_DECLSPEC int mca_bml_base_init(bool enable_progress_threads,
OMPI_DECLSPEC int mca_bml_base_close(void);
OMPI_DECLSPEC bool mca_bml_base_inited(void);
OMPI_DECLSPEC int mca_bml_base_ft_event(int state);
/*

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/bml/base/bml_base_endpoint.h"
#include "ompi/mca/bml/base/bml_base_btl.h"
/**
 * Base-level fault tolerance event hook for the BML framework.
 *
 * This is a placeholder: no checkpoint state triggers any action, and the
 * function always reports that the operation is not implemented.
 *
 * @param state  Checkpoint state; compared against the OPAL_CRS_* values.
 * @return OMPI_ERR_NOT_IMPLEMENTED in every case.
 */
int mca_bml_base_ft_event(int state)
{
    /* Phase 1: per-state work before notifying lower layers (none yet). */
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    /*
     * BML is expected to call ft_event in
     *  - BTL(s)
     *  - MPool(s)
     * Currently you can't do this from outside a component
     * So just return Unimplemented
     */

    /* Phase 2: per-state work after notifying lower layers (none yet). */
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_ERR_NOT_IMPLEMENTED;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -38,7 +38,8 @@ mca_bml_base_module_t mca_bml = {
NULL, /* bml_register */
NULL, /* bml_register_error */
NULL, /* bml_finalize*/
NULL /* bml_progress */
NULL, /* bml_progress */
NULL /* FT event */
};
mca_bml_base_component_t mca_bml_component;
@ -98,4 +99,3 @@ int mca_bml_base_init( bool enable_progress_threads,
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -34,6 +34,9 @@
#include "ompi/types.h"
#include "ompi/class/ompi_free_list.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#define OMPI_ENABLE_DEBUG_RELIABILITY 0
/*
@ -445,7 +448,6 @@ typedef struct mca_bml_base_module_t* (*mca_bml_base_component_init_fn_t)(
typedef int (*mca_bml_base_module_progress_fn_t)(void);
/**
* BML component descriptor. Contains component version information
* and component open/close/init functions.
@ -455,7 +457,6 @@ struct mca_bml_base_component_1_0_0_t {
mca_base_component_t bml_version;
mca_base_component_data_1_0_0_t bml_data;
mca_bml_base_component_init_fn_t bml_init;
};
typedef struct mca_bml_base_component_1_0_0_t mca_bml_base_component_1_0_0_t;
typedef struct mca_bml_base_component_1_0_0_t mca_bml_base_component_t;
@ -610,7 +611,12 @@ typedef int (*mca_bml_base_module_register_error_cb_fn_t)(
mca_btl_base_module_error_cb_fn_t cbfunc
);
/**
* Fault Tolerance Event Notification Function
*
* Signature of the hook stored in the bml_ft_event slot of
* struct mca_bml_base_module_t.
*
* @param status Checkpoint status. The r2 BML implementation compares this
*               value against OPAL_CRS_CHECKPOINT, OPAL_CRS_CONTINUE,
*               OPAL_CRS_RESTART and OPAL_CRS_TERM.
* @return OMPI_SUCCESS or failure status
*/
typedef int (*mca_bml_base_module_ft_event_fn_t)(int status);
/**
@ -638,6 +644,7 @@ struct mca_bml_base_module_t {
mca_bml_base_module_progress_fn_t bml_progress;
mca_bml_base_module_ft_event_fn_t bml_ft_event;
};
typedef struct mca_bml_base_module_t mca_bml_base_module_t;

View File

@ -52,7 +52,8 @@ mca_bml_r2_module_t mca_bml_r2 = {
mca_bml_r2_register,
mca_bml_r2_register_error,
mca_bml_r2_finalize,
mca_bml_r2_progress
mca_bml_r2_progress,
mca_bml_r2_ft_event
}
};
@ -797,3 +798,114 @@ int mca_bml_r2_component_fini(void)
}
/**
 * Fault tolerance event handler for the r2 BML.
 *
 * For every state, each BTL module known to r2 is notified through its
 * btl_ft_event hook (when set), followed by that BTL's mpool (when set).
 * A BTL whose hook reports failure is skipped, including its mpool, and
 * the remaining BTLs are still notified.
 *
 * On OPAL_CRS_RESTART the process list is fetched with ompi_proc_all(),
 * removed with mca_bml_r2_del_procs() before the BTLs are notified, and
 * added back with mca_bml_r2_add_procs() afterwards.
 *
 * @param state  Checkpoint state; compared against the OPAL_CRS_* values.
 * @return OMPI_SUCCESS. Failures of the individual steps are handled on a
 *         best-effort basis and are not reported to the caller.
 *         NOTE(review): confirm whether callers would prefer restart
 *         failures to be propagated.
 */
int mca_bml_r2_ft_event(int state) {
    size_t btl_idx;
    ompi_proc_t** procs = NULL;
    size_t num_procs = 0;

#if 0
    opal_output(0, "bml:r2: ft_event: *** R2 BML *** (%d)\n", state);
#endif

    if(OPAL_CRS_CHECKPOINT == state) {
        /* Do nothing for now */
    }
    else if(OPAL_CRS_CONTINUE == state) {
        /* Since nothing is done in Checkpoint, we are fine here */
    }
    else if(OPAL_CRS_RESTART == state) {
        /*
         * Drop the existing procs before the BTLs are notified. If the
         * proc list cannot be obtained, procs stays NULL and the rebuild
         * after the notification loop is skipped as well.
         */
        procs = ompi_proc_all(&num_procs);
        if(NULL != procs) {
            (void) mca_bml_r2_del_procs(num_procs, procs);
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        /* Nothing to do */
    }
    else {
        /* Unrecognized state: nothing to do */
    }

    /*
     * Call ft_event in:
     * - BTL
     * - MPool
     */
    for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
#if 0
        opal_output(0, "bml:r2: ft_event: Notify the %s BTL.\n",
                    (mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
#endif

        /*
         * Notify the btl. A BTL may leave the hook unset.
         */
        if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
            if(OMPI_SUCCESS != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(state) ) {
                /* This BTL failed: skip its mpool and move on */
                continue;
            }
        }

        /*
         * Notify its mpool, guarding against an unset hook in the same
         * way as for the BTL above.
         */
        if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
            NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
            (void) (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(state);
        }
    }

    /*
     * Only RESTART has work after the notification loop: add the procs
     * obtained before the loop back in. Without a proc list there is
     * nothing valid to pass to mca_bml_r2_add_procs(), so skip it.
     */
    if(OPAL_CRS_RESTART == state && NULL != procs) {
        struct mca_bml_base_endpoint_t ** endpoints = NULL;
        ompi_bitmap_t reachable;

        OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
        if( OMPI_SUCCESS == ompi_bitmap_init(&reachable, num_procs) ) {
            endpoints = malloc(num_procs * sizeof(*endpoints));
            if( NULL != endpoints ) {
                (void) mca_bml_r2_add_procs(num_procs, procs, endpoints, &reachable);
                free(endpoints);
            }
        }
        OBJ_DESTRUCT(&reachable);
    }

    /* free(NULL) is a no-op, so no guard is needed */
    free(procs);

    return OMPI_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -96,6 +96,8 @@ int mca_bml_r2_finalize( void );
int mca_bml_r2_component_fini(void);
int mca_bml_r2_ft_event(int status);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -46,8 +46,8 @@ mca_bml_base_component_1_0_0_t mca_bml_r2_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_bml_r2_component_init
};

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -114,6 +114,9 @@
#include "ompi/types.h"
#include "ompi/mca/mpool/mpool.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
@ -544,6 +547,12 @@ typedef void (*mca_btl_base_module_dump_fn_t)(
int verbose
);
/**
* Fault Tolerance Event Notification Function
*
* Signature of the hook stored in the btl_ft_event slot of
* struct mca_btl_base_module_t. The r2 BML checks the slot for NULL
* before calling it, so a BTL may leave it unset.
*
* @param state Checkpoint Status; the BTL implementations compare it
*              against OPAL_CRS_CHECKPOINT, OPAL_CRS_CONTINUE,
*              OPAL_CRS_RESTART and OPAL_CRS_TERM.
* @return OMPI_SUCCESS or failure status
*/
typedef int (*mca_btl_base_module_ft_event_fn_t)(int state);
/**
* BTL module interface functions and attributes.
@ -582,6 +591,7 @@ struct mca_btl_base_module_t {
/* register a default error handler */
mca_btl_base_module_register_error_fn_t btl_register_error;
mca_btl_base_module_ft_event_fn_t btl_ft_event;
};
typedef struct mca_btl_base_module_t mca_btl_base_module_t;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -87,8 +87,8 @@ mca_btl_gm_module_t mca_btl_gm_module = {
#endif
mca_btl_base_dump,
NULL, /* mpool */
mca_btl_gm_register_error_cb
mca_btl_gm_register_error_cb,
mca_btl_gm_ft_event
}
};
@ -956,3 +956,22 @@ int mca_btl_gm_finalize(struct mca_btl_base_module_t* btl)
return OMPI_SUCCESS;
}
/*
 * Fault tolerance event hook for the gm BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_gm_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -338,6 +338,12 @@ extern mca_btl_base_descriptor_t* mca_btl_gm_prepare_dst(
size_t reserve,
size_t* size);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_gm_ft_event(int state);
/**
* Acquire a send token - queue the fragment if none available

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -73,9 +73,8 @@ mca_btl_gm_component_t mca_btl_gm_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
mca_btl_gm_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -63,7 +63,8 @@ mca_btl_mvapi_module_t mca_btl_mvapi_module = {
mca_btl_mvapi_get,
mca_btl_mvapi_dump,
NULL, /* mpool */
NULL /* error call back registration */
NULL, /* error call back registration */
mca_btl_mvapi_ft_event
}
};
@ -827,3 +828,23 @@ void mca_btl_mvapi_dump(
opal_output( 0, "sd_wqe_hp %d\n", endpoint->sd_wqe_hp );
opal_output( 0, "sd_wqe_lp %d\n", endpoint->sd_wqe_lp );
}
/*
 * Fault tolerance event hook for the mvapi BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_mvapi_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -515,6 +515,13 @@ extern void mca_btl_mvapi_dump(
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t* mvapi_btl);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_mvapi_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -77,9 +77,8 @@ mca_btl_mvapi_component_t mca_btl_mvapi_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
mca_btl_mvapi_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -477,6 +477,27 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl )
return OMPI_SUCCESS;
}
/*
 * Fault tolerance event hook for the mx BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_mx_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}
mca_btl_mx_module_t mca_btl_mx_module = {
{
&mca_btl_mx_component.super,
@ -502,7 +523,7 @@ mca_btl_mx_module_t mca_btl_mx_module = {
NULL, /* get */
mca_btl_base_dump,
NULL, /* mpool */
NULL /* register error */
NULL, /* register error */
mca_btl_mx_ft_event
}
};

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -290,6 +290,13 @@ mca_btl_mx_prepare_dst( struct mca_btl_base_module_t* btl,
size_t reserve,
size_t* size );
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_mx_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -58,9 +58,8 @@ mca_btl_mx_component_t mca_btl_mx_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
mca_btl_mx_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -78,7 +78,8 @@ mca_btl_openib_module_t mca_btl_openib_module = {
mca_btl_openib_get,
mca_btl_base_dump,
NULL, /* mpool */
mca_btl_openib_register_error_cb /* error call back registration */
mca_btl_openib_register_error_cb, /* error call back registration */
mca_btl_openib_ft_event
}
};
@ -905,3 +906,23 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
return OMPI_SUCCESS;
}
/*
 * Fault tolerance event hook for the openib BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_openib_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -440,6 +440,13 @@ extern void mca_btl_openib_send_frag_return(mca_btl_base_module_t* btl,
int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t* openib_btl);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_openib_ft_event(int state);
#define BTL_OPENIB_HP_QP 0
#define BTL_OPENIB_LP_QP 1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -111,9 +111,8 @@ mca_btl_openib_component_t mca_btl_openib_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
btl_openib_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -66,7 +66,8 @@ mca_btl_portals_module_t mca_btl_portals_module = {
mca_btl_portals_get,
mca_btl_base_dump,
NULL, /* mpool */
NULL /* register error */
NULL, /* register error */
NULL
},
};

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
@ -56,9 +56,8 @@ mca_btl_portals_component_t mca_btl_portals_component = {
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
mca_btl_portals_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -67,7 +67,8 @@ mca_btl_base_module_t mca_btl_self = {
mca_btl_self_rdma, /* get */
mca_btl_base_dump,
NULL, /* mpool */
NULL /* register error cb */
NULL, /* register error cb */
mca_btl_self_ft_event
};
@ -399,3 +400,23 @@ int mca_btl_self_rdma( struct mca_btl_base_module_t* btl,
des->des_cbfunc(btl,endpoint,des,OMPI_SUCCESS);
return OMPI_SUCCESS;
}
/*
 * Fault tolerance event hook for the self BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_self_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -255,7 +255,14 @@ int mca_btl_self_rdma(
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor
);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_self_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -63,8 +63,8 @@ mca_btl_self_component_t mca_btl_self_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_btl_self_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -108,7 +108,8 @@ mca_btl_sm_t mca_btl_sm[2] = {
NULL, /* get */
mca_btl_base_dump,
NULL, /* mpool */
mca_btl_sm_register_error_cb /* register error */
mca_btl_sm_register_error_cb, /* register error */
mca_btl_sm_ft_event
}
},
{
@ -136,7 +137,8 @@ mca_btl_sm_t mca_btl_sm[2] = {
NULL, /* get function */
mca_btl_base_dump,
NULL, /* mpool */
mca_btl_sm_register_error_cb /* register error */
mca_btl_sm_register_error_cb, /* register error */
mca_btl_sm_ft_event
}
}
};
@ -922,3 +924,23 @@ int mca_btl_sm_send(
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, endpoint->peer_smp_rank, frag->hdr, rc);
return rc;
}
/*
 * Fault tolerance event hook for the sm BTL.
 *
 * Every branch is a placeholder: no state triggers any action and the
 * function always returns OMPI_SUCCESS.
 */
int mca_btl_sm_ft_event(int state)
{
    if (state == OPAL_CRS_CHECKPOINT) {
        /* no action */
    } else if (state == OPAL_CRS_CONTINUE) {
        /* no action */
    } else if (state == OPAL_CRS_RESTART) {
        /* no action */
    } else if (state == OPAL_CRS_TERM) {
        /* no action */
    } else {
        /* unrecognized state: no action */
    }

    return OMPI_SUCCESS;
}

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -345,6 +345,12 @@ extern int mca_btl_sm_send(
mca_btl_base_tag_t tag
);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_btl_sm_ft_event(int state);
#if OMPI_ENABLE_PROGRESS_THREADS == 1
void mca_btl_sm_component_event_thread(opal_object_t*);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -78,8 +78,8 @@ mca_btl_sm_component_t mca_btl_sm_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
mca_btl_sm_component_init,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
@ -61,7 +61,8 @@ mca_btl_tcp_module_t mca_btl_t