1
1

275 строки
8.2 KiB
C
Исходник Обычный вид История

/*
Fix a checkpoint/restart bug that causes a restarted application to occasionally throw a SIGSEGV or SIGPIPE due to invalid socket descriptors. The problem was caused by a bad ordering between the restart of the ORTE level tcp connections (in the OOB - out-of-band communication) and the Open MPI level tcp connections (BTLs). Before this commit ORTE would shutdown and restart the OOB completely before the OMPI level restarted its tcp connections. What would happen is that a socket descriptor used by the OMPI level on checkpoint was assigned to the ORTE level on restart. But the OMPI level had no knowledge that the socket descriptor it was previously using has been recycled so it closed it on restart. This caused the ORTE level to break as the newly created socket descriptor was closed without its knowledge. The fix is to have the OMPI level shutdown tcp connections, allow the ORTE level to restart, and then allow the OMPi level to restart its connections. This seems obvious, and I'm surprised that this bug has not cropped up sooner. I'm confident that this specific problem has been fixed with this commit. Thanks to Eric Roman and Tamer El Sayed for their help in identifying this problem, and patience while I was fixing it. * Add a new state {{{OPAL_CRS_RESTART_PRE}}}. This state identifies when we are on the down slope of the INC (finalize-like) which is useful when you want to close, but not reopen a component set for fear of interfering with a lower level. * Use this new state in OMPI level coordination. Here we want to make sure to play well with both the OMPI/BTL/TCP and ORTE/OOB/TCP components. * Update ft_event functions in PML and BML to handle the new restart state. * Add an additional flag to the error output in OOB/TCP so we can see what the socket descriptor was on failure as this can be helpful in debugging. This commit was SVN r18276.
2008-04-24 17:54:22 +00:00
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Checkpoint and Restart Service (CRS) Interface
*
* General Description:
*
* The OPAL Checkpoint and Restart Service (CRS) has been created to create an
* abstract notion of a single process checkpointer for upper levels to
* incorporate checkpoint/restart calls genericly into their code. This keeps
* the upper levels from becoming too tied to a specfic checkpoint and restart
* implementation.
*
* This interface will change in the future to allow for some additional
* specialized functionality such as memory inclusion/exclusion, explicit
* restarting while running, and others.
*
* Words to the Wise:
*
* The CRS module must adhere to the API exactly inorder to be fully supported.
* How the module goes about conforming to the API is an internal module issue
* and in no cases should the module impose restrictions upon the upper layers
* as this is an API violation.
*
*/
#ifndef MCA_CRS_H
#define MCA_CRS_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* States of the module
*/
enum opal_crs_state_type_t {
OPAL_CRS_CHECKPOINT,
Fix a checkpoint/restart bug that causes a restarted application to occasionally throw a SIGSEGV or SIGPIPE due to invalid socket descriptors. The problem was caused by a bad ordering between the restart of the ORTE level tcp connections (in the OOB - out-of-band communication) and the Open MPI level tcp connections (BTLs). Before this commit ORTE would shutdown and restart the OOB completely before the OMPI level restarted its tcp connections. What would happen is that a socket descriptor used by the OMPI level on checkpoint was assigned to the ORTE level on restart. But the OMPI level had no knowledge that the socket descriptor it was previously using has been recycled so it closed it on restart. This caused the ORTE level to break as the newly created socket descriptor was closed without its knowledge. The fix is to have the OMPI level shutdown tcp connections, allow the ORTE level to restart, and then allow the OMPi level to restart its connections. This seems obvious, and I'm surprised that this bug has not cropped up sooner. I'm confident that this specific problem has been fixed with this commit. Thanks to Eric Roman and Tamer El Sayed for their help in identifying this problem, and patience while I was fixing it. * Add a new state {{{OPAL_CRS_RESTART_PRE}}}. This state identifies when we are on the down slope of the INC (finalize-like) which is useful when you want to close, but not reopen a component set for fear of interfering with a lower level. * Use this new state in OMPI level coordination. Here we want to make sure to play well with both the OMPI/BTL/TCP and ORTE/OOB/TCP components. * Update ft_event functions in PML and BML to handle the new restart state. * Add an additional flag to the error output in OOB/TCP so we can see what the socket descriptor was on failure as this can be helpful in debugging. This commit was SVN r18276.
2008-04-24 17:54:22 +00:00
OPAL_CRS_RESTART_PRE,
OPAL_CRS_RESTART, /* RESTART_POST */
OPAL_CRS_CONTINUE,
OPAL_CRS_TERM,
OPAL_CRS_RUNNING,
OPAL_CRS_ERROR
};
typedef enum opal_crs_state_type_t opal_crs_state_type_t;
/**
* Structure for Single process snapshot
* Each component is assumed to have extened this definition
* in the same way they exten the opal_crs_base_compoinent_t below.
*/
struct opal_crs_base_snapshot_1_0_0_t {
/** This is an object, so must have super */
opal_list_item_t super;
/** MCA Component name */
char * component_name;
/** Unique name of snapshot */
char * reference_name;
/** Absolute path the the snapshot directory */
char * local_location;
char * remote_location;
/** Cold Start:
* If we are restarting cold, then we need to recreate this structure
* opal_restart would set this, and let the component do the heavy lifting
* of recreating the structure, sicne it doesn't know exactly how to.
*/
bool cold_start;
};
typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_1_0_0_t;
typedef struct opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t);
/**
* Module initialization function.
* Returns OPAL_SUCCESS
*/
typedef int (*opal_crs_base_module_init_fn_t)
(void);
/**
* Module finalization function.
* Returns OPAL_SUCCESS
*/
typedef int (*opal_crs_base_module_finalize_fn_t)
(void);
/**
* Call the underlying checkpointer.
* Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
*
* Arguments:
* pid = PID of the process to checkpoint, or 0 if checkpointing self.
* fname = the filename where the checkpoint has been written.
* state = The state at which the checkpoint is exiting
* - OPAL_CRS_CONTINUE
* Continuing after a checkpoint has been taken
* - OPAL_CRS_RESTART
* Restarting from a checkpoint
* - OPAL_CRS_ERROR
* Checkpoint was not successful.
*
* The 'fname' string is owned by the caller: if appropriate, it must be eventually
* freed by the caller.
*/
typedef int (*opal_crs_base_module_checkpoint_fn_t)
(pid_t pid,
opal_crs_base_snapshot_t *snapshot,
opal_crs_state_type_t *state);
/**
* Call the underlying restart command for this process
* Returns OPAL_SUCCESS or OPAL_CRS_ERROR
*
* Arguments:
* fname = Checkpoint filename
* spawn_child = true if the restarted process should be forked as a new process,
* in which case 'child_pid' will be returned.
* false if the restarted process should overwrite the current
* process space.
* child_pid = PID of the child that was started, if applicable
*
*/
typedef int (*opal_crs_base_module_restart_fn_t)
(opal_crs_base_snapshot_t *snapshot,
bool spawn_child,
pid_t *child_pid);
/**
* Disable the checkpointer
* Returns OPAL_SUCCESS or OPAL_CRS_ERROR
*
* This should set a flag/mutex to disallow checkpoints to occur.
* If a checkpoint were to occur while checkpoints are disabled,
* they should block until reenabled.
* A quality module implementation would notify the user that the
* checkpoint has been delayed until the program is out of this critical
* section of code.
*/
typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)
(void);
/**
* Enable the checkpointer
* Returns OPAL_SUCCESS or OPAL_CRS_ERROR
*
* This should set a flag/mutex to allow checkpoints to occur
*/
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
(void);
/**
* Prepare the CRS component for process launch.
* Some CRS components need to take action before the
* process is ever launched to do such things as:
* - seed the process environment
* - LD_PRELOAD
* - Analyze the binary before launch
*
* @param rank Rank of the process to be started
* @param app Absolute pathname of argv[0]
* @param argv Standard argv-style array, including a final NULL pointer
* @param env Standard environ-style array, including a final NULL pointer
*/
typedef int (*opal_crs_base_module_prelaunch_fn_t)
(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
/**
* Register another thread that may call this library.
* Some CR systems require that each thread that will call into their library
* register individually before doing so.
*
* Returns OPAL_SUCCESS or OPAL_ERROR
*/
typedef int (*opal_crs_base_module_reg_thread_fn_t)
(void);
/**
* Structure for CRS components.
*/
struct opal_crs_base_component_2_0_0_t {
/** MCA base component */
mca_base_component_t base_version;
/** MCA base data */
mca_base_component_data_t base_data;
/** Verbosity Level */
int verbose;
/** Output Handle for opal_output */
int output_handle;
/** Default Priority */
int priority;
};
typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_2_0_0_t;
typedef struct opal_crs_base_component_2_0_0_t opal_crs_base_component_t;
/**
* Structure for CRS modules
*/
struct opal_crs_base_module_1_0_0_t {
/** Initialization Function */
opal_crs_base_module_init_fn_t crs_init;
/** Finalization Function */
opal_crs_base_module_finalize_fn_t crs_finalize;
/** Checkpoint interface */
opal_crs_base_module_checkpoint_fn_t crs_checkpoint;
/** Restart Interface */
opal_crs_base_module_restart_fn_t crs_restart;
/** Disable checkpoints */
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
/** Enable checkpoints */
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;
/** Pre Launch */
opal_crs_base_module_prelaunch_fn_t crs_prelaunch;
/** Per thread registration */
opal_crs_base_module_reg_thread_fn_t crs_reg_thread;
};
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t;
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t;
OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;
/**
* Macro for use in components that are of type CRS
*/
#define OPAL_CRS_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"crs", 2, 0, 0
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* OPAL_CRS_H */