1
1
openmpi/orte/mca/snapc/snapc.h
Josh Hursey 3b4073e32c This commit fixes the checkpoint/restart functionality on the trunk. Included in this commit are:
* Extension to the ESS framework to support C/R
 * Fixed support for {{{snapc_base_establish_global_snapshot_dir}}}
 * Fixed FileM support
 * Misc. minor code modifications

There are some outstanding visability issues that I want to fix next.

This commit was SVN r17725.
2008-03-05 04:57:23 +00:00

256 строки
8.3 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Snapshot Coordination (SNAPC) Interface
*
* Terminology:
* ------------
* Global Snapshot Coordinator:
* - HNP(s) coordination function.
* Local Snapshot Coordinator
* - VHNP(s) [e.g., orted] coordination function
* Application Snapshot Coordinator
* - Application level coordination function
* Local Snapshot
* - Snapshot generated by a single process in the parallel job
* Local Snapshot Reference
* - A generic reference to the physical Local Snapshot
* Global Snapshot
* - Snapshot generated for the entire parallel job
* Global Snapshot Reference
* - A generic reference to the physical Global Snapshot
*
* General Description:
* ---------------------
* This framework is tasked with:
* - Initiating the checkpoint in the system
* - Physically moving the local snapshot files to a location
* Initially, this location is the node on which the Head Node Process (HNP)
* is running, but later this will be a replicated checkpoint server or
* the like.
* - Generating a 'global snapshot handle' that the user can use to restart
* the parallel job.
*
* Each component will have 3 tiers of behavior that must behave in concert:
* - Global Snapshot Coordinator
* These are the HNP's tasks: mostly distributing the notification of the
* checkpoint, and then compiling the physical and virtual nature of the
* global snapshot handle.
* - Local Snapshot Coordinator
* These are the VHNP's (or orted's, if available) tasks. This will involve
* working with the Global Snapshot Coordinator to route the physical
* and virtual 'local snapshot's from the application to the desired
* location. This process must also notify the Global Snapshot Coordinator
* when its set of processes has completed the checkpoint.
* - Application Snapshot Coordinator
* This is the application level coordinator. This is very light, just
* a subscription to be triggered when it needs to checkpoint, and then,
* once finished with the checkpoint, notify the Local Snapshot Coordinator
* that it is complete.
* If there is no orted (so no bootproxy), then the application assumes the
* responsibility of the Local Snapshot Coordinator as well.
*
*/
#ifndef MCA_SNAPC_H
#define MCA_SNAPC_H
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/class/opal_object.h"
BEGIN_C_DECLS
/**
 * States that a process can be in while checkpointing.
 *
 * The values are plain integer macros numbered consecutively from 0 to 8.
 * NOTE(review): presumably these are the values stored in the 'state' field
 * of orte_snapc_base_snapshot_t -- confirm against the users of that field.
 */
/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE 0
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 1
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING 2
/* There is a Pending checkpoint for this process, terminate the process after checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
/* NOTE(review): the name suggests the checkpoint has been taken and the
 * snapshot files are being transferred (not yet FINISHED) -- confirm. */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
/* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
/**
 * ORTE-level local snapshot.
 *
 * Builds on opal_crs_base_snapshot_t (embedded as the first member) and adds
 * the identity of the process the snapshot belongs to plus its checkpoint
 * status.
 */
typedef struct orte_snapc_base_snapshot_1_0_0_t {
    /** Embedded OPAL CRS snapshot that this type builds on */
    opal_crs_base_snapshot_t crs_snapshot_super;

    /** ORTE name of the process */
    orte_process_name_t process_name;

    /** PID of the application process that produced this snapshot */
    pid_t process_pid;

    /** Checkpoint state.
     *  NOTE(review): presumably one of the ORTE_SNAPC_CKPT_STATE_* values
     *  -- confirm. */
    size_t state;

    /** Whether this process is to be terminated after the checkpoint */
    bool term;
} orte_snapc_base_snapshot_1_0_0_t;

/* Unversioned name for the same type */
typedef orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_snapshot_t);
/**
 * Global snapshot: the snapshot of an entire parallel job.
 *
 * Each component is assumed to have extended this definition in the same way
 * it extends the orte_snapc_base_component_t declared later in this file.
 */
typedef struct orte_snapc_base_global_snapshot_1_0_0_t {
    /** List-item base; this is an object, so it must have a super */
    opal_list_item_t super;

    /** List of orte_snapc_base_snapshot_t entries */
    opal_list_t snapshots;

    /** Name of the ORTE SnapC component used to generate the global snapshot */
    char *component_name;

    /** Unique name of the global snapshot */
    char *reference_name;

    /** Location of the global snapshot, as an absolute path */
    char *local_location;

    /** Sequence number */
    int seq_num;

    /** Beginning timestamp (stored as a string) */
    char *start_time;

    /** Ending timestamp (stored as a string) */
    char *end_time;
} orte_snapc_base_global_snapshot_1_0_0_t;

/* Unversioned name for the same type */
typedef orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_global_snapshot_t);
/**
 * Component query function, used when selecting a SNAPC component.
 *
 * Reports a priority that ranks this component against the other available
 * SNAPC components.
 *
 * @param priority  Pointer to an int. NOTE(review): presumably the component
 *                  stores its priority here -- confirm with implementations.
 * @return Pointer to a struct orte_snapc_base_module_1_0_0_t.
 */
typedef struct orte_snapc_base_module_1_0_0_t *(*orte_snapc_base_component_query_1_0_0_fn_t)(int *priority);
/**
 * Module initialization function.
 *
 * @param seed  NOTE(review): meaning not visible in this header; presumably
 *              true when running as the HNP -- confirm.
 * @param app   NOTE(review): presumably true when running as an application
 *              process -- confirm.
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_init_fn_t)(bool seed, bool app);
/**
 * Module finalization function. Takes no arguments.
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_finalize_fn_t)(void);
/**
 * Set up the structures needed for a job.
 *
 * @param jobid  Identifier of the job to set up.
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_setup_job_fn_t)(orte_jobid_t jobid);
/**
 * Release a job (the counterpart slot to the setup_job function in the
 * module structure).
 * NOTE(review): presumably frees the structures created by setup_job for the
 * same jobid -- confirm with component implementations.
 *
 * @param jobid  Identifier of the job to release.
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_release_job_fn_t)(orte_jobid_t jobid);
/**
 * SNAPC v1.0.0 component descriptor.
 */
typedef struct orte_snapc_base_component_1_0_0_t {
    /** MCA base component */
    mca_base_component_t snapc_version;

    /** MCA base data */
    mca_base_component_data_1_0_0_t snapc_data;

    /** Query function called during component selection */
    orte_snapc_base_component_query_1_0_0_fn_t snapc_query;

    /** Verbosity level */
    int verbose;

    /** Output handle for opal_output */
    int output_handle;

    /** Default priority */
    int priority;
} orte_snapc_base_component_1_0_0_t;

/* Unversioned name for the same type */
typedef orte_snapc_base_component_1_0_0_t orte_snapc_base_component_t;
/**
 * SNAPC v1.0.0 module: the table of function pointers a component provides.
 */
typedef struct orte_snapc_base_module_1_0_0_t {
    /** Initialization function */
    orte_snapc_base_module_init_fn_t snapc_init;

    /** Finalization function */
    orte_snapc_base_module_finalize_fn_t snapc_finalize;

    /** Set up the structures for a job */
    orte_snapc_base_setup_job_fn_t setup_job;

    /** Release a job */
    orte_snapc_base_release_job_fn_t release_job;
} orte_snapc_base_module_1_0_0_t;

/* Unversioned name for the same type */
typedef orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;

/* Module instance, defined outside this header.
 * NOTE(review): presumably holds the functions of the selected component
 * -- confirm. */
ORTE_DECLSPEC extern orte_snapc_base_module_t orte_snapc;
/**
 * Macro for use in components that are of type SNAPC v1.0.0.
 *
 * Expands to a comma-separated list: MCA_BASE_VERSION_1_0_0, followed by the
 * framework name "snapc" and the version numbers 1, 0, 0.
 * NOTE(review): presumably meant as the leading initializers of a
 * component's mca_base_component_t (the snapc_version member) -- confirm.
 */
#define ORTE_SNAPC_BASE_VERSION_1_0_0 \
/* SNAPC v1.0 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* SNAPC v1.0 */ \
"snapc", 1, 0, 0
END_C_DECLS
#endif /* MCA_SNAPC_H */