/* * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** * @file * * Snapshot Coordination (SNAPC) Interface * * Terminology: * ------------ * Global Snapshot Coordinator: * - HNP(s) coordination function. * Local Snapshot Coordinator * - VHNP(s) [e.g., orted] coordination function * Application Snapshot Coordinator * - Application level coordinaton function * Local Snapshot * - Snapshot generated by a single process in the parallel job * Local Snapshot Reference * - A generic reference to the physical Local Snapshot * Global Snapshot * - Snapshot generated for the entire parallel job * Global Snapshot Reference * - A generic reference to the physical Global Snapshot * * General Description: * --------------------- * This framework is tasked with: * - Initiating the checkpoint in the system * - Physically moving the local snapshot files to a location * Initially this location, is the node on which the Head Node Process (HNP) * is running, but later this will be a replicated checkpoint server or * the like. * - Generating a 'global snapshot handle' that the user can use to restart * the parallel job. * * Each component will have 3 teirs of behavior that must behave in concert: * - Global Snapshot Coordinator * This is the HNPs tasks. Mostly distributing the notification of the * checkpoint, and then compiling the physical and virtual nature of the * global snapshot handle. * - Local Snapshot Coordinator * This is the VHNPs (or orted, if available) tasks. This will involve * working with the Global Snapshot Coordinator to route the physical * and virtual 'local snapshot's from the application to the desired * location. This process must also notify the Global Snapshot Coordinator * when it's set of processes have completed the checkpoint. * - Application Snapshot Coordinator * This is the application level coordinator. This is very light, just * a subscription to be triggered when it needs to checkpoint, and then, * once finished with the checkpoint, notify the Local Snapshot Coordinator * that it is complete. * If there is no orted (so no bootproxy), then the application assumes the * responsibility of the Local Snapshot Coordinator as well. * */ #ifndef MCA_SNAPC_H #define MCA_SNAPC_H #include "orte_config.h" #include "orte/constants.h" #include "orte/types.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" #include "opal/class/opal_object.h" #include "opal/util/output.h" BEGIN_C_DECLS /** * States that a process can be in while checkpointing */ /* Doing no checkpoint -- Quiet state */ #define ORTE_SNAPC_CKPT_STATE_NONE 0 /* There has been a request for a checkpoint from one of the applications */ #define ORTE_SNAPC_CKPT_STATE_REQUEST 1 /* There is a Pending checkpoint for this process */ #define ORTE_SNAPC_CKPT_STATE_PENDING 2 /* There is a Pending checkpoint for this process, terminate the process after checkpoint */ #define ORTE_SNAPC_CKPT_STATE_PENDING_TERM 3 /* Running the checkpoint */ #define ORTE_SNAPC_CKPT_STATE_RUNNING 4 /* Finished the checkpoint */ #define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5 /* Finished the checkpoint */ #define ORTE_SNAPC_CKPT_STATE_FINISHED 6 /* Unable to checkpoint this job */ #define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7 /* Reached an error */ #define ORTE_SNAPC_CKPT_STATE_ERROR 8 /** * Definition of a orte local snapshot. * Similar to the opal_crs_base_snapshot_t except that it * contains process contact information. */ struct orte_snapc_base_snapshot_1_0_0_t { opal_crs_base_snapshot_t crs_snapshot_super; /** ORTE Process name */ orte_process_name_t process_name; /** PID of the application process that generated this snapshot */ pid_t process_pid; /** State of the checkpoint */ size_t state; /** Terminate this process after a checkpoint */ bool term; }; typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_1_0_0_t; typedef struct orte_snapc_base_snapshot_1_0_0_t orte_snapc_base_snapshot_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_snapshot_t); /** * Definition of the global snapshot. * Each component is assumed to have extened this definition * in the same way they extern the orte_snapc_base_compoinent_t below. */ struct orte_snapc_base_global_snapshot_1_0_0_t { /** This is an object, so must have super */ opal_list_item_t super; /** A list of orte_snapc_base_snapshot_ts */ opal_list_t snapshots; /* ORTE SnapC Component used to generate the global snapshot */ char * component_name; /** Unique name of the global snapshot */ char * reference_name; /** Location of the global snapshot Absolute path */ char * local_location; /** Sequence Number */ int seq_num; /** Beginning timestamp */ char * start_time; /** Ending timestamp */ char * end_time; }; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_global_snapshot_t); /** * Module initialization function. * Returns ORTE_SUCCESS */ typedef int (*orte_snapc_base_module_init_fn_t) (bool seed, bool app); /** * Module finalization function. * Returns ORTE_SUCCESS */ typedef int (*orte_snapc_base_module_finalize_fn_t) (void); /** * Setup the necessary structures for this job * Returns ORTE_SUCCESS */ typedef int (*orte_snapc_base_setup_job_fn_t) (orte_jobid_t jobid); /** * Setup the necessary structures for this job * Returns ORTE_SUCCESS */ typedef int (*orte_snapc_base_release_job_fn_t) (orte_jobid_t jobid); /** * Handle fault tolerance updates * * @param[in] state Fault tolerance state update * * @retval ORTE_SUCCESS The operation completed successfully * @retval ORTE_ERROR An unspecifed error occurred */ typedef int (*orte_snapc_base_ft_event_fn_t)(int state); /** * Structure for SNAPC components. */ struct orte_snapc_base_component_2_0_0_t { /** MCA base component */ mca_base_component_t base_version; /** MCA base data */ mca_base_component_data_t base_data; /** Verbosity Level */ int verbose; /** Output Handle for opal_output */ int output_handle; /** Default Priority */ int priority; }; typedef struct orte_snapc_base_component_2_0_0_t orte_snapc_base_component_2_0_0_t; typedef struct orte_snapc_base_component_2_0_0_t orte_snapc_base_component_t; /** * Structure for SNAPC modules */ struct orte_snapc_base_module_1_0_0_t { /** Initialization Function */ orte_snapc_base_module_init_fn_t snapc_init; /** Finalization Function */ orte_snapc_base_module_finalize_fn_t snapc_finalize; /** Setup structures for a job */ orte_snapc_base_setup_job_fn_t setup_job; /** Release job */ orte_snapc_base_release_job_fn_t release_job; /** Handle any FT Notifications */ orte_snapc_base_ft_event_fn_t ft_event; }; typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t; typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t; ORTE_DECLSPEC extern orte_snapc_base_module_t orte_snapc; /** * Macro for use in components that are of type SNAPC */ #define ORTE_SNAPC_BASE_VERSION_2_0_0 \ MCA_BASE_VERSION_2_0_0, \ "snapc", 2, 0, 0 END_C_DECLS #endif /* ORTE_SNAPC_H */