![Nathan Hjelm](/assets/img/avatar_default.png)
This commit adds support for project_framework_component_* parameter matching. This is the first step in allowing the same framework name in multiple projects. This change also bumps the MCA component version to 2.1.0. All master frameworks have been updated to use the new component versioning macro. An mca.h has been added to each project to add a project specific versioning macro of the form PROJECT_MCA_VERSION_2_1_0. Signed-off-by: Nathan Hjelm <hjelmn@me.com>
406 строки
13 KiB
C
406 строки
13 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Snapshot Coordination (SNAPC) Interface
 *
 * Terminology:
 * ------------
 *  Global Snapshot Coordinator:
 *    - HNP(s) coordination function.
 *  Local Snapshot Coordinator
 *    - VHNP(s) [e.g., orted] coordination function
 *  Application Snapshot Coordinator
 *    - Application level coordination function
 *  Local Snapshot
 *    - Snapshot generated by a single process in the parallel job
 *  Local Snapshot Reference
 *    - A generic reference to the physical Local Snapshot
 *  Global Snapshot
 *    - Snapshot generated for the entire parallel job
 *  Global Snapshot Reference
 *    - A generic reference to the physical Global Snapshot
 *
 * General Description:
 * ---------------------
 * This framework is tasked with:
 * - Initiating the checkpoint in the system
 * - Physically moving the local snapshot files to a location.
 *   Initially this location is the node on which the Head Node Process (HNP)
 *   is running, but later this will be a replicated checkpoint server or
 *   the like.
 * - Generating a 'global snapshot handle' that the user can use to restart
 *   the parallel job.
 *
 * Each component will have 3 tiers of behavior that must behave in concert:
 * - Global Snapshot Coordinator
 *   These are the HNP's tasks. Mostly distributing the notification of the
 *   checkpoint, and then compiling the physical and virtual nature of the
 *   global snapshot handle.
 * - Local Snapshot Coordinator
 *   These are the VHNP's (or orted's, if available) tasks. This will involve
 *   working with the Global Snapshot Coordinator to route the physical
 *   and virtual 'local snapshot's from the application to the desired
 *   location. This process must also notify the Global Snapshot Coordinator
 *   when its set of processes have completed the checkpoint.
 * - Application Snapshot Coordinator
 *   This is the application level coordinator. This is very light, just
 *   a subscription to be triggered when it needs to checkpoint, and then,
 *   once finished with the checkpoint, notify the Local Snapshot Coordinator
 *   that it is complete.
 *   If there is no orted (so no bootproxy), then the application assumes the
 *   responsibility of the Local Snapshot Coordinator as well.
 *
 */
|
|
#ifndef MCA_SNAPC_H
#define MCA_SNAPC_H

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include "orte/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"

#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"

#include "orte/mca/sstore/sstore.h"

BEGIN_C_DECLS
|
|
/**
 * States that a process can be in while checkpointing.
 *
 * The values are consecutive integers starting at 0;
 * ORTE_SNAPC_CKPT_MAX is one past the highest state value.
 */
/* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_ERROR           0

/* Doing no checkpoint -- Quiet state */
#define ORTE_SNAPC_CKPT_STATE_NONE            1
/* There has been a request for a checkpoint from one of the applications */
#define ORTE_SNAPC_CKPT_STATE_REQUEST         2
/* There is a Pending checkpoint for this process */
#define ORTE_SNAPC_CKPT_STATE_PENDING         3
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING         4
/* INC Prep Finished */
#define ORTE_SNAPC_CKPT_STATE_INC_PREPED      5
/* All Processes have been stopped */
#define ORTE_SNAPC_CKPT_STATE_STOPPED         6
/* Finished the checkpoint locally */
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL  7
/* Migrating */
#define ORTE_SNAPC_CKPT_STATE_MIGRATING       8
/* Finished establishing the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_ESTABLISHED     9
/* Processes continuing or have been recovered (finished post-INC) */
#define ORTE_SNAPC_CKPT_STATE_RECOVERED      10
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT        11
/* Unable to restart this job */
#define ORTE_SNAPC_CKPT_STATE_NO_RESTART     12
/* Number of checkpoint states defined above */
#define ORTE_SNAPC_CKPT_MAX                  13
|
|
/**
 * Sufficiently high shift value to avoid colliding the process
 * checkpointing states above with the ORTE process states
 */
#define ORTE_SNAPC_CKPT_SHIFT 131072

/*
 * NOTE: every macro argument below is parenthesized so that expression
 * arguments (e.g. "a ? b : c" or "x & mask") are evaluated as a unit
 * instead of being re-associated with the macro's own operator.
 */

/* Uniquely encode the SNAPC state: offsets it by ORTE_SNAPC_CKPT_SHIFT. */
#define ORTE_SNAPC_CKPT_NOTIFY(state) (ORTE_SNAPC_CKPT_SHIFT + (state))

/* Decode the SNAPC state: removes the ORTE_SNAPC_CKPT_SHIFT offset. */
#define ORTE_SNAPC_CKPT_STATE(state) ((state) - ORTE_SNAPC_CKPT_SHIFT)

/* Check whether a state is a SNAPC state or not (i.e. >= the shift). */
#define CHECK_ORTE_SNAPC_CKPT_STATE(state) ((state) >= ORTE_SNAPC_CKPT_SHIFT)
|
|
/**
|
|
* Definition of a orte local snapshot.
|
|
* Similar to the opal_crs_base_snapshot_t except that it
|
|
* contains process contact information.
|
|
*/
|
|
struct orte_snapc_base_local_snapshot_1_0_0_t {
|
|
/** List super object */
|
|
opal_list_item_t super;
|
|
|
|
/** ORTE Process name */
|
|
orte_process_name_t process_name;
|
|
|
|
/** State of the checkpoint */
|
|
int state;
|
|
|
|
/** Stable Storage Handle (must equal the global version) */
|
|
orte_sstore_base_handle_t ss_handle;
|
|
};
|
|
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
|
|
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
|
|
|
|
/**
|
|
* Definition of the global snapshot.
|
|
* Each component is assumed to have extened this definition
|
|
* in the same way they extern the orte_snapc_base_compoinent_t below.
|
|
*/
|
|
struct orte_snapc_base_global_snapshot_1_0_0_t {
|
|
/** This is an object, so must have super */
|
|
opal_list_item_t super;
|
|
|
|
/** A list of orte_snapc_base_snapshot_t's */
|
|
opal_list_t local_snapshots;
|
|
|
|
/** Checkpoint Options */
|
|
opal_crs_base_ckpt_options_t *options;
|
|
|
|
/** Stable Storage Handle */
|
|
orte_sstore_base_handle_t ss_handle;
|
|
};
|
|
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
|
|
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_global_snapshot_t);
|
|
|
|
struct orte_snapc_base_quiesce_1_0_0_t {
|
|
/** Parent is an object type */
|
|
opal_object_t super;
|
|
|
|
/** Current epoch */
|
|
int epoch;
|
|
/** Requested CRS */
|
|
char * crs_name;
|
|
/** Handle for reference */
|
|
char * handle;
|
|
/** snapshot list */
|
|
orte_snapc_base_global_snapshot_t *snapshot;
|
|
|
|
/** Stable Storage Handle */
|
|
orte_sstore_base_handle_t ss_handle;
|
|
/** Stable Storage Snapshot list */
|
|
orte_sstore_base_global_snapshot_info_t *ss_snapshot;
|
|
|
|
/** Target Directory */
|
|
char * target_dir;
|
|
/** Command Line */
|
|
char * cmdline;
|
|
/** State of operation if checkpointing */
|
|
opal_crs_state_type_t cr_state;
|
|
/** Checkpointing? */
|
|
bool checkpointing;
|
|
/** Restarting? */
|
|
bool restarting;
|
|
|
|
/** Migrating? */
|
|
bool migrating;
|
|
/** List of migrating processes */
|
|
int num_migrating;
|
|
opal_pointer_array_t migrating_procs;
|
|
};
|
|
typedef struct orte_snapc_base_quiesce_1_0_0_t orte_snapc_base_quiesce_1_0_0_t;
|
|
typedef struct orte_snapc_base_quiesce_1_0_0_t orte_snapc_base_quiesce_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_quiesce_t);
|
|
|
|
/**
 * Application request for a global checkpoint related operation.
 *
 * Enumerators are numbered sequentially from ORTE_SNAPC_OP_NONE (0);
 * append new events at the end to keep existing values stable.
 */
typedef enum {
    ORTE_SNAPC_OP_NONE = 0,
    ORTE_SNAPC_OP_INIT,
    ORTE_SNAPC_OP_FIN,
    ORTE_SNAPC_OP_FIN_ACK,
    ORTE_SNAPC_OP_CHECKPOINT,
    ORTE_SNAPC_OP_RESTART,
    ORTE_SNAPC_OP_MIGRATE,
    ORTE_SNAPC_OP_QUIESCE_START,
    ORTE_SNAPC_OP_QUIESCE_CHECKPOINT,
    ORTE_SNAPC_OP_QUIESCE_END
} orte_snapc_base_request_op_event_t;
|
|
struct orte_snapc_base_request_op_1_0_0_t {
|
|
/** Parent is an object type */
|
|
opal_object_t super;
|
|
|
|
/** Event to request */
|
|
orte_snapc_base_request_op_event_t event;
|
|
|
|
/** Is this request still active */
|
|
bool is_active;
|
|
|
|
/** Leader of the operation */
|
|
int leader;
|
|
|
|
/** Sequence Number */
|
|
int seq_num;
|
|
|
|
/** Global Handle */
|
|
char * global_handle;
|
|
|
|
/** Stable Storage Handle */
|
|
orte_sstore_base_handle_t ss_handle;
|
|
|
|
/** Migrating vpid list of participants */
|
|
int mig_num;
|
|
int *mig_vpids;
|
|
|
|
/** Migrating hostname preference list */
|
|
char (*mig_host_pref)[OPAL_MAX_PROCESSOR_NAME];
|
|
|
|
/** Migrating vpid preference list */
|
|
int *mig_vpid_pref;
|
|
|
|
/** Info key */
|
|
int *mig_off_node;
|
|
};
|
|
typedef struct orte_snapc_base_request_op_1_0_0_t orte_snapc_base_request_op_1_0_0_t;
|
|
typedef struct orte_snapc_base_request_op_1_0_0_t orte_snapc_base_request_op_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_request_op_t);
|
|
|
|
/**
 * Module initialization function.
 *
 * @param seed  Role flag supplied by the caller
 * @param app   Role flag supplied by the caller
 *
 * NOTE(review): the exact meaning of the two flags is not documented
 * in this header -- presumably "is the seed/HNP" and "is an application
 * process"; confirm against the callers.
 *
 * Returns ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_init_fn_t)
    (bool seed, bool app);
|
|
/**
 * Module finalization function. Takes no arguments.
 *
 * Returns ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_finalize_fn_t)
    (void);
|
|
/**
|
|
* Setup the necessary structures for this job
|
|
* Returns ORTE_SUCCESS
|
|
*/
|
|
typedef int (*orte_snapc_base_setup_job_fn_t)
|
|
(orte_jobid_t jobid);
|
|
|
|
/**
|
|
* Setup the necessary structures for this job
|
|
* Returns ORTE_SUCCESS
|
|
*/
|
|
typedef int (*orte_snapc_base_release_job_fn_t)
|
|
(orte_jobid_t jobid);
|
|
|
|
|
|
/**
 * Handle fault tolerance updates
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_snapc_base_ft_event_fn_t)(int state);
|
|
/**
|
|
* Start a checkpoint originating from an internal source.
|
|
*
|
|
* This really only makes sense to call from an application, but in the future
|
|
* we may allow the checkpoint operation to use this function from the local
|
|
* coordinator.
|
|
*
|
|
* @param[out] epoch Epoch number to associate with this checkpoint operation
|
|
* Returns ORTE_SUCCESS
|
|
*/
|
|
typedef int (*orte_snapc_base_start_checkpoint_fn_t)
|
|
(orte_snapc_base_quiesce_t *datum);
|
|
|
|
/**
|
|
* Signal end of checkpoint epoch originating from an internal source.
|
|
*
|
|
* @param[in] epoch Epoch number to associate with this checkpoint operation
|
|
* Returns ORTE_SUCCESS
|
|
*/
|
|
typedef int (*orte_snapc_base_end_checkpoint_fn_t)
|
|
(orte_snapc_base_quiesce_t *datum);
|
|
|
|
/**
|
|
* Request a checkpoint related operation to take place
|
|
*/
|
|
typedef int (*orte_snapc_base_request_op_fn_t)
|
|
(orte_snapc_base_request_op_t *datum);
|
|
|
|
/**
|
|
* Structure for SNAPC components.
|
|
*/
|
|
struct orte_snapc_base_component_2_0_0_t {
|
|
/** MCA base component */
|
|
mca_base_component_t base_version;
|
|
/** MCA base data */
|
|
mca_base_component_data_t base_data;
|
|
|
|
/** Verbosity Level */
|
|
int verbose;
|
|
/** Output Handle for opal_output */
|
|
int output_handle;
|
|
/** Default Priority */
|
|
int priority;
|
|
};
|
|
typedef struct orte_snapc_base_component_2_0_0_t orte_snapc_base_component_2_0_0_t;
|
|
typedef struct orte_snapc_base_component_2_0_0_t orte_snapc_base_component_t;
|
|
|
|
/**
|
|
* Structure for SNAPC modules
|
|
*/
|
|
struct orte_snapc_base_module_1_0_0_t {
|
|
/** Initialization Function */
|
|
orte_snapc_base_module_init_fn_t snapc_init;
|
|
/** Finalization Function */
|
|
orte_snapc_base_module_finalize_fn_t snapc_finalize;
|
|
/** Setup structures for a job */
|
|
orte_snapc_base_setup_job_fn_t setup_job;
|
|
/** Release job */
|
|
orte_snapc_base_release_job_fn_t release_job;
|
|
/** Handle any FT Notifications */
|
|
orte_snapc_base_ft_event_fn_t ft_event;
|
|
/** Handle internal request for checkpoint */
|
|
orte_snapc_base_start_checkpoint_fn_t start_ckpt;
|
|
orte_snapc_base_end_checkpoint_fn_t end_ckpt;
|
|
/** Handle a checkpoint related request */
|
|
orte_snapc_base_request_op_fn_t request_op;
|
|
};
|
|
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t;
|
|
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;
|
|
|
|
ORTE_DECLSPEC extern orte_snapc_base_module_t orte_snapc;
|
|
ORTE_DECLSPEC extern orte_snapc_base_component_t orte_snapc_base_selected_component;
|
|
|
|
/**
|
|
* Macro for use in components that are of type SNAPC
|
|
*/
|
|
#define ORTE_SNAPC_BASE_VERSION_2_0_0 \
|
|
ORTE_MCA_BASE_VERSION_2_1_0("snapc", 2, 0, 0)
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* ORTE_SNAPC_H */
|
|
|