/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
 * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
 *
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file
 *
 * Checkpoint and Restart Service (CRS) Interface
 * 
 * General Description:
 *
 * The OPAL Checkpoint and Restart Service (CRS) provides an abstract notion
 * of a single process checkpointer so that upper levels can incorporate
 * checkpoint/restart calls generically into their code. This keeps the
 * upper levels from becoming too tied to a specific checkpoint and restart
 * implementation.
 *
 * This interface will change in the future to allow for some additional 
 * specialized functionality such as memory inclusion/exclusion, explicit
 * restarting while running, and others.
 *
 * Words to the Wise:
 *
 * The CRS module must adhere to the API exactly in order to be fully supported.
 * How the module goes about conforming to the API is an internal module issue,
 * and in no case should the module impose restrictions upon the upper layers,
 * as this is an API violation.
 *
 */

#ifndef MCA_CRS_H
#define MCA_CRS_H

#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"

BEGIN_C_DECLS

/**
 * CRS state codes.
 *
 * Every enumerator carries an explicit value; the values are contiguous,
 * running from OPAL_CRS_NONE (0) to OPAL_CRS_STATE_MAX (8).
 */
typedef enum opal_crs_state_type_t {
    OPAL_CRS_NONE = 0,
    OPAL_CRS_CHECKPOINT = 1,
    OPAL_CRS_RESTART_PRE = 2,
    /* Also known as RESTART_POST */
    OPAL_CRS_RESTART = 3,
    OPAL_CRS_CONTINUE = 4,
    OPAL_CRS_TERM = 5,
    OPAL_CRS_RUNNING = 6,
    OPAL_CRS_ERROR = 7,
    /* One greater than the largest state value above */
    OPAL_CRS_STATE_MAX = 8
} opal_crs_state_type_t;

/**
 * Checkpoint options.
 *
 * An object type (its parent member is an opal_object_t) holding a set of
 * boolean flags that select what happens around a checkpoint. The two
 * debugger flags exist only when OPAL_ENABLE_CRDEBUG is 1.
 */
typedef struct opal_crs_base_ckpt_options_1_0_0_t {
    opal_object_t super;    /**< Parent object type */

    bool term;              /**< Terminate after checkpoint */
    bool stop;              /**< Send SIGSTOP after checkpoint */

    bool inc_prep_only;     /**< INC prep only */

    bool inc_recover_only;  /**< INC recover only */

#if OPAL_ENABLE_CRDEBUG == 1
    bool attach_debugger;   /**< Wait for debugger to attach after checkpoint */
    bool detach_debugger;   /**< Do not wait for debugger to reattach after checkpoint */
#endif
} opal_crs_base_ckpt_options_1_0_0_t;
typedef opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_ckpt_options_t);

/**
 * Single process snapshot.
 *
 * Each component is assumed to have extended this definition in the same
 * way it extends the opal_crs_base_component_t below.
 */
typedef struct opal_crs_base_snapshot_1_0_0_t {
    opal_list_item_t super;     /**< This is an object, so it must have a super */

    char *component_name;       /**< MCA component name */

    char *metadata_filename;    /**< Metadata filename */

    FILE *metadata;             /**< Metadata file stream */

    char *snapshot_directory;   /**< Absolute path to the snapshot directory */

    /**
     * Cold start.
     * When restarting cold, this structure needs to be recreated.
     * opal_restart would set this and let the component do the heavy
     * lifting of recreating the structure, since it doesn't know exactly
     * how to.
     */
    bool cold_start;
} opal_crs_base_snapshot_1_0_0_t;
typedef opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t;

OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t);

/**
 * Signature of the module initialization function.
 *
 * Takes no arguments.
 *
 * @return OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_init_fn_t)(void);

/**
 * Signature of the module finalization function.
 *
 * Takes no arguments.
 *
 * @return OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_finalize_fn_t)(void);

/**
 * Call the underlying checkpointer.
 *
 * @param pid      PID of the process to checkpoint, or 0 if checkpointing self.
 * @param snapshot Single process snapshot structure for this checkpoint.
 *                 NOTE(review): presumably owned by the caller, who must
 *                 eventually release it if appropriate -- confirm.
 * @param options  Checkpoint options (see opal_crs_base_ckpt_options_t).
 * @param state    The state at which the checkpoint is exiting:
 *                 - OPAL_CRS_CONTINUE
 *                   Continuing after a checkpoint has been taken
 *                 - OPAL_CRS_RESTART
 *                   Restarting from a checkpoint
 *                 - OPAL_CRS_ERROR
 *                   Checkpoint was not successful.
 *
 * @return OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
 */
typedef int (*opal_crs_base_module_checkpoint_fn_t)(
    pid_t pid,
    opal_crs_base_snapshot_t *snapshot,
    opal_crs_base_ckpt_options_t *options,
    opal_crs_state_type_t *state);

/**
 * Call the underlying restart command for this process.
 *
 * @param snapshot    Snapshot structure identifying the checkpoint to
 *                    restart from.
 * @param spawn_child true if the restarted process should be forked as a
 *                    new process, in which case 'child_pid' will be
 *                    returned; false if the restarted process should
 *                    overwrite the current process space.
 * @param child_pid   PID of the child that was started, if applicable.
 *
 * @return OPAL_SUCCESS or OPAL_CRS_ERROR.
 *         NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value in
 *         this header; confirm whether OPAL_ERROR is the intended code.
 */
typedef int (*opal_crs_base_module_restart_fn_t)(
    opal_crs_base_snapshot_t *snapshot,
    bool spawn_child,
    pid_t *child_pid);

/**
 * Disable the checkpointer.
 *
 * An implementation should set a flag/mutex that disallows checkpoints.
 * A checkpoint requested while checkpoints are disabled should block
 * until they are reenabled. A quality module implementation would notify
 * the user that the checkpoint has been delayed until the program is out
 * of this critical section of code.
 *
 * @return OPAL_SUCCESS or OPAL_CRS_ERROR.
 *         NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value in
 *         this header; confirm whether OPAL_ERROR is the intended code.
 */
typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)(void);

/**
 * Enable the checkpointer.
 *
 * An implementation should set a flag/mutex that allows checkpoints to
 * occur.
 *
 * @return OPAL_SUCCESS or OPAL_CRS_ERROR.
 *         NOTE(review): OPAL_CRS_ERROR is an opal_crs_state_type_t value in
 *         this header; confirm whether OPAL_ERROR is the intended code.
 */
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)(void);

/**
 * Prepare the CRS component for process launch.
 *
 * Some CRS components need to act before the process is ever launched,
 * for example to:
 * - seed the process environment
 * - LD_PRELOAD
 * - analyze the binary before launch
 *
 * app, cwd, argv and env are each passed by pointer.
 * NOTE(review): presumably so the component can replace them -- confirm.
 *
 * @param rank              Rank of the process to be started
 * @param base_snapshot_dir Presumably the base directory for snapshots
 *                          -- TODO confirm
 * @param app               Absolute pathname of argv[0]
 * @param cwd               Presumably the working directory of the process
 *                          -- TODO confirm
 * @param argv              Standard argv-style array, including a final
 *                          NULL pointer
 * @param env               Standard environ-style array, including a final
 *                          NULL pointer
 */
typedef int (*opal_crs_base_module_prelaunch_fn_t)(
    int32_t rank,
    char *base_snapshot_dir,
    char **app,
    char **cwd,
    char ***argv,
    char ***env);

/**
 * Register another thread that may call this library.
 *
 * Some CR systems require every thread that will call into their library
 * to register individually before doing so.
 *
 * @return OPAL_SUCCESS or OPAL_ERROR
 */
typedef int (*opal_crs_base_module_reg_thread_fn_t)(void);

/**
 * CRS component structure.
 *
 * Begins with the MCA base component and base data members, followed by
 * three CRS-specific integers.
 */
typedef struct opal_crs_base_component_2_0_0_t {
    mca_base_component_t base_version;      /**< MCA base component */
    mca_base_component_data_t base_data;    /**< MCA base data */

    int verbose;                            /**< Verbosity level */
    int output_handle;                      /**< Output handle for opal_output */
    int priority;                           /**< Default priority */
} opal_crs_base_component_2_0_0_t;
typedef opal_crs_base_component_2_0_0_t opal_crs_base_component_t;

/**
 * CRS module structure.
 *
 * A table of function pointers, one per CRS interface entry point.
 */
typedef struct opal_crs_base_module_1_0_0_t {
    opal_crs_base_module_init_fn_t crs_init;                /**< Initialization function */
    opal_crs_base_module_finalize_fn_t crs_finalize;        /**< Finalization function */

    opal_crs_base_module_checkpoint_fn_t crs_checkpoint;    /**< Checkpoint interface */

    opal_crs_base_module_restart_fn_t crs_restart;          /**< Restart interface */

    opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint; /**< Disable checkpoints */
    opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;   /**< Enable checkpoints */

    opal_crs_base_module_prelaunch_fn_t crs_prelaunch;      /**< Pre launch */

    opal_crs_base_module_reg_thread_fn_t crs_reg_thread;    /**< Per thread registration */
} opal_crs_base_module_1_0_0_t;
typedef opal_crs_base_module_1_0_0_t opal_crs_base_module_t;

/** CRS module instance declared here; its definition is not in this header */
OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;

/**
 * Macro for use in components that are of type CRS.
 *
 * Expands to MCA_BASE_VERSION_2_0_0 followed by the string "crs" and the
 * numbers 2, 0, 0 (matching the 2_0_0 suffix of the macro name).
 */
#define OPAL_CRS_BASE_VERSION_2_0_0 \
    MCA_BASE_VERSION_2_0_0,         \
    "crs",                          \
    2, 0, 0

END_C_DECLS

#endif /* MCA_CRS_H */