/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
 *
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file
 *
 * Checkpoint and Restart Service (CRS) Interface
 * 
 * General Description:
 *
 * The OPAL Checkpoint and Restart Service (CRS) was created to provide an
 * abstract notion of a single process checkpointer for upper levels to
 * incorporate checkpoint/restart calls generically into their code. This keeps
 * the upper levels from becoming too tied to a specific checkpoint and restart
 * implementation.
 *
 * This interface will change in the future to allow for some additional 
 * specialized functionality such as memory inclusion/exclusion, explicit
 * restarting while running, and others.
 *
 * Words to the Wise:
 *
 * The CRS module must adhere to the API exactly in order to be fully supported.
 * How the module goes about conforming to the API is an internal module issue
 * and in no cases should the module impose restrictions upon the upper layers
 * as this is an API violation.
 *
 */

#ifndef MCA_CRS_H
#define MCA_CRS_H

#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/**
 * States of the module.
 *
 * The numeric values are written out explicitly; they are the same values
 * the enumerators receive implicitly from their declaration order (0..5).
 */
typedef enum opal_crs_state_type_t {
    OPAL_CRS_CHECKPOINT = 0,
    OPAL_CRS_RESTART    = 1,  /* restarting from a checkpoint */
    OPAL_CRS_CONTINUE   = 2,  /* continuing after a checkpoint has been taken */
    OPAL_CRS_TERM       = 3,
    OPAL_CRS_RUNNING    = 4,
    OPAL_CRS_ERROR      = 5   /* checkpoint was not successful */
} opal_crs_state_type_t;

/**
 * Structure describing a single process snapshot.
 *
 * Each component is assumed to have extended this definition in the same
 * way it extends the opal_crs_base_component_t declared later in this file.
 */
typedef struct opal_crs_base_snapshot_1_0_0_t {
    /** Parent class: this is an object, so it must start with its super */
    opal_list_item_t super;

    /** Name of the MCA component */
    char *component_name;

    /** Unique name of the snapshot */
    char *reference_name;

    /** Absolute path to the snapshot directory */
    char *local_location;
    /* NOTE(review): not documented separately in the original; presumably
     * the remote counterpart of local_location -- confirm */
    char *remote_location;

    /**
     * Cold start flag.
     * If we are restarting cold, this structure needs to be recreated.
     * opal_restart would set this flag and let the component do the heavy
     * lifting of recreating the structure, since opal_restart does not know
     * exactly how to do so itself.
     */
    bool cold_start;
} opal_crs_base_snapshot_1_0_0_t;
typedef opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t;

/* Class declaration for opal_crs_base_snapshot_t, emitted through the
 * OBJ_CLASS_DECLARATION macro (presumably provided by
 * opal/class/opal_object.h, which is included above -- confirm). */
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t);

/**
 * Query function for CRS components.
 *
 * The component reports, through 'priority', a priority used to rank it
 * against other available CRS components.
 *
 * Returns a pointer to a struct opal_crs_base_module_1_0_0_t.
 */
typedef struct opal_crs_base_module_1_0_0_t *(*opal_crs_base_component_query_1_0_0_fn_t)(int *priority);

/**
 * Module initialization function.
 *
 * Takes no arguments.
 * Returns OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_init_fn_t)(void);

/**
 * Module finalization function.
 *
 * Takes no arguments.
 * Returns OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_finalize_fn_t)(void);

/**
 * Call the underlying checkpointer.
 * Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
 *
 * Arguments:
 *   pid      = PID of the process to checkpoint, or 0 if checkpointing self.
 *   snapshot = Snapshot object (opal_crs_base_snapshot_t) for this checkpoint.
 *              NOTE(review): earlier documentation described this argument
 *              as 'fname', the filename where the checkpoint has been
 *              written; presumably the snapshot now carries that
 *              information -- confirm.
 *   state    = The state at which the checkpoint is exiting:
 *     - OPAL_CRS_CONTINUE
 *       Continuing after a checkpoint has been taken
 *     - OPAL_CRS_RESTART
 *       Restarting from a checkpoint
 *     - OPAL_CRS_ERROR
 *       Checkpoint was not successful.
 *
 * Ownership: the earlier documentation stated that the 'fname' string is
 * owned by the caller and, if appropriate, must eventually be freed by the
 * caller. Presumably the same applies to 'snapshot' -- TODO confirm.
 */
typedef int (*opal_crs_base_module_checkpoint_fn_t)(pid_t pid,
                                                    opal_crs_base_snapshot_t *snapshot,
                                                    opal_crs_state_type_t *state);

/**
 * Call the underlying restart command for this process.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 * (NOTE(review): OPAL_CRS_ERROR is a state enumerator in this file;
 *  OPAL_ERROR may have been intended -- confirm)
 *
 * Arguments:
 *  snapshot    = Snapshot object (opal_crs_base_snapshot_t) to restart from.
 *                NOTE(review): earlier documentation described this
 *                argument as 'fname', the checkpoint filename -- confirm
 *                what the module is expected to read from the snapshot.
 *  spawn_child = true if the restarted process should be forked as a new
 *                     process, in which case 'child_pid' will be returned.
 *                false if the restarted process should overwrite the
 *                      current process space.
 *  child_pid   = PID of the child that was started, if applicable
 */
typedef int (*opal_crs_base_module_restart_fn_t)(opal_crs_base_snapshot_t *snapshot,
                                                 bool spawn_child,
                                                 pid_t *child_pid);

/**
 * Disable the checkpointer.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 * (NOTE(review): OPAL_CRS_ERROR is a state enumerator in this file;
 *  OPAL_ERROR may have been intended -- confirm)
 *
 * This should set a flag/mutex that disallows checkpoints from occurring.
 * If a checkpoint were to occur while checkpoints are disabled, it should
 * block until checkpoints are reenabled.
 * A quality module implementation would notify the user that the
 * checkpoint has been delayed until the program is out of this critical
 * section of code.
 */
typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)(void);

/**
 * Enable the checkpointer.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR
 * (NOTE(review): OPAL_CRS_ERROR is a state enumerator in this file;
 *  OPAL_ERROR may have been intended -- confirm)
 *
 * This should set a flag/mutex that allows checkpoints to occur.
 */
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)(void);

/**
 * Prepare the CRS component for process launch.
 *
 * Some CRS components need to take action before the process is ever
 * launched, to do such things as:
 * - seed the process environment
 * - LD_PRELOAD
 * - Analyze the binary before launch
 *
 * @param rank              Rank of the process to be started
 * @param base_snapshot_dir NOTE(review): not described in the original
 *                          documentation; presumably the base directory
 *                          for snapshots -- confirm
 * @param app               Pointer to the absolute pathname of argv[0]
 * @param cwd               NOTE(review): not described in the original
 *                          documentation; presumably a pointer to the
 *                          working directory of the process -- confirm
 * @param argv              Pointer to a standard argv-style array,
 *                          including a final NULL pointer
 * @param env               Pointer to a standard environ-style array,
 *                          including a final NULL pointer
 */
typedef int (*opal_crs_base_module_prelaunch_fn_t)(int32_t rank,
                                                   char *base_snapshot_dir,
                                                   char **app,
                                                   char **cwd,
                                                   char ***argv,
                                                   char ***env);

/**
 * Register another thread that may call this library.
 *
 * Some CR systems require every thread that will call into their library
 * to register individually before doing so.
 *
 * Returns OPAL_SUCCESS or OPAL_ERROR
 */
typedef int (*opal_crs_base_module_reg_thread_fn_t)(void);

/**
 * Structure for CRS v1.0.0 components.
 *
 * The member order is part of the interface and is kept as declared.
 */
typedef struct opal_crs_base_component_1_0_0_t {
    /** MCA base component */
    mca_base_component_t                     crs_version;
    /** MCA base data */
    mca_base_component_data_1_0_0_t          crs_data;

    /** Query function used during component selection */
    opal_crs_base_component_query_1_0_0_fn_t crs_query;

    /** Verbosity level */
    int                                      verbose;
    /** Output handle for opal_output */
    int                                      output_handle;
    /** Default priority */
    int                                      priority;
} opal_crs_base_component_1_0_0_t;
typedef opal_crs_base_component_1_0_0_t opal_crs_base_component_t;

/**
 * Structure for CRS v1.0.0 modules: the table of function pointers
 * a module provides. The member order is kept as declared.
 */
typedef struct opal_crs_base_module_1_0_0_t {
    /** Module initialization */
    opal_crs_base_module_init_fn_t               crs_init;
    /** Module finalization */
    opal_crs_base_module_finalize_fn_t           crs_finalize;

    /** Take a checkpoint */
    opal_crs_base_module_checkpoint_fn_t         crs_checkpoint;

    /** Restart from a snapshot */
    opal_crs_base_module_restart_fn_t            crs_restart;

    /** Disallow checkpoints */
    opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
    /** Allow checkpoints */
    opal_crs_base_module_enable_checkpoint_fn_t  crs_enable_checkpoint;

    /** Pre-launch preparation */
    opal_crs_base_module_prelaunch_fn_t          crs_prelaunch;

    /** Per-thread registration */
    opal_crs_base_module_reg_thread_fn_t         crs_reg_thread;
} opal_crs_base_module_1_0_0_t;
typedef opal_crs_base_module_1_0_0_t opal_crs_base_module_t;

/* Global CRS module function table. Only declared here (extern); the
 * definition lives outside this header. Presumably holds the module that
 * is currently in use -- confirm against the framework base code. */
OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;

/**
 * Macro for use in components that are of type CRS v1.0.0.
 *
 * Expands to a comma-separated initializer fragment:
 *   - MCA_BASE_VERSION_1_0_0 (CRS v1.0 is chained to MCA v1.0), followed by
 *   - the CRS v1.0 entries: the framework name "crs" and the numbers 1, 0, 0.
 */
#define OPAL_CRS_BASE_VERSION_1_0_0 \
    MCA_BASE_VERSION_1_0_0,         \
    "crs", 1, 0, 0

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif /* MCA_CRS_H */