/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2013 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2011-2012 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 * OPAL Checkpoint command
 *
 * This command will initiate the checkpoint of a single
 * process that has been compiled with OPAL support.
 */
#include "opal_config.h"

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif  /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif  /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>  /* for mkfifo */
#endif  /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <string.h>
#include <signal.h>

#include "opal/constants.h"

#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"

#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"

#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"

/******************
 * Global Vars
 ******************/

/******************
 * Local Functions
 ******************/
static int initialize(int argc, char *argv[]);
static int finalize(void);
static int parse_args(int argc, char *argv[]);
static int notify_process_for_checkpoint(pid_t pid, char **fname, int term,
                                         opal_crs_state_type_t *state);

/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
typedef struct {
    bool help;
    int pid;
    bool term;
    bool verbose;
    bool quiet;
    char *snapshot_name;
    char *snapshot_loc;
    int output;
} opal_checkpoint_globals_t;

opal_checkpoint_globals_t opal_checkpoint_globals;

opal_cmd_line_init_t cmd_line_opts[] = {
    { NULL,
      'h', NULL, "help",
      0,
      &opal_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL,
      'v', NULL, "verbose",
      0,
      &opal_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be Verbose" },

    { NULL,
      'q', NULL, "quiet",
      0,
      &opal_checkpoint_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
      "Be Super Quiet" },

    { NULL,
      '\0', NULL, "term",
      0,
      &opal_checkpoint_globals.term, OPAL_CMD_LINE_TYPE_BOOL,
      "Terminate the application after checkpoint" },

    { NULL,
      'n', NULL, "name",
      1,
      &opal_checkpoint_globals.snapshot_name, OPAL_CMD_LINE_TYPE_STRING,
      "Request a specific snapshot reference." },

    { "crs_base_snapshot_dir",
      'w', NULL, "where",
      1,
      &opal_checkpoint_globals.snapshot_loc, OPAL_CMD_LINE_TYPE_STRING,
      "Where to place the checkpoint files. Note: You must remember this "
      "location to pass into opal-restart, as it may not be able to find "
      "the desired directory." },

    /* End of list */
    { NULL, '\0', NULL, NULL, 0,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      NULL }
};

int
main(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    char *fname = NULL;
    opal_crs_state_type_t cr_state;

    /***************
     * Initialize
     ***************/
    if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*******************************
     * Checkpoint the requested PID
     *******************************/
    opal_output_verbose(10, opal_checkpoint_globals.output,
                        "opal_checkpoint: Checkpointing PID %d",
                        opal_checkpoint_globals.pid);
    if( opal_checkpoint_globals.term ) {
        opal_output_verbose(10, opal_checkpoint_globals.output,
                            "\tTerminating application after checkpoint");
    }

    ret = notify_process_for_checkpoint(opal_checkpoint_globals.pid,
                                        &fname,
                                        opal_checkpoint_globals.term,
                                        &cr_state);
    if (OPAL_SUCCESS != ret ||
        cr_state == OPAL_CRS_ERROR) {
        opal_show_help("help-opal-checkpoint.txt", "ckpt_failure", true,
                       opal_checkpoint_globals.pid, ret, cr_state);
        exit_status = ret;
        goto cleanup;
    }

    if( !opal_checkpoint_globals.quiet ) {
        opal_output(opal_checkpoint_globals.output,
                    "Local Snapshot Reference = %s\n",
                    fname);
    }

 cleanup:
    /***************
     * Cleanup
     ***************/
    if (OPAL_SUCCESS != (ret = finalize())) {
        return ret;
    }

    return exit_status;
}

static int initialize(int argc, char *argv[]) {
    int ret, exit_status = OPAL_SUCCESS;
    char * tmp_env_var = NULL;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
        return ret;
    }

    /*
     * Parse Command Line Arguments
     */
    if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Setup OPAL Output handle from the verbose argument
     */
    if( opal_checkpoint_globals.verbose ) {
        opal_checkpoint_globals.quiet = false; /* Automaticly turn off quiet if it is set */
        opal_checkpoint_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(opal_checkpoint_globals.output, 10);
    } else {
        opal_checkpoint_globals.output = 0; /* Default=STDOUT */
    }

    /*
     * Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /*
     * Select the 'none' CRS component,
     * since we don't actually use a checkpointer
     */
    (void) mca_base_var_env_name("crs", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "none",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /*
     * Initialize OPAL
     */
    if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
        exit_status = ret;
        goto cleanup;
    }

 cleanup:
    return exit_status;
}

static int finalize(void) {
    int ret = OPAL_SUCCESS;

    if (OPAL_SUCCESS != (ret = opal_finalize())) {
        return ret;
    }

    return OPAL_SUCCESS;
}

static int parse_args(int argc, char *argv[]) {
    int i, ret, len;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char * tmp_env_var = NULL;
    char *argv0 = NULL;

    memset(&opal_checkpoint_globals, 0, sizeof(opal_checkpoint_globals_t));

    opal_checkpoint_globals.snapshot_name = NULL;
    opal_checkpoint_globals.snapshot_loc  = NULL;

    /* Parse the command line options */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);

    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        return 1;
    }
    if (opal_checkpoint_globals.help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-opal-checkpoint.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */

    if( NULL == opal_checkpoint_globals.snapshot_name )
        opal_checkpoint_globals.snapshot_name = strdup("");
    if( NULL == opal_checkpoint_globals.snapshot_loc ) {
        opal_checkpoint_globals.snapshot_loc = strdup("");
    }

    /* get the remaining bits */
    argv0 = strdup(argv[0]);
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    if (0 == argc) {
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        free(argv0);
        return OPAL_ERROR;
    }
    free(argv0);

    opal_checkpoint_globals.pid = atoi(argv[0]);
    if ( 0 >= opal_checkpoint_globals.pid ) {
        opal_show_help("help-opal-checkpoint.txt", "invalid_pid", true,
                       opal_checkpoint_globals.pid);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

static int
notify_process_for_checkpoint(pid_t pid, char **fname, int term, opal_crs_state_type_t *cr_state)
{
    char *prog_named_pipe_r = NULL, *prog_named_pipe_w = NULL;
    int   prog_named_read_pipe_fd = -1, prog_named_write_pipe_fd = -1;
    char *loc_fname = NULL, *tmp_pid = NULL;
    unsigned char cmd;
    int len, ret;
    int exit_status = OPAL_SUCCESS;
    int s, max_wait_time = 20; /* wait time before giving up on the checkpoint */
    ssize_t tmp_size = 0;
    int value;

    /* A string copy of the pid */
    asprintf(&tmp_pid, "%d", pid);

    /* Flip the read/write files for bi-directionality */
    asprintf(&prog_named_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
    asprintf(&prog_named_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);

    /*
     * Signal the application telling it that we wish to checkpoint
     */
    if( 0 != (ret = kill(pid, opal_cr_entry_point_signal) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    opal_output_verbose(10, opal_checkpoint_globals.output,
                        "opal_checkpoint: Looking for Named Pipes (%s) (%s)\n",
                        prog_named_pipe_r, prog_named_pipe_w);

    for( s = 0; s < max_wait_time; ++s) {
        /*
         * See if the named pipe exists yet for the PID in question
         */
        if( 0 > (ret = access(prog_named_pipe_r, F_OK) )) {
            /* File doesn't exist yet, keep waiting */
            if( !opal_checkpoint_globals.quiet &&
                s >= max_wait_time - 5 ) {
                opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
                            prog_named_pipe_r, ret, s, max_wait_time);
            }
            sleep(1);
            continue;
        }
        else if( 0 > (ret = access(prog_named_pipe_w, F_OK) )) {
            /* File doesn't exist yet, keep waiting */
            if( !opal_checkpoint_globals.quiet &&
                s >= max_wait_time - 5 ) {
                opal_output(0, "opal-checkpoint: File does not exist yet: <%s> rtn = %d (waited %d/%d sec)\n",
                            prog_named_pipe_w, ret, s, max_wait_time);
            }
            sleep(1);
            continue;
        }
        else {
            break;
        }
    }
    if( s == max_wait_time ) {
        /* The file doesn't exist,
         * This means that the process didn't open up a named pipe for us
         * to access their checkpoint notification routine. Therefore,
         * the application either:
         *  - Doesn't exist
         *  - Isn't checkpointable
         * In either case there is nothing we can do.
         */
        opal_show_help("help-opal-checkpoint.txt", "pid_does_not_exist", true,
                       opal_checkpoint_globals.pid, prog_named_pipe_r, prog_named_pipe_w);

        *cr_state = OPAL_CRS_ERROR;

        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* The file does exist, so let's use it */

    /*
     * Open
     *  - prog_named_write_pipe:
     *    prog makes this file and opens Read Only
     *    this app. opens it Write Only
     *  - prog_named_read_pipe:
     *    prog makes this file and opens Write Only
     *    this app. opens it Read Only
     */
    prog_named_write_pipe_fd = open(prog_named_pipe_w, O_WRONLY);
    if(prog_named_write_pipe_fd < 0) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
                    prog_named_pipe_w, prog_named_write_pipe_fd);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    prog_named_read_pipe_fd = open(prog_named_pipe_r, O_RDWR);
    if(prog_named_read_pipe_fd < 0) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to open name pipe (%s). %d\n",
                    prog_named_pipe_r, prog_named_read_pipe_fd);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Start the handshake
     */
    len = 0;
    if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write handshake to named pipe (%s). %d\n",
                    prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &value, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
                    prog_named_pipe_r, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Check the response to make sure we can checkpoint this process */
    if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == value ) {
        opal_show_help("help-opal-checkpoint.txt",
                       "ckpt:in_progress",
                       true,
                       opal_checkpoint_globals.pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    else if( OPAL_CHECKPOINT_CMD_NULL == value ) {
        opal_show_help("help-opal-checkpoint.txt",
                       "ckpt:req_null",
                       true,
                       opal_checkpoint_globals.pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    else if ( OPAL_CHECKPOINT_CMD_ERROR == value ) {
        opal_show_help("help-opal-checkpoint.txt",
                       "ckpt:req_error",
                       true,
                       opal_checkpoint_globals.pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Write the checkpoint request and information to the
     *  pipe
     */
    cmd = OPAL_CR_CHECKPOINT;
    /* Send the command */
    if( sizeof(cmd) != (ret = write(prog_named_write_pipe_fd, &cmd, sizeof(cmd))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write CHECKPOINT Command to named pipe (%s). %d\n",
                    prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Send the arguments: {pid, term} */
    if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &pid, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write pid (%d) to named pipe (%s). %d\n",
                    pid, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &term, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write term (%d) to named pipe (%s), %d\n",
                    term, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Send the snapshot_name argument */
    len = strlen(opal_checkpoint_globals.snapshot_name) + 1;
    if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write snapshot name len (%d) to named pipe (%s). %d\n",
                    len, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    tmp_size = sizeof(char) * len;
    if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_name), (sizeof(char) * len))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write snapshot name (%s) to named pipe (%s). %d\n",
                    opal_checkpoint_globals.snapshot_name, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Send the snashot location argument */
    len = strlen(opal_checkpoint_globals.snapshot_loc) + 1;
    if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write snapshot location len (%d) to named pipe (%s). %d\n",
                    len, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    tmp_size = sizeof(char) * len;
    if( tmp_size != (ret = write(prog_named_write_pipe_fd, (opal_checkpoint_globals.snapshot_loc), (sizeof(char) * len))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to write snapshot location (%s) to named pipe (%s). %d\n",
                    opal_checkpoint_globals.snapshot_loc, prog_named_pipe_w, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Get the response from the notification routine on the other
     *  machine.
     */
    if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to read length from named pipe (%s). %d\n",
                    prog_named_pipe_r, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if(len > 0) {
        loc_fname = (char *) malloc(sizeof(char) * len);
        if( (ssize_t)(sizeof(char) * len) != (ret = read(prog_named_read_pipe_fd, loc_fname, (sizeof(char) * len))) ) {
            opal_output(opal_checkpoint_globals.output,
                        "opal_checkpoint: Error: Unable to read filename from named pipe (%s). %d\n",
                        prog_named_pipe_w, ret);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    *fname = strdup(loc_fname);
    if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &cr_state, sizeof(int))) ) {
        opal_output(opal_checkpoint_globals.output,
                    "opal_checkpoint: Error: Unable to read state from named pipe (%s). %d\n",
                    prog_named_pipe_r, ret);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

 cleanup:
    /*
     * Close the pipes now that we are done with it
     */
    close(prog_named_write_pipe_fd);
    close(prog_named_read_pipe_fd);

    if( NULL != tmp_pid)
        free(tmp_pid);
    if( NULL != prog_named_pipe_r)
        free(prog_named_pipe_r);
    if( NULL != prog_named_pipe_w)
        free(prog_named_pipe_w);

    return exit_status;
}