/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2011-2012 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 * ORTE Checkpoint Tool for checkpointing a multiprocess job
 *
 */

#include "orte_config.h"
#include "orte/constants.h"

/*
 * NOTE(review): the system header names inside <...> were missing from the
 * text under review.  The guarded ones are restored from their HAVE_*
 * guards; the two unguarded ones are presumed to be <stdio.h> and
 * <errno.h> -- confirm against the repository copy.
 */
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif  /* HAVE_STDLIB_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif  /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif  /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>  /* for mkfifo */
#endif  /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif  /* HAVE_SYS_WAIT_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif  /* HAVE_STRING_H */

#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"

#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"

#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"

#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"

#include MCA_timer_IMPLEMENTATION_HEADER

/******************
 * Local Functions
 ******************/
static int ckpt_init(int argc, char *argv[]); /* Initialization routine */
static int ckpt_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]);
static int find_hnp(void);

static int start_listener(void);
static int stop_listener(void);

static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata);

static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer);

static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options);
static int pretty_print_status(void);
static int pretty_print_reference(void);

static int list_all_snapshots(void);

/* HNP selected by find_hnp(); NULL until a match is found */
static orte_hnp_contact_t *orterun_hnp = NULL;
/* Most recent snapshot handle / sequence number unpacked from a status update */
static char * global_snapshot_handle = NULL;
static int    global_sequence_num    = 0;

/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
static bool listener_started = false;
static bool is_checkpoint_finished = false;
static bool is_checkpoint_established = false;
static bool is_checkpoint_recovered   = false;

/* Timestamps (seconds) used by pretty_print_status() */
static double timer_start = 0;
static double timer_last  = 0;
static double get_time(void);

typedef struct {
    bool help;
    int  pid;
    opal_crs_base_ckpt_options_t *options;
    bool term;
    bool stop;
    bool verbose;
    int  verbose_level;
    orte_jobid_t req_hnp; /**< User Requested HNP */
    bool nowait;  /* Do not wait for checkpoint to complete before returning */
    bool status;  /* Display status messages while checkpoint is progressing */
    int output;
    int ckpt_status;
    bool list_only; /* List available checkpoints only */
#if OPAL_ENABLE_CRDEBUG == 1
    bool enable_crdebug; /* Enable C/R Debugging */
    bool attach_debugger;
    bool detach_debugger;
#endif
} orte_checkpoint_globals_t;

orte_checkpoint_globals_t orte_checkpoint_globals;

opal_cmd_line_init_t cmd_line_opts[] = {
    { NULL, NULL, NULL,
      'h', NULL, "help",
      0,
      &orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL,
      'v', NULL, "verbose",
      0,
      &orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be Verbose" },

    { NULL, NULL, NULL,
      'V', NULL, NULL,
      1,
      &orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
      "Set the verbosity level (For additional debugging information)" },

    { NULL, NULL, NULL,
      '\0', NULL, "term",
      0,
      &(orte_checkpoint_globals.term), OPAL_CMD_LINE_TYPE_BOOL,
      "Terminate the application after checkpoint (Cannot be used with --stop)" },

    { NULL, NULL, NULL,
      '\0', NULL, "stop",
      0,
      &(orte_checkpoint_globals.stop), OPAL_CMD_LINE_TYPE_BOOL,
      "Send SIGSTOP to application just after checkpoint (checkpoint will not finish until SIGCONT is sent) (Cannot be used with --term)" },

    { NULL, NULL, NULL,
      'w', NULL, "nowait",
      0,
      &orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL,
      "Do not wait for the application to finish checkpointing before returning" },

    { NULL, NULL, NULL,
      's', NULL, "status",
      0,
      &orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL,
      "Display status messages describing the progression of the checkpoint" },

    { "hnp-jobid", NULL, NULL,
      '\0', NULL, "hnp-jobid",
      1,
      &orte_checkpoint_globals.req_hnp, OPAL_CMD_LINE_TYPE_INT,
      "This should be the jobid of the HNP whose applications you wish "
      "to checkpoint." },

    { "hnp-pid", NULL, NULL,
      '\0', NULL, "hnp-pid",
      1,
      &orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT,
      "This should be the pid of the mpirun whose applications you wish "
      "to checkpoint." },

    { NULL, NULL, NULL,
      'l', NULL, "list",
      0,
      &orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL,
      "Display a list of checkpoint files available on this machine" },

#if OPAL_ENABLE_CRDEBUG == 1
    { NULL, NULL, NULL,
      '\0', "crdebug", "crdebug",
      0,
      &orte_checkpoint_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable C/R Enhanced Debugging" },

    { NULL, NULL, NULL,
      '\0', "attach", "attach",
      0,
      &(orte_checkpoint_globals.attach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
      "Wait for the debugger to attach directly after taking the checkpoint." },

    { NULL, NULL, NULL,
      '\0', "detach", "detach",
      0,
      &(orte_checkpoint_globals.detach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
      "Do not wait for the debugger to reattach after taking the checkpoint." },
#endif

    /* End of list */
    { NULL, NULL, NULL,
      '\0', NULL, NULL,
      0,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      NULL }
};

/**
 * Tool entry point.
 *
 * Initializes the tool, then either lists the available snapshots
 * (--list) or locates the requested HNP, sends it a checkpoint request
 * and waits for the request to finish.
 *
 * @return ORTE_SUCCESS on success; otherwise the first error code
 *         encountered (an error from finalization is reported only when
 *         nothing failed earlier).
 */
int
main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*************************************
     * Listing only Checkpoint References
     *************************************/
    if( orte_checkpoint_globals.list_only ) {
        if (ORTE_SUCCESS != (ret = list_all_snapshots())) {
            exit_status = ret;
            goto cleanup;
        }
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /***************************
     * Find the HNP that we want to connect to, if it exists
     ***************************/
    if (ORTE_SUCCESS != (ret = find_hnp())) {
        /* Error printed by called function */
        exit_status = ret;
        goto cleanup;
    }

    /*******************************
     * Checkpoint the requested PID
     *******************************/
    is_checkpoint_finished    = false;
    is_checkpoint_recovered   = false;
    is_checkpoint_established = false;

    if( orte_checkpoint_globals.verbose ) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "orte_checkpoint: Checkpointing...");
        if (0 < orte_checkpoint_globals.pid) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t PID %d",
                                orte_checkpoint_globals.pid);
        } else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Mpirun (%s)",
                                ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
        }

        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Connected to Mpirun %s",
                            ORTE_NAME_PRINT(&orterun_hnp->name));

        if(orte_checkpoint_globals.options->term) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Terminating after checkpoint\n");
        }
        if(orte_checkpoint_globals.options->stop) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Stopping after checkpoint\n");
        }
    }

    if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.options)) ) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ret);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Wait for the checkpoint to complete
     */
    if(!orte_checkpoint_globals.nowait) {
        while( !is_checkpoint_finished ) {
            opal_progress();
        }
    }

    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR   == orte_checkpoint_globals.ckpt_status ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if(!orte_checkpoint_globals.nowait) {
        pretty_print_reference();
    }

 cleanup:
    /***************
     * Cleanup
     ***************/
    ret = ckpt_finalize();
    /* Keep the first failure: a finalize error must not hide the reason
     * the run failed in the first place. */
    if (ORTE_SUCCESS != ret && ORTE_SUCCESS == exit_status) {
        exit_status = ret;
    }

    return exit_status;
}

/**
 * Reset orte_checkpoint_globals to its defaults and fill it in from the
 * command line.
 *
 * MCA arguments found on the command line are pushed into the
 * environment.  The first remaining (tail) argument, when present, is
 * interpreted as the PID of the mpirun to checkpoint.  With --help the
 * usage text is printed and the process exits with status 0.
 *
 * @param argc  Argument count as passed to the tool
 * @param argv  Argument vector as passed to the tool
 *
 * @return ORTE_SUCCESS on success, 1 on a command line error or when
 *         there is nothing to do, ORTE_ERROR on an invalid PID or when
 *         the build has no checkpoint/restart support.
 */
static int parse_args(int argc, char *argv[]) {
    int i, ret, len, exit_status = ORTE_SUCCESS ;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char * tmp_env_var = NULL;
    char *argv0 = NULL;

    /* Init structure */
    memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
    orte_checkpoint_globals.help     = false;
    orte_checkpoint_globals.pid      = -1;
    orte_checkpoint_globals.verbose  = false;
    orte_checkpoint_globals.verbose_level  = 0;
    orte_checkpoint_globals.req_hnp  = ORTE_JOBID_INVALID;
    orte_checkpoint_globals.nowait   = false;
    orte_checkpoint_globals.status   = false;
    orte_checkpoint_globals.output   = -1;
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
    orte_checkpoint_globals.list_only   = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.enable_crdebug = false;
#endif

    orte_checkpoint_globals.options = OBJ_NEW(opal_crs_base_ckpt_options_t);
    orte_checkpoint_globals.term     = false;
    orte_checkpoint_globals.stop     = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.attach_debugger = false;
    orte_checkpoint_globals.detach_debugger = false;
#endif

    /*
     * Create the command line object before anything reads it: the
     * "no C/R support" path below asks it for the usage message.
     */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

#if OPAL_ENABLE_FT_CR == 0
    /* Warn and exit if not configured with Checkpoint/Restart */
    {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage-no-cr",
                                    true, args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
#endif

    /* Parse the command line options */
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, false, argc, argv);

    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        exit_status = 1;
        goto cleanup;
    }

    if (orte_checkpoint_globals.help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */

    /* get the remaining bits; argv[0] is saved first because argc/argv
     * are replaced by the tail of the command line */
    argv0 = strdup(argv[0]);
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    if(orte_checkpoint_globals.list_only ) {
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    if (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp) {
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        exit_status = 1;
        goto cleanup;
    }

    orte_checkpoint_globals.options->term = orte_checkpoint_globals.term;
    orte_checkpoint_globals.options->stop = orte_checkpoint_globals.stop;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.options->attach_debugger = orte_checkpoint_globals.attach_debugger;
    orte_checkpoint_globals.options->detach_debugger = orte_checkpoint_globals.detach_debugger;
#endif

    if(orte_checkpoint_globals.verbose_level < 0 ) {
        orte_checkpoint_globals.verbose_level = 0;
    }

    if(orte_checkpoint_globals.verbose_level > 0) {
        orte_checkpoint_globals.verbose = true;
    }

    /*
     * If the user did not supply an hnp jobid, then they must
     * supply the PID of MPIRUN
     */
    if(0 >= argc &&
       ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp) {
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /*
     * Parse the PID strictly: the whole argument must be a decimal
     * number that fits in an int.  Anything else is recorded as 0 so the
     * check below rejects it.
     */
    {
        char *endptr = NULL;
        long pid_val = strtol(argv[0], &endptr, 10);

        if (endptr == argv[0] || '\0' != *endptr ||
            (long)(int)pid_val != pid_val) {
            orte_checkpoint_globals.pid = 0;
        } else {
            orte_checkpoint_globals.pid = (int)pid_val;
        }
    }
    if ( 0 >= orte_checkpoint_globals.pid ) {
        opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
                       orte_checkpoint_globals.pid);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * JJH: No wait is currently not implemented or tested
     */
    if(orte_checkpoint_globals.nowait) {
        orte_checkpoint_globals.nowait = false;
        opal_show_help("help-orte-checkpoint.txt", "not_impl",
                       true,
                       "Disconnected checkpoint");
    }

    if(orte_checkpoint_globals.verbose) {
        orte_checkpoint_globals.status = true;
    }

 cleanup:
    if (NULL != argv0) {
        free(argv0);
    }

    return exit_status;
}

/*
 * This function attempts to find an HNP to connect to.
 *
 * The local HNP list is searched for an entry whose jobid equals the
 * requested jobid or whose pid equals the requested pid.  On a match the
 * entry is kept in orterun_hnp; every other entry is released.
 *
 * Returns ORTE_SUCCESS when an HNP was selected, the error from
 * orte_list_local_hnps() when the list could not be built, and
 * ORTE_ERROR when no entry matched.
 */
static int find_hnp(void) {
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t hnp_list;
    opal_list_item_t *item;
    orte_hnp_contact_t *hnpcandidate;

    /* get the list of local hnp's available to us and setup
     * contact info for them into the RML
     */
    OBJ_CONSTRUCT(&hnp_list, opal_list_t);
    if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
        opal_show_help("help-orte-checkpoint.txt", "no_hnps", true,
                       orte_checkpoint_globals.pid,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir,
                       ret, ORTE_ERROR_NAME(ret));
        exit_status = ret;
        goto cleanup;
    }

    /* search the list for the desired hnp */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        hnpcandidate = (orte_hnp_contact_t*)item;
        if (hnpcandidate->name.jobid == orte_checkpoint_globals.req_hnp ||
            hnpcandidate->pid        == orte_checkpoint_globals.pid) {
            /* this is the one we want */
            orterun_hnp = hnpcandidate;
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /* Not the one we want: it is already off the list, so it has to
         * be released here or it is lost */
        OBJ_RELEASE(item);
    }

    /* If no match was found, error out */
    opal_show_help("help-orte-checkpoint.txt", "no_universe", true,
                   orte_checkpoint_globals.pid,
                   orte_process_info.tmpdir_base,
                   orte_process_info.top_session_dir);

 cleanup:
    /* Release whatever is still on the list */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&hnp_list);

    if( NULL == orterun_hnp ) {
        return ORTE_ERROR;
    } else {
        return exit_status;
    }
}

/**
 * Initialize the tool: utility layer, command line, ORTE (as a tool
 * process), the output stream and the status listener.
 *
 * @param argc  Argument count as passed to main()
 * @param argv  Argument vector as passed to main()
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int ckpt_init(int argc, char *argv[]) {
    int exit_status = ORTE_SUCCESS, ret;
    char * tmp_env_var = NULL;

    listener_started = false;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
        return ret;
    }

    /*
     * Parse Command Line Arguments
     */
    if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
        return ret;
    }

    /* Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the none component, since we don't actually use a checkpointer */
    tmp_env_var = mca_base_param_env_var("crs");
    opal_setenv(tmp_env_var,
                "none",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /***************************
     * We need all of OPAL and the TOOLS portion of ORTE - this
     * sets us up so we can talk to any HNP over the wire
     ***************************/
    if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Setup ORTE Output handle from the verbose argument
     */
    if( orte_checkpoint_globals.verbose ) {
        orte_checkpoint_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_checkpoint_globals.output,
                                  orte_checkpoint_globals.verbose_level);
    } else {
        orte_checkpoint_globals.output = 0; /* Default=STDERR */
    }

    /*
     * Start the listener
     */
    if( ORTE_SUCCESS != (ret = start_listener() ) ) {
        exit_status = ret;
    }

 cleanup:
    return exit_status;
}

/**
 * Tear the tool down: stop the listener, then finalize ORTE.
 *
 * Both steps are always attempted.
 *
 * @return ORTE_SUCCESS, or the error code of the last step that failed.
 */
static int ckpt_finalize(void) {
    int exit_status = ORTE_SUCCESS, ret;

    /*
     * Stop the listener
     */
    if( ORTE_SUCCESS != (ret = stop_listener() ) ) {
        exit_status = ret;
    }

    if (ORTE_SUCCESS != (ret = orte_finalize())) {
        exit_status = ret;
    }

    return exit_status;
}

/**
 * Post a persistent non-blocking receive on ORTE_RML_TAG_CKPT from any
 * sender, handled by hnp_receiver().
 *
 * @return ORTE_SUCCESS, or the error returned when posting the receive.
 */
static int start_listener(void)
{
    int ret, exit_status = ORTE_SUCCESS;

    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                       ORTE_RML_TAG_CKPT,
                                                       ORTE_RML_PERSISTENT,
                                                       hnp_receiver,
                                                       NULL))) {
        exit_status = ret;
        goto cleanup;
    }

    listener_started = true;

 cleanup:
    return exit_status;
}

/**
 * Cancel the receive posted by start_listener().
 *
 * @return ORTE_SUCCESS; ORTE_ERROR when the listener was never started;
 *         otherwise the error returned when cancelling the receive.
 */
static int stop_listener(void)
{
    int ret, exit_status = ORTE_SUCCESS;

    if( !listener_started ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
                                                    ORTE_RML_TAG_CKPT))) {
        exit_status = ret;
        goto cleanup;
    }

    listener_started = false;

 cleanup:
    return exit_status;
}

/**
 * RML callback for messages on ORTE_RML_TAG_CKPT.
 *
 * Unpacks the command flag and dispatches on it: status updates go to
 * process_ckpt_update_cmd(), INIT and TERM commands are ignored, and any
 * other value is logged as out of bounds.
 *
 * @param status  Unused
 * @param sender  Name of the sending process
 * @param buffer  Message payload
 * @param tag     Unused
 * @param cbdata  Unused
 */
static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata)
{
    orte_snapc_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    opal_output_verbose(5, orte_checkpoint_globals.output,
                        "orte_checkpoint: hnp_receiver: Receive a command message.");

    /*
     * Otherwise this is an inter-coordinator command (usually updating state info).
     */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_CMD))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    switch (command) {
        case ORTE_SNAPC_GLOBAL_UPDATE_CMD:
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "orte_checkpoint: hnp_receiver: Status Update.");

            process_ckpt_update_cmd(sender, buffer);
            break;

        case ORTE_SNAPC_GLOBAL_INIT_CMD:
        case ORTE_SNAPC_GLOBAL_TERM_CMD:
            /* Do Nothing */
            break;

        default:
            ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
    }
}

/**
 * Handle one checkpoint status update.
 *
 * Unpacks the new checkpoint state and, for the RECOVERED, ESTABLISHED,
 * STOPPED and ERROR states, the snapshot handle and sequence number.
 * The globals that main() polls are updated: is_checkpoint_finished is
 * set on NO_CKPT, ERROR and STOPPED, and once both RECOVERED and
 * ESTABLISHED have been seen.
 *
 * @param sender  Name of the sending process (unused)
 * @param buffer  Remaining message payload
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;

    /*
     * Receive the data:
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     */
    count = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == orte_checkpoint_globals.ckpt_status ) {
        /* Unpacking stores a new string pointer, so drop the handle kept
         * from an earlier update first */
        if( NULL != global_snapshot_handle ) {
            free(global_snapshot_handle);
            global_snapshot_handle = NULL;
        }

        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING)) ) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
                       true,
                       orte_checkpoint_globals.pid);
        is_checkpoint_finished = true;
        goto cleanup;
    }

    if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ORTE_ERROR);
        is_checkpoint_finished = true;
        goto cleanup;
    }

    /* Status progression */
    if( orte_checkpoint_globals.status ) {
        pretty_print_status();
    }

    if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_finished = true;
        goto cleanup;
    }

    /* Normal termination check: finished once both RECOVERED and
     * ESTABLISHED have arrived, in either order */
    if( (ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) ||
        (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){
        is_checkpoint_finished = true;
        goto cleanup;
    }
    else if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_recovered = true;
    }
    else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_established = true;
    }

 cleanup:
    return;
}

/**
 * Send a checkpoint request to the HNP selected by find_hnp().
 *
 * The message carries the INIT command, the checkpoint options and a
 * jobid (always ORTE_JOBID_INVALID here).  timer_start is set just
 * before the message is built.
 *
 * @param options  Checkpoint options to pack into the request
 *
 * @return ORTE_SUCCESS, ORTE_ERROR when the buffer cannot be allocated,
 *         or the error from packing/sending.  On any failure the
 *         "unable_to_connect" help message is shown.
 */
static int
notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_INIT_CMD;
    orte_jobid_t jobid = ORTE_JOBID_INVALID;

    if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_checkpoint_globals.pid);

    timer_start = get_time();

    /***********************************
     * Notify HNP of checkpoint request
     * Send:
     * - Command
     * - options
     * - jobid
     ***********************************/
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if( ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) {
        exit_status = ret;
        goto cleanup;
    }

    if ( 0 > (ret = orte_rml.send_buffer(&(orterun_hnp->name), buffer, ORTE_RML_TAG_CKPT, 0)) ) {
        exit_status = ret;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
                        ORTE_JOBID_PRINT(jobid));

 cleanup:
    if( NULL != buffer) {
        OBJ_RELEASE(buffer);
        buffer = NULL;
    }

    if( ORTE_SUCCESS != exit_status ) {
        opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
                       orte_checkpoint_globals.pid);
    }

    return exit_status;
}

/***************
 * Pretty Print
 ***************/

/**
 * Current time in seconds as a double, taken from the native
 * microsecond timer when OPAL_TIMER_USEC_NATIVE is set and from
 * gettimeofday() otherwise.
 */
static double get_time(void) {
    double wtime;

#if OPAL_TIMER_USEC_NATIVE
    wtime = (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval tv;
    gettimeofday(&tv, NULL);
    wtime = tv.tv_sec;
    wtime += (double)tv.tv_usec / 1000000.0;
#endif

    return wtime;
}

/**
 * Print one status line: time since the previous status line, time
 * since the request was sent, the state name and, when known, the
 * snapshot handle.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_status(void) {
    char * state_str = NULL;
    double cur_time;

    cur_time = get_time();

    /* First status line: there is no previous line to measure from */
    if( timer_last == 0 ) {
        timer_last = cur_time;
    }

    orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);

    if( NULL != global_snapshot_handle ) {
        opal_output(0, "[%6.2f / %6.2f] %*s - %s\n",
                    (cur_time - timer_last), (cur_time - timer_start),
                    25, state_str, global_snapshot_handle);
    } else {
        opal_output(0, "[%6.2f / %6.2f] %*s - ...\n",
                    (cur_time - timer_last), (cur_time - timer_start),
                    25, state_str);
    }

    if( NULL != state_str) {
        free(state_str);
    }

    timer_last = cur_time;

    return ORTE_SUCCESS;
}

/**
 * Print the sequence number and snapshot handle of the finished
 * checkpoint.  A handle that was never received is printed as "(null)"
 * rather than handing a NULL pointer to printf.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_reference(void) {
    const char *handle_str =
        (NULL != global_snapshot_handle) ? global_snapshot_handle : "(null)";

#if OPAL_ENABLE_CRDEBUG == 1
    if( orte_checkpoint_globals.enable_crdebug ) {
        printf("Checkpoint handle: -s %3d %s\n",
               global_sequence_num,
               handle_str);
        return ORTE_SUCCESS;
    }
#endif

    printf("Snapshot Ref.: %3d %s\n",
           global_sequence_num,
           handle_str);

    return ORTE_SUCCESS;
}

/**
 * Print every snapshot reference returned by
 * orte_sstore_base_get_all_snapshots() together with its sequence
 * numbers.
 *
 * @return ORTE_SUCCESS, ORTE_ERROR when the list cannot be allocated, or
 *         the error from the sstore query that failed.
 */
static int list_all_snapshots(void)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t *all_snapshots = NULL;
    opal_list_item_t* item = NULL;
    orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;
    char *seq_list = NULL;
    int s;

    all_snapshots = OBJ_NEW(opal_list_t);
    if( NULL == all_snapshots ) {
        return ORTE_ERROR;
    }

    if( ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL)) ) {
        opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
                    orte_sstore_base_global_snapshot_dir);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * For each reference
     */
    for(item  = opal_list_get_first(all_snapshots);
        item != opal_list_get_end(all_snapshots);
        item  = opal_list_get_next(item) ) {
        global_snapshot = (orte_sstore_base_global_snapshot_info_t*)item;

        /*
         * Get a list of valid sequence numbers
         */
        if( ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot,
                                                                      &(global_snapshot->num_seqs),
                                                                      &(global_snapshot->all_seqs)))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        s = 0; /* Silence a compiler warning */
#if OPAL_ENABLE_CRDEBUG == 1
        /* Pretty print the result - C/R Debug version */
        if( orte_checkpoint_globals.enable_crdebug ) {
            for(s = 0; s < global_snapshot->num_seqs; ++s) {
                printf("-s %s %s\n",
                       global_snapshot->all_seqs[s],
                       global_snapshot->reference);
            }
        }
        else
#endif
        {
            /* Pretty print the result */
            printf("Snapshot Ref.: %s\t[", global_snapshot->reference);
            if( 0 >= global_snapshot->num_seqs ) {
                printf("No Valid Checkpoints");
            } else {
                /* The joined string is ours to free once printed */
                seq_list = opal_argv_join(global_snapshot->all_seqs, ',');
                if( NULL != seq_list ) {
                    printf("%s", seq_list);
                    free(seq_list);
                    seq_list = NULL;
                }
            }
            printf("]\n");
        }
    }

 cleanup:
    while (NULL != (item = opal_list_remove_first(all_snapshots))) {
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(all_snapshots);

    return exit_status;
}