/* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** @file * * OPAL Layer Checkpoint/Restart Runtime functions * */ #include "opal_config.h" #include #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #ifdef HAVE_FCNTL_H #include #endif /* HAVE_FCNTL_H */ #ifdef HAVE_SYS_TYPES_H #include #endif /* HAVE_SYS_TYPES_H */ #ifdef HAVE_SYS_STAT_H #include /* for mkfifo */ #endif /* HAVE_SYS_STAT_H */ #include #include "opal/class/opal_object.h" #include "opal/util/opal_environ.h" #include "opal/util/show_help.h" #include "opal/util/output.h" #include "opal/util/malloc.h" #include "opal/util/keyval_parse.h" #include "opal/util/opal_environ.h" #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/memoryhooks/memory.h" #include "opal/mca/base/base.h" #include "opal/runtime/opal_cr.h" #include "opal/runtime/opal.h" #include "opal/constants.h" #include "opal/mca/if/base/base.h" #include "opal/mca/memcpy/base/base.h" #include "opal/mca/memory/base/base.h" #include "opal/mca/timer/base/base.h" #include "opal/threads/mutex.h" #include "opal/threads/threads.h" #include "opal/mca/crs/base/base.h" /****************** * Global Var Decls ******************/ #if OPAL_ENABLE_CRDEBUG == 1 static opal_thread_t **opal_cr_debug_free_threads = NULL; static int opal_cr_debug_num_free_threads = 0; static int opal_cr_debug_threads_already_waiting = false; int MPIR_debug_with_checkpoint = 0; static volatile int MPIR_checkpoint_debug_gate = 0; int opal_cr_debug_signal = 0; #endif bool opal_cr_stall_check = false; bool opal_cr_currently_stalled = false; int opal_cr_output = -1; int opal_cr_verbose = 0; int opal_cr_initalized = 0; static double opal_cr_get_time(void); static void display_indv_timer_core(double diff, char *str); static double timer_start[OPAL_CR_TIMER_MAX]; bool opal_cr_timing_barrier_enabled = false; bool opal_cr_timing_enabled = false; int opal_cr_timing_my_rank = 0; int opal_cr_timing_target_rank = 0; /****************** * Local Functions & Var Decls ******************/ static int extract_env_vars(int prev_pid, char * file_name); static void opal_cr_sigpipe_debug_signal_handler (int signo); static opal_cr_user_inc_callback_fn_t cur_user_coord_callback[OPAL_CR_INC_MAX] = {NULL}; static opal_cr_coord_callback_fn_t cur_coord_callback = NULL; static opal_cr_notify_callback_fn_t cur_notify_callback = NULL; static int core_prev_pid = 0; /****************** * Interface Functions & Vars ******************/ char * opal_cr_pipe_dir = NULL; int opal_cr_entry_point_signal = 0; bool opal_cr_is_enabled = true; bool opal_cr_is_tool = false; /* Current checkpoint state */ int opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE; /* Current checkpoint request channel state */ int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE; static bool opal_cr_debug_sigpipe = false; bool opal_cr_continue_like_restart = false; #if OPAL_ENABLE_FT_THREAD == 1 /***************** * Threading Functions and Variables *****************/ static void* opal_cr_thread_fn(opal_object_t *obj); bool opal_cr_thread_is_done = false; bool opal_cr_thread_is_active = false; bool opal_cr_thread_in_library = false; bool opal_cr_thread_use_if_avail = true; int32_t opal_cr_thread_num_in_library = 0; int opal_cr_thread_sleep_check = 0; int opal_cr_thread_sleep_wait = 0; opal_thread_t opal_cr_thread; opal_mutex_t opal_cr_thread_lock; #if 0 #define OPAL_CR_LOCK() opal_cr_thread_in_library = true; opal_mutex_lock(&opal_cr_thread_lock); #define OPAL_CR_UNLOCK() opal_cr_thread_in_library = false; opal_mutex_unlock(&opal_cr_thread_lock); #define OPAL_CR_THREAD_LOCK() opal_mutex_lock(&opal_cr_thread_lock); #define OPAL_CR_THREAD_UNLOCK() opal_mutex_unlock(&opal_cr_thread_lock); #else /* This technique will potentially starve the thread, but that is OK since * it is only there as support for when the process is not in the MPI library */ static const uint32_t ThreadFlag = 0x1; static const uint32_t ProcInc = 0x2; #define OPAL_CR_LOCK() \ { \ opal_cr_thread_in_library = true; \ OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, ProcInc); \ while( (opal_cr_thread_num_in_library & ThreadFlag ) != 0 ) { \ sched_yield(); \ } \ } #define OPAL_CR_UNLOCK() \ { \ OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, -ProcInc); \ if( opal_cr_thread_num_in_library <= 0 ) { \ opal_cr_thread_in_library = false; \ } \ } #define OPAL_CR_THREAD_LOCK() \ { \ int32_t _tmp_value = 0; \ while(!OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_32 (&opal_cr_thread_num_in_library, &_tmp_value, ThreadFlag)) { \ if( !opal_cr_thread_is_active && opal_cr_thread_is_done) { \ break; \ } \ sched_yield(); \ usleep(opal_cr_thread_sleep_check); \ } \ } #define OPAL_CR_THREAD_UNLOCK() \ { \ OPAL_THREAD_ADD_FETCH32(&opal_cr_thread_num_in_library, -ThreadFlag); \ } #endif #endif /* OPAL_ENABLE_FT_THREAD == 1 */ int opal_cr_set_enabled(bool en) { opal_cr_is_enabled = en; return OPAL_SUCCESS; } static int opal_cr_register (void) { int ret; #if OPAL_ENABLE_CRDEBUG == 1 int t; #endif /* * Some startup MCA parameters */ ret = mca_base_var_register ("opal", "opal", "cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &opal_cr_verbose); if (0 > ret) { return ret; } opal_cr_is_enabled = false; (void) mca_base_var_register("opal", "ft", "cr", "enabled", "Enable fault tolerance for this program", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_is_enabled); opal_cr_timing_enabled = false; (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_enabled); opal_cr_timing_barrier_enabled = false; (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier. Must have opal_cr_enable_timer set. (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, opal_cr_timing_enabled ? MCA_BASE_VAR_FLAG_SETTABLE : 0, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_barrier_enabled); opal_cr_timing_barrier_enabled = opal_cr_timing_barrier_enabled && opal_cr_timing_enabled; (void) mca_base_var_register ("opal", "opal", "cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_target_rank); #if OPAL_ENABLE_FT_THREAD == 1 opal_cr_thread_use_if_avail = false; (void) mca_base_var_register ("opal", "opal", "cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_use_if_avail); opal_cr_thread_sleep_check = 0; (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_sleep_check); opal_cr_thread_sleep_wait = 100; (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 1000)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_sleep_wait); #endif opal_cr_is_tool = false; (void) mca_base_var_register ("opal", "opal", "cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_is_tool); #ifndef __WINDOWS__ opal_cr_entry_point_signal = SIGUSR1; (void) mca_base_var_register ("opal", "opal", "cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_entry_point_signal); opal_cr_debug_sigpipe = false; (void) mca_base_var_register ("opal", "opal", "cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_debug_sigpipe); #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ #if OPAL_ENABLE_CRDEBUG == 1 MPIR_debug_with_checkpoint = 0; (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug", "Enable checkpoint/restart debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &MPIR_debug_with_checkpoint); opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } opal_cr_debug_signal = SIGTSTP; (void) mca_base_var_register ("opal", "opal", "cr", "crdebug_signal", "Checkpoint/Restart signal used to hold threads when debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_debug_signal); #endif opal_cr_pipe_dir = (char *) opal_tmp_directory(); (void) mca_base_var_register ("opal", "opal", "cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_pipe_dir); return OPAL_SUCCESS; } int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } ret = opal_cr_register (); if (OPAL_SUCCESS != ret) { return ret; } if(0 != opal_cr_verbose) { opal_cr_output = opal_output_open(NULL); opal_output_set_verbosity(opal_cr_output, opal_cr_verbose); } opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", opal_cr_verbose); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %s", opal_cr_is_enabled ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %s", opal_cr_is_tool ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", opal_cr_verbose, (opal_cr_debug_sigpipe ? "True" : "False")); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); #if OPAL_ENABLE_FT_THREAD == 1 opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %s", opal_cr_thread_use_if_avail ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. */ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #if OPAL_ENABLE_CRDEBUG == 1 opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s]\n", (MPIR_debug_with_checkpoint ? "True": "False")); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal (Debug): %d", opal_cr_debug_signal); if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) { opal_output(opal_cr_output, "opal_cr: init: Failed to register C/R debug signal (%d)", opal_cr_debug_signal); } #endif opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. */ #if OPAL_ENABLE_FT_CR == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_crs_base_framework, 0))) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_open", ret ); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_select", ret ); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); /* JJH: We really do need this line below since it enables * actual locks for threads. However currently the * upper layers will deadlock if it is enabled. * So hack around the problem for now, while working * on a complete solution. See ticket #2741 for more * details. * opal_set_using_threads(true); */ /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; } int opal_cr_finalize(void) { int exit_status = OPAL_SUCCESS; if( --opal_cr_initalized != 0 ) { if( opal_cr_initalized < 0 ) { return OPAL_ERROR; } return OPAL_SUCCESS; } if( !opal_cr_is_tool ) { #if OPAL_ENABLE_FT_THREAD == 1 if( opal_cr_thread_use_if_avail ) { void *data; /* * Stop the thread */ opal_cr_thread_is_done = true; opal_cr_thread_is_active = false; opal_cr_thread_in_library = true; opal_thread_join(&opal_cr_thread, &data); OBJ_DESTRUCT(&opal_cr_thread); OBJ_DESTRUCT(&opal_cr_thread_lock); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ /* Nothing to do for just process notifications */ opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM; opal_cr_checkpoint_request = OPAL_CR_STATUS_TERM; } #if OPAL_ENABLE_CRDEBUG == 1 if( NULL != opal_cr_debug_free_threads ) { free( opal_cr_debug_free_threads ); opal_cr_debug_free_threads = NULL; } opal_cr_debug_num_free_threads = 0; #endif if (NULL != opal_cr_pipe_dir) { free(opal_cr_pipe_dir); opal_cr_pipe_dir = NULL; } #if OPAL_ENABLE_FT_CR == 1 /* * Close the checkpoint / restart service components */ (void) mca_base_framework_close(&opal_crs_base_framework); #endif return exit_status; } /* * Check if a checkpoint request needs to be operated upon */ void opal_cr_test_if_checkpoint_ready(void) { int ret; if( opal_cr_currently_stalled) { opal_output_verbose(20, opal_cr_output, "opal_cr:opal_test_if_ready: JUMPING to Post Stall stage"); goto STAGE_1; } /* * If there is no checkpoint request to act on * then just return */ if(OPAL_CR_STATUS_REQUESTED != opal_cr_checkpoint_request ) { return; } /* * If we are currently checkpointing: * - If a request is pending then cancel it * - o.w., skip it. */ if(OPAL_CR_STATUS_RUNNING == opal_cr_checkpointing_state ) { if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_IN_PROGRESS) ) ) { opal_output(opal_cr_output, "Error: opal_cr: test_if_checkpoint_ready: Respond [In Progress] Failed. (%d)", ret); } opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE; return; } /* * If no CRS module is loaded return an error */ if (NULL == opal_crs.crs_checkpoint ) { if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_NULL) ) ) { opal_output(opal_cr_output, "Error: opal_cr: test_if_checkpoint_ready: Respond [Not Able/NULL] Failed. (%d)", ret); } opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE; return; } /* * Start the checkpoint */ opal_cr_checkpointing_state = OPAL_CR_STATUS_RUNNING; opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE; STAGE_1: if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_START) ) ) { opal_output(opal_cr_output, "Error: opal_cr: test_if_checkpoint_ready: Respond [Start Ckpt] Failed. (%d)", ret); } return; } /******************************* * Notification Routines *******************************/ int opal_cr_inc_core_prep(void) { int ret; /* * Call User Level INC */ if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_PRE_CRS_PRE_MPI, OPAL_CR_INC_STATE_PREPARE)) ) { return ret; } /* * Use the registered coordination routine */ if(OPAL_SUCCESS != (ret = cur_coord_callback(OPAL_CRS_CHECKPOINT)) ) { if ( OPAL_EXISTS != ret ) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n", OPAL_CRS_CHECKPOINT, ret); } return ret; } /* * Call User Level INC */ if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_PRE_CRS_POST_MPI, OPAL_CR_INC_STATE_PREPARE)) ) { return ret; } core_prev_pid = getpid(); return OPAL_SUCCESS; } int opal_cr_inc_core_ckpt(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, int *state) { int ret, exit_status = OPAL_SUCCESS; OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0); if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid, snapshot, options, (opal_crs_state_type_t *)state))) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret); exit_status = ret; } if(*state == OPAL_CRS_CONTINUE) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1); if(options->term) { *state = OPAL_CRS_TERM; opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM; } else { opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE; } } else { options->term = false; } /* * If restarting read environment stuff that opal-restart left us. */ if(*state == OPAL_CRS_RESTART) { opal_cr_refresh_environ(core_prev_pid); opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE; } return exit_status; } int opal_cr_inc_core_recover(int state) { int ret; opal_cr_user_inc_callback_state_t cb_state; if( opal_cr_checkpointing_state != OPAL_CR_STATUS_TERM && opal_cr_checkpointing_state != OPAL_CR_STATUS_CONTINUE && opal_cr_checkpointing_state != OPAL_CR_STATUS_RESTART_PRE && opal_cr_checkpointing_state != OPAL_CR_STATUS_RESTART_POST ) { if(state == OPAL_CRS_CONTINUE) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1); opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE; } /* * If restarting read environment stuff that opal-restart left us. */ else if(state == OPAL_CRS_RESTART) { opal_cr_refresh_environ(core_prev_pid); opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE; } } /* * Call User Level INC */ if( OPAL_CRS_CONTINUE == state ) { cb_state = OPAL_CR_INC_STATE_CONTINUE; } else if( OPAL_CRS_RESTART == state ) { cb_state = OPAL_CR_INC_STATE_RESTART; } else { cb_state = OPAL_CR_INC_STATE_ERROR; } if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_PRE_MPI, cb_state)) ) { return ret; } /* * Use the registered coordination routine */ if(OPAL_SUCCESS != (ret = cur_coord_callback(state)) ) { if ( OPAL_EXISTS != ret ) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n", state, ret); } return ret; } if(OPAL_SUCCESS != (ret = ompi_trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_POST_MPI, cb_state)) ) { return ret; } #if OPAL_ENABLE_CRDEBUG == 1 opal_cr_debug_clear_current_ckpt_thread(); #endif return OPAL_SUCCESS; } int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, int *state) { int ret, exit_status = OPAL_SUCCESS; /* * INC: Prepare stack using the registered coordination routine */ if(OPAL_SUCCESS != (ret = opal_cr_inc_core_prep() ) ) { return ret; } /* * INC: Take the checkpoint */ if(OPAL_SUCCESS != (ret = opal_cr_inc_core_ckpt(pid, snapshot, options, state) ) ) { exit_status = ret; /* Don't return here since we want to restart the OPAL level stuff */ } /* * INC: Recover stack using the registered coordination routine */ if(OPAL_SUCCESS != (ret = opal_cr_inc_core_recover(*state) ) ) { return ret; } return exit_status; } /******************************* * Coordination Routines *******************************/ /** * Current Coordination callback routines */ int opal_cr_coord(int state) { if(OPAL_CRS_CHECKPOINT == state) { /* Do Checkpoint Phase work */ } else if (OPAL_CRS_CONTINUE == state ) { /* Do Continue Phase work */ } else if (OPAL_CRS_RESTART == state ) { /* Do Restart Phase work */ /* * Re-initialize the event engine * Otherwise it may/will use stale file descriptors which will disrupt * the intended users of the soon-to-be newly assigned file descriptors. */ opal_event_reinit(opal_sync_event_base); /* * Flush if() functionality, since it caches system specific info. */ (void) mca_base_framework_close(&opal_if_base_framework); /* Since opal_ifinit() is not exposed, the necessary * functions will call it when needed. Just make sure we * finalized this code so we don't get old socket addrs. */ opal_output_reopen_all(); } else if (OPAL_CRS_TERM == state ) { /* Do Continue Phase work in prep to terminate the application */ } else { /* We must have been in an error state from the checkpoint * recreate everything, as in the Continue Phase */ } /* * Here we are returning to either: * - [orte | ompi]_notify() */ opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_POST; return OPAL_SUCCESS; } int opal_cr_reg_notify_callback(opal_cr_notify_callback_fn_t new_func, opal_cr_notify_callback_fn_t *prev_func) { /* * Preserve the previous callback */ if( NULL != cur_notify_callback) { *prev_func = cur_notify_callback; } else { *prev_func = NULL; } /* * Update the callbacks */ cur_notify_callback = new_func; return OPAL_SUCCESS; } int opal_cr_user_inc_register_callback(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_fn_t function, opal_cr_user_inc_callback_fn_t *prev_function) { if (event >= OPAL_CR_INC_MAX) { return OPAL_ERROR; } if( NULL != cur_user_coord_callback[event] ) { *prev_function = cur_user_coord_callback[event]; } else { *prev_function = NULL; } cur_user_coord_callback[event] = function; return OPAL_SUCCESS; } int ompi_trigger_user_inc_callback(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_state_t state) { if( NULL == cur_user_coord_callback[event] ) { return OPAL_SUCCESS; } if (event >= OPAL_CR_INC_MAX) { return OPAL_ERROR; } return ((cur_user_coord_callback[event])(event, state)); } int opal_cr_reg_coord_callback(opal_cr_coord_callback_fn_t new_func, opal_cr_coord_callback_fn_t *prev_func) { /* * Preserve the previous callback */ if( NULL != cur_coord_callback) { *prev_func = cur_coord_callback; } else { *prev_func = NULL; } /* * Update the callbacks */ cur_coord_callback = new_func; return OPAL_SUCCESS; } int opal_cr_refresh_environ(int prev_pid) { char *file_name; #if OPAL_ENABLE_CRDEBUG == 1 char *tmp; #endif struct stat file_status; if( 0 >= prev_pid ) { prev_pid = getpid(); } /* * Make sure the file exists. If it doesn't then this means 2 things: * 1) We have already executed this function, and * 2) The file has been deleted on the previous round. */ opal_asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); if (NULL == file_name) { return OPAL_ERR_OUT_OF_RESOURCE; } if(0 != stat(file_name, &file_status) ){ free(file_name); return OPAL_SUCCESS; } #if OPAL_ENABLE_CRDEBUG == 1 mca_base_var_env_name ("opal_cr_enable_crdebug", &tmp); opal_unsetenv(tmp, &environ); free (tmp); #endif extract_env_vars(prev_pid, file_name); #if OPAL_ENABLE_CRDEBUG == 1 MPIR_debug_with_checkpoint = 0; (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug", "Enable checkpoint/restart debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &MPIR_debug_with_checkpoint); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif free(file_name); return OPAL_SUCCESS; } /* * Extract environment variables from a saved file * and place them in the environment. */ static int extract_env_vars(int prev_pid, char * file_name) { int exit_status = OPAL_SUCCESS; FILE *env_data = NULL; int len = OPAL_PATH_MAX; char * tmp_str = NULL; if( 0 >= prev_pid ) { opal_output(opal_cr_output, "opal_cr: extract_env_vars: Invalid PID (%d)\n", prev_pid); exit_status = OPAL_ERROR; goto cleanup; } if (NULL == (env_data = fopen(file_name, "r")) ) { exit_status = OPAL_ERROR; goto cleanup; } tmp_str = (char *) malloc(sizeof(char) * OPAL_PATH_MAX); if( NULL == tmp_str) { exit_status = OPAL_ERR_OUT_OF_RESOURCE; goto cleanup; } /* Extract an env var */ while(!feof(env_data) ) { char **t_set = NULL; if( NULL == fgets(tmp_str, OPAL_PATH_MAX, env_data) ) { exit_status = OPAL_ERROR; goto cleanup; } len = strlen(tmp_str); if(tmp_str[len - 1] == '\n') { tmp_str[len - 1] = '\0'; } else { opal_output(opal_cr_output, "opal_cr: extract_env_vars: Error: Parameter too long (%s)\n", tmp_str); continue; } if( NULL == (t_set = opal_argv_split(tmp_str, '=')) ) { break; } opal_setenv(t_set[0], t_set[1], true, &environ); opal_argv_free(t_set); } cleanup: if( NULL != env_data ) { fclose(env_data); } unlink(file_name); if( NULL != tmp_str ){ free(tmp_str); } return exit_status; } /***************************************** * OPAL CR Entry Point Functionality *****************************************/ /* * Used only for debugging SIGPIPE problems */ static void opal_cr_sigpipe_debug_signal_handler (int signo) { int sleeper = 1; if( !opal_cr_debug_sigpipe ) { opal_output_verbose(10, opal_cr_output, "opal_cr: sigpipe_debug: Debug SIGPIPE Not enabled :(\n"); return; } opal_output(0, "opal_cr: sigpipe_debug: Debug SIGPIPE [%d]: PID (%d)\n", signo, getpid()); while(sleeper == 1 ) { sleep(1); } } #if OPAL_ENABLE_FT_THREAD == 1 static void* opal_cr_thread_fn(opal_object_t *obj) { /* Sanity Check */ if( !opal_cr_thread_use_if_avail ) { return NULL; } if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } /* * Register this thread with the OPAL CRS */ if( NULL != opal_crs.crs_reg_thread ) { if( OPAL_SUCCESS != opal_crs.crs_reg_thread() ) { opal_output(0, "Error: Thread registration failed\n"); return NULL; } } #if OPAL_ENABLE_CRDEBUG == 1 opal_cr_debug_free_threads[1] = opal_thread_get_self(); #endif /* * Wait to become active */ while( !opal_cr_thread_is_active && !opal_cr_thread_is_done) { sched_yield(); } if( opal_cr_thread_is_done ) { return NULL; } /* * While active */ while( opal_cr_thread_is_active && !opal_cr_thread_is_done) { /* * While no threads are in the MPI library then try to process * checkpoint requests. */ OPAL_CR_THREAD_LOCK(); while ( !opal_cr_thread_in_library ) { sched_yield(); usleep(opal_cr_thread_sleep_check); OPAL_CR_TEST_CHECKPOINT_READY(); /* Sanity check */ if( OPAL_UNLIKELY(opal_cr_currently_stalled) ) { OPAL_CR_TEST_CHECKPOINT_READY(); } } /* * While they are in the MPI library yield */ OPAL_CR_THREAD_UNLOCK(); while ( opal_cr_thread_in_library && opal_cr_thread_is_active ) { usleep(opal_cr_thread_sleep_wait); } } return NULL; } void opal_cr_thread_init_library(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } else { /* Activate the CR Thread */ opal_cr_thread_in_library = false; opal_cr_thread_is_done = false; opal_cr_thread_is_active = true; } } void opal_cr_thread_finalize_library(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } else { /* Deactivate the CR Thread */ opal_cr_thread_is_done = true; opal_cr_thread_is_active = false; OPAL_CR_LOCK(); opal_cr_thread_in_library = true; } } void opal_cr_thread_abort_library(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } else { /* Deactivate the CR Thread */ opal_cr_thread_is_done = true; opal_cr_thread_is_active = false; OPAL_CR_LOCK(); opal_cr_thread_in_library = true; } } void opal_cr_thread_enter_library(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } else { /* Lock out the CR Thread */ OPAL_CR_LOCK(); } } void opal_cr_thread_exit_library(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } else { /* Allow CR Thread to continue */ OPAL_CR_UNLOCK(); } } void opal_cr_thread_noop_progress(void) { if( !opal_cr_thread_use_if_avail ) { OPAL_CR_TEST_CHECKPOINT_READY(); } } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ static double opal_cr_get_time() { double wtime; #if OPAL_TIMER_USEC_NATIVE wtime = (double)opal_timer_base_get_usec() / 1000000.0; #else struct timeval tv; gettimeofday(&tv, NULL); wtime = tv.tv_sec; wtime += (double)tv.tv_usec / 1000000.0; #endif return wtime; } void opal_cr_set_time(int idx) { if(idx < OPAL_CR_TIMER_MAX ) { if( timer_start[idx] <= 0.0 ) { timer_start[idx] = opal_cr_get_time(); } } } void opal_cr_clear_timers(void) { int i; for(i = 0; i < OPAL_CR_TIMER_MAX; ++i) { timer_start[i] = 0.0; } } static void display_indv_timer_core(double diff, char *str) { double total = 0; double perc = 0; total = timer_start[OPAL_CR_TIMER_MAX-1] - timer_start[OPAL_CR_TIMER_ENTRY0]; perc = (diff/total) * 100; opal_output(0, "opal_cr: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", str, diff, total, perc); return; } void opal_cr_display_all_timers(void) { double diff = 0.0; char * label = NULL; if( opal_cr_timing_target_rank != opal_cr_timing_my_rank ) { return; } opal_output(0, "OPAL CR Timing: ******************** Summary Begin\n"); /********** Entry into the system **********/ label = strdup("Start Entry Point"); if( opal_cr_timing_barrier_enabled ) { diff = timer_start[OPAL_CR_TIMER_CRCPBR0] - timer_start[OPAL_CR_TIMER_ENTRY0]; } else { diff = timer_start[OPAL_CR_TIMER_CRCP0] - timer_start[OPAL_CR_TIMER_ENTRY0]; } display_indv_timer_core(diff, label); free(label); /********** CRCP Protocol **********/ label = strdup("CRCP Protocol"); if( opal_cr_timing_barrier_enabled ) { diff = timer_start[OPAL_CR_TIMER_CRCPBR1] - timer_start[OPAL_CR_TIMER_CRCP0]; } else { diff = timer_start[OPAL_CR_TIMER_P2P0] - timer_start[OPAL_CR_TIMER_CRCP0]; } display_indv_timer_core(diff, label); free(label); /********** P2P Suspend **********/ label = strdup("P2P Suspend"); if( opal_cr_timing_barrier_enabled ) { diff = timer_start[OPAL_CR_TIMER_P2PBR0] - timer_start[OPAL_CR_TIMER_P2P0]; } else { diff = timer_start[OPAL_CR_TIMER_CORE0] - timer_start[OPAL_CR_TIMER_P2P0]; } display_indv_timer_core(diff, label); free(label); /********** Checkpoint to Disk **********/ label = strdup("Checkpoint"); diff = timer_start[OPAL_CR_TIMER_CORE1] - timer_start[OPAL_CR_TIMER_CORE0]; display_indv_timer_core(diff, label); free(label); /********** P2P Reactivation **********/ label = strdup("P2P Reactivation"); if( opal_cr_timing_barrier_enabled ) { diff = timer_start[OPAL_CR_TIMER_P2PBR2] - timer_start[OPAL_CR_TIMER_CORE1]; } else { diff = timer_start[OPAL_CR_TIMER_CRCP1] - timer_start[OPAL_CR_TIMER_CORE1]; } display_indv_timer_core(diff, label); free(label); /********** CRCP Protocol Finalize **********/ label = strdup("CRCP Cleanup"); if( opal_cr_timing_barrier_enabled ) { diff = timer_start[OPAL_CR_TIMER_COREBR1] - timer_start[OPAL_CR_TIMER_CRCP1]; } else { diff = timer_start[OPAL_CR_TIMER_CORE2] - timer_start[OPAL_CR_TIMER_CRCP1]; } display_indv_timer_core(diff, label); free(label); /********** Exit the system **********/ label = strdup("Finish Entry Point"); diff = timer_start[OPAL_CR_TIMER_ENTRY4] - timer_start[OPAL_CR_TIMER_CORE2]; display_indv_timer_core(diff, label); free(label); opal_output(0, "OPAL CR Timing: ******************** Summary End\n"); } #if OPAL_ENABLE_CRDEBUG == 1 int opal_cr_debug_set_current_ckpt_thread_self(void) { int t; if( NULL == opal_cr_debug_free_threads ) { opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } } opal_cr_debug_free_threads[0] = opal_thread_get_self(); return OPAL_SUCCESS; } int opal_cr_debug_clear_current_ckpt_thread(void) { opal_cr_debug_free_threads[0] = NULL; return OPAL_SUCCESS; } int MPIR_checkpoint_debugger_detach(void) { /* This function is meant to be a noop function for checkpoint/restart * enabled debugging functionality */ #if 0 /* Once the debugger can successfully force threads into the function below, * then we can uncomment this line */ if( MPIR_debug_with_checkpoint ) { opal_cr_debug_threads_already_waiting = true; } #endif return OPAL_SUCCESS; } void MPIR_checkpoint_debugger_signal_handler(int signo) { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_signal_handler(): Enter Debug signal handler..."); MPIR_checkpoint_debugger_waitpoint(); opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_signal_handler(): Leave Debug signal handler..."); } void *MPIR_checkpoint_debugger_waitpoint(void) { int t; opal_thread_t *thr = NULL; thr = opal_thread_get_self(); /* * Sanity check, if the debugger is not going to attach, then do not wait * Make sure to open the debug gate, so that threads can get out */ if( !MPIR_debug_with_checkpoint ) { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_waitpoint(): Debugger is not attaching... (%d)", (int)thr->t_handle); MPIR_checkpoint_debug_gate = 1; return NULL; } else { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_waitpoint(): Waiting for the Debugger to attach... (%d)", (int)thr->t_handle); MPIR_checkpoint_debug_gate = 0; } /* * Let special threads escape without waiting, they will wait later */ for(t = 0; t < opal_cr_debug_num_free_threads; ++t) { if( opal_cr_debug_free_threads[t] != NULL && opal_thread_self_compare(opal_cr_debug_free_threads[t]) ) { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_waitpoint(): Checkpointing thread does not wait here... (%d)", (int)thr->t_handle); return NULL; } } /* * Force all other threads into the waiting function, * unless they are already in there, then just return so we do not nest * calls into this wait function and potentially confuse the debugger. */ if( opal_cr_debug_threads_already_waiting ) { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_waitpoint(): Threads are already waiting from debugger detach, do not wait here... (%d)", (int)thr->t_handle); return NULL; } else { opal_output_verbose(1, opal_cr_output, "crs: MPIR_checkpoint_debugger_waitpoint(): Wait... (%d)", (int)thr->t_handle); return MPIR_checkpoint_debugger_breakpoint(); } } /* * A tight loop to wait for debugger to release this process from the * breakpoint. */ void *MPIR_checkpoint_debugger_breakpoint(void) { /* spin until debugger attaches and releases us */ while (MPIR_checkpoint_debug_gate == 0) { #if defined(HAVE_USLEEP) usleep(100000); /* microseconds */ #else sleep(1); /* seconds */ #endif } opal_cr_debug_threads_already_waiting = false; return NULL; } #endif