2007-03-17 02:11:45 +03:00
|
|
|
/*
|
2009-04-30 20:55:39 +04:00
|
|
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
2007-03-17 02:11:45 +03:00
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
2007-10-09 18:32:36 +04:00
|
|
|
#include <errno.h>
|
2007-03-17 02:11:45 +03:00
|
|
|
#include <sys/types.h>
|
2007-04-01 20:16:54 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2007-03-17 02:11:45 +03:00
|
|
|
#include <unistd.h>
|
2007-04-01 20:16:54 +04:00
|
|
|
#endif /* HAVE_UNISTD_H */
|
2007-10-09 00:53:02 +04:00
|
|
|
#ifdef HAVE_FCNTL_H
|
|
|
|
#include <fcntl.h>
|
|
|
|
#endif /* HAVE_FCNTL_H */
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
|
|
#ifdef HAVE_SYS_STAT_H
|
|
|
|
#include <sys/stat.h> /* for mkfifo */
|
|
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
|
|
#ifdef HAVE_SIGNAL_H
|
|
|
|
#include <signal.h>
|
|
|
|
#endif
|
2009-05-07 20:14:59 +04:00
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2008-09-22 20:49:56 +04:00
|
|
|
#include "orte/runtime/orte_cr.h"
|
2009-02-20 06:16:13 +03:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
2009-04-29 10:57:49 +04:00
|
|
|
#include "opal/runtime/opal_cr.h"
|
2009-02-14 05:26:12 +03:00
|
|
|
#include "opal/util/output.h"
|
2009-05-09 00:51:13 +04:00
|
|
|
#include "opal/event/event.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
|
|
#include "opal/mca/crs/base/base.h"
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/util/name_fns.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
#include "orte/mca/snapc/snapc.h"
|
|
|
|
#include "orte/mca/snapc/base/base.h"
|
2009-05-09 00:51:13 +04:00
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/rml/rml_types.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
#include "snapc_full.h"
|
|
|
|
|
|
|
|
/************************************
|
|
|
|
* Locally Global vars & functions :)
|
|
|
|
************************************/
|
2007-10-09 00:53:02 +04:00
|
|
|
static void snapc_full_app_signal_handler (int signo);
|
|
|
|
static int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp);
|
2009-09-22 22:26:12 +04:00
|
|
|
static int app_notify_resp_stage_1(opal_cr_ckpt_cmd_state_t resp,
|
|
|
|
opal_crs_base_ckpt_options_t *options);
|
2009-05-09 00:51:13 +04:00
|
|
|
static int app_notify_resp_stage_2(int cr_state );
|
|
|
|
static int app_notify_resp_stage_3(int cr_state);
|
2007-10-09 00:53:02 +04:00
|
|
|
static int snapc_full_app_notify_reopen_files(void);
|
2009-09-22 22:26:12 +04:00
|
|
|
static int snapc_full_app_ckpt_handshake_start(opal_crs_base_ckpt_options_t *options,
|
|
|
|
opal_cr_ckpt_cmd_state_t resp);
|
2007-10-09 00:53:02 +04:00
|
|
|
static int snapc_full_app_ckpt_handshake_end(int cr_state);
|
|
|
|
|
|
|
|
static char *app_comm_pipe_r = NULL;
|
|
|
|
static char *app_comm_pipe_w = NULL;
|
|
|
|
static int app_comm_pipe_r_fd = -1;
|
|
|
|
static int app_comm_pipe_w_fd = -1;
|
|
|
|
|
|
|
|
static opal_crs_base_snapshot_t *local_snapshot = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
static int app_cur_epoch = -1;
|
|
|
|
static int app_last_epoch = -1;
|
|
|
|
static bool app_split_ckpt = false;
|
|
|
|
static bool app_notif_processed = false;
|
|
|
|
|
|
|
|
static char * app_cur_global_ref = NULL;
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/************************
|
|
|
|
* Function Definitions
|
|
|
|
************************/
|
|
|
|
|
|
|
|
int app_coord_init() {
|
|
|
|
int exit_status = ORTE_SUCCESS;
|
2007-10-09 00:53:02 +04:00
|
|
|
opal_cr_notify_callback_fn_t prev_notify_func;
|
|
|
|
char *tmp_pid = NULL;
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) Initalized for Application %s\n",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/*
|
2008-02-28 04:57:57 +03:00
|
|
|
* Register the INC notification callback
|
2007-10-09 00:53:02 +04:00
|
|
|
*/
|
|
|
|
opal_cr_reg_notify_callback(snapc_full_app_notify_response, &prev_notify_func);
|
|
|
|
|
|
|
|
/* String representation of the PID */
|
|
|
|
asprintf(&tmp_pid, "%d", getpid());
|
|
|
|
|
|
|
|
asprintf(&app_comm_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
|
|
|
|
asprintf(&app_comm_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup a signal handler to catch and start the proper thread
|
|
|
|
* to handle the checkpoint
|
|
|
|
*/
|
|
|
|
if( SIG_ERR == signal(opal_cr_entry_point_signal, snapc_full_app_signal_handler) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-03-08 00:10:14 +03:00
|
|
|
"App) init: Error: Failed to register signal %d\n",
|
|
|
|
opal_cr_entry_point_signal);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(OPAL_ERROR);
|
2007-10-09 00:53:02 +04:00
|
|
|
exit_status = OPAL_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((15, mca_snapc_full_component.super.output_handle,
|
2008-03-08 00:10:14 +03:00
|
|
|
"app) Named Pipes (%s) (%s), Signal (%d)",
|
|
|
|
app_comm_pipe_r, app_comm_pipe_w, opal_cr_entry_point_signal));
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
cleanup:
|
|
|
|
if( NULL != tmp_pid) {
|
|
|
|
free(tmp_pid);
|
|
|
|
tmp_pid = NULL;
|
|
|
|
}
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
int app_coord_finalize() {
|
2007-10-09 00:53:02 +04:00
|
|
|
/*
|
|
|
|
* Cleanup named pipes
|
|
|
|
*/
|
|
|
|
if( NULL != app_comm_pipe_r) {
|
|
|
|
free(app_comm_pipe_r);
|
|
|
|
app_comm_pipe_r = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( NULL != app_comm_pipe_w) {
|
|
|
|
free(app_comm_pipe_w);
|
|
|
|
app_comm_pipe_w = NULL;
|
|
|
|
}
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/******************
|
|
|
|
* Local functions
|
|
|
|
******************/
|
2007-10-09 00:53:02 +04:00
|
|
|
static void snapc_full_app_signal_handler (int signo)
|
|
|
|
{
|
|
|
|
if( opal_cr_entry_point_signal != signo ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((1, mca_snapc_full_component.super.output_handle,
|
2008-03-06 17:04:25 +03:00
|
|
|
"App) signal_handler: Received unknown signal %d",
|
|
|
|
signo));
|
2007-10-09 00:53:02 +04:00
|
|
|
/* Not our signal */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Signal thread to start checkpoint handshake
|
|
|
|
*/
|
|
|
|
opal_cr_checkpoint_request = OPAL_CR_STATUS_REQUESTED;
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) signal_handler: Receive Checkpoint Request."));
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Respond to an asynchronous checkpoint request
|
|
|
|
*/
|
|
|
|
int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
|
|
|
{
|
2009-09-22 22:26:12 +04:00
|
|
|
opal_crs_base_ckpt_options_t *options = NULL;
|
2007-10-09 00:53:02 +04:00
|
|
|
static int cr_state;
|
|
|
|
int app_pid;
|
|
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
|
|
|
|
if( opal_cr_currently_stalled ) {
|
|
|
|
goto STAGE_1;
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
options = OBJ_NEW(opal_crs_base_ckpt_options_t);
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2009-05-09 00:51:13 +04:00
|
|
|
"App) notify_response: Stage 1..."));
|
2009-09-22 22:26:12 +04:00
|
|
|
if( ORTE_SUCCESS != (ret = app_notify_resp_stage_1(resp, options) ) ) {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
2008-05-08 22:47:47 +04:00
|
|
|
exit_status = ret;
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
2007-10-09 00:53:02 +04:00
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
/*
|
|
|
|
* If this is a split checkpoint operation then we only need to do stage_1,
|
|
|
|
* but we need to keep the name pipe open for the end();
|
|
|
|
*/
|
|
|
|
if( app_split_ckpt ) {
|
|
|
|
app_notif_processed = true;
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2008-09-22 20:49:56 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Start checkpoint..."));
|
2007-10-09 00:53:02 +04:00
|
|
|
STAGE_1:
|
|
|
|
opal_cr_currently_stalled = false;
|
|
|
|
|
|
|
|
app_pid = getpid();
|
2009-04-30 20:55:39 +04:00
|
|
|
if( orte_snapc_full_skip_app ) {
|
2009-05-05 20:45:49 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((2, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Skipping App. (%d)\n",
|
|
|
|
getpid()));
|
2009-04-30 20:55:39 +04:00
|
|
|
ret = ORTE_SUCCESS;
|
|
|
|
cr_state = OPAL_CRS_CONTINUE;
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
2009-09-22 22:26:12 +04:00
|
|
|
else {
|
|
|
|
/*
|
|
|
|
* INC: Prepare stack using the registered coordination routine
|
|
|
|
*/
|
|
|
|
if(OPAL_SUCCESS != (ret = opal_cr_inc_core_prep() ) ) {
|
|
|
|
if( OPAL_EXISTS == ret ) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",
|
|
|
|
getpid()));
|
|
|
|
opal_cr_currently_stalled = true;
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Error: checkpoint notification failed. %d\n", ret);
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
exit_status = ret;
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* INC: Take the checkpoint
|
|
|
|
*/
|
|
|
|
ret = opal_cr_inc_core_ckpt(app_pid, local_snapshot, options, &cr_state);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tell Local Coordinator that we are done with local checkpoint
|
|
|
|
* (only if not restarting, on restart we are not attached to the Local
|
|
|
|
* Coordinator. )
|
|
|
|
*/
|
|
|
|
if( OPAL_CRS_RESTART != cr_state ) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Stage 2..."));
|
|
|
|
if( ORTE_SUCCESS != (ret = app_notify_resp_stage_2(cr_state) ) ) {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
exit_status = ret;
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* INC: Recover stack using the registered coordination routine
|
|
|
|
*/
|
|
|
|
if( OPAL_SUCCESS != (ret = opal_cr_inc_core_recover(cr_state)) ) {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
exit_status = ret;
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
2009-09-22 22:26:12 +04:00
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/* Don't stall any longer */
|
|
|
|
opal_cr_stall_check = false;
|
|
|
|
|
|
|
|
if(OPAL_CRS_RESTART == cr_state) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Restarting...(%d)\n",
|
|
|
|
getpid()));
|
2007-10-09 00:53:02 +04:00
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
options->term = false;
|
2007-10-09 00:53:02 +04:00
|
|
|
/* Do not respond to the non-existent command line tool */
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
|
|
|
else if(cr_state == OPAL_CRS_CONTINUE) {
|
2008-09-22 20:49:56 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Continuing...(%d)\n",
|
|
|
|
getpid()));
|
2009-09-22 22:26:12 +04:00
|
|
|
; /* Don't need to do anything here */
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
|
|
|
else if(cr_state == OPAL_CRS_TERM ) {
|
|
|
|
; /* Don't need to do anything here */
|
|
|
|
}
|
|
|
|
else {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Unknown cr_state(%d) [%d]",
|
|
|
|
cr_state, getpid()));
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
2009-05-09 00:51:13 +04:00
|
|
|
|
|
|
|
ckpt_cleanup:
|
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Stage 3..."));
|
|
|
|
if( ORTE_SUCCESS != (ret = app_notify_resp_stage_3(cr_state) )) {
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2009-05-09 00:51:13 +04:00
|
|
|
exit_status = ret;
|
|
|
|
goto ckpt_cleanup;
|
|
|
|
}
|
2007-10-09 00:53:02 +04:00
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
if( options->term ) {
|
2009-05-09 00:51:13 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: User has asked to terminate the application"));
|
|
|
|
exit(ORTE_SUCCESS);
|
|
|
|
}
|
2009-09-22 22:26:12 +04:00
|
|
|
|
|
|
|
if( NULL != options ) {
|
|
|
|
OBJ_RELEASE(options);
|
|
|
|
options = NULL;
|
|
|
|
}
|
|
|
|
|
2009-05-09 00:51:13 +04:00
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
static int app_notify_resp_stage_1(opal_cr_ckpt_cmd_state_t resp,
|
|
|
|
opal_crs_base_ckpt_options_t *options)
|
2009-05-09 00:51:13 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
OPAL_CR_CLEAR_TIMERS();
|
|
|
|
opal_cr_timing_my_rank = ORTE_PROC_MY_NAME->vpid;
|
|
|
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Open communication channels
|
|
|
|
*/
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Open Communication Channels."));
|
|
|
|
if (ORTE_SUCCESS != (ret = snapc_full_app_notify_reopen_files())) {
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initial Handshake
|
|
|
|
*/
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Initial Handshake."));
|
2009-09-22 22:26:12 +04:00
|
|
|
if( ORTE_SUCCESS != (ret = snapc_full_app_ckpt_handshake_start(options, resp) ) ) {
|
2009-05-09 00:51:13 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Begin checkpoint
|
|
|
|
* - Init the checkpoint metadata file
|
|
|
|
*/
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Init checkpoint directory..."));
|
|
|
|
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(local_snapshot) ) ) {
|
|
|
|
opal_output(0, "App) Error: Unable to initalize the snapshot directory!\n");
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY2);
|
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Start checkpoint..."));
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int app_notify_resp_stage_2(int cr_state )
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/*
|
|
|
|
* Final Handshake
|
|
|
|
*/
|
2008-10-16 19:09:00 +04:00
|
|
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY3);
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Waiting for final handshake."));
|
2007-10-09 00:53:02 +04:00
|
|
|
if( ORTE_SUCCESS != (ret = snapc_full_app_ckpt_handshake_end(cr_state ) ) ) {
|
2009-05-09 00:51:13 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
2009-05-09 00:51:13 +04:00
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Final Handshake complete."));
|
2007-10-09 00:53:02 +04:00
|
|
|
|
2009-05-09 00:51:13 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int app_notify_resp_stage_3(int cr_state)
|
|
|
|
{
|
2009-09-22 22:26:12 +04:00
|
|
|
if( 0 <= app_comm_pipe_r_fd ) {
|
|
|
|
close(app_comm_pipe_r_fd);
|
|
|
|
app_comm_pipe_r_fd = -1;
|
|
|
|
}
|
|
|
|
if( 0 <= app_comm_pipe_w_fd ) {
|
|
|
|
close(app_comm_pipe_w_fd);
|
|
|
|
app_comm_pipe_w_fd = -1;
|
|
|
|
}
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
remove(app_comm_pipe_r);
|
|
|
|
remove(app_comm_pipe_w);
|
2009-09-22 22:26:12 +04:00
|
|
|
|
2008-09-22 20:49:56 +04:00
|
|
|
app_comm_pipe_r_fd = -1;
|
|
|
|
app_comm_pipe_w_fd = -1;
|
2009-05-09 00:51:13 +04:00
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/* Prepare to wait for another checkpoint action */
|
2008-05-08 22:47:47 +04:00
|
|
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
|
|
|
|
opal_cr_currently_stalled = false;
|
2007-10-09 00:53:02 +04:00
|
|
|
|
2008-10-16 19:09:00 +04:00
|
|
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY4);
|
|
|
|
if(OPAL_CRS_RESTART != cr_state) {
|
|
|
|
OPAL_CR_DISPLAY_ALL_TIMERS();
|
|
|
|
}
|
|
|
|
|
2009-05-09 00:51:13 +04:00
|
|
|
return ORTE_SUCCESS;
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int snapc_full_app_notify_reopen_files(void)
|
|
|
|
{
|
|
|
|
int ret = OPAL_ERR_NOT_IMPLEMENTED;
|
|
|
|
|
|
|
|
#ifndef HAVE_MKFIFO
|
|
|
|
return ret;
|
|
|
|
#else
|
|
|
|
#ifdef __WINDOWS__
|
|
|
|
return ret;
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* Open up the read pipe
|
|
|
|
*/
|
|
|
|
if( (ret = mkfifo(app_comm_pipe_r, 0660)) < 0) {
|
|
|
|
if(EEXIST == ret || -1 == ret ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_reopen_files: mkfifo failed because file (%s) already exists, attempting to use this pipe. (%d)",
|
|
|
|
app_comm_pipe_r, ret));
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
|
|
|
else {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_reopen_files: Error: mkfifo failed to make named pipe (%s). (%d)\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
app_comm_pipe_r_fd = open(app_comm_pipe_r, O_RDWR);
|
|
|
|
if(app_comm_pipe_r_fd < 0) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) init: Error: open failed to open the named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, app_comm_pipe_r_fd);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Open up the write pipe
|
|
|
|
*/
|
|
|
|
if( (ret = mkfifo(app_comm_pipe_w, 0660)) < 0) {
|
|
|
|
if(EEXIST == ret || -1 == ret ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_reopen_files: mkfifo failed because file (%s) already exists, attempting to use this pipe. (%d)",
|
|
|
|
app_comm_pipe_w, ret));
|
2007-10-09 00:53:02 +04:00
|
|
|
}
|
|
|
|
else {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_reopen_files: Error: mkfifo failed to make named pipe (%s). (%d)\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_w, ret);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
app_comm_pipe_w_fd = open(app_comm_pipe_w, O_WRONLY);
|
|
|
|
if(app_comm_pipe_w_fd < 0) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_reopen_files: Error: open failed to open the named pipe (%s). (%d)\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_w, app_comm_pipe_w_fd);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
#endif /* __WINDOWS__ */
|
|
|
|
#endif /* HAVE_MKFIFO */
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
static int snapc_full_app_ckpt_handshake_start(opal_crs_base_ckpt_options_t *options,
|
|
|
|
opal_cr_ckpt_cmd_state_t resp)
|
2007-10-09 00:53:02 +04:00
|
|
|
{
|
|
|
|
int ret, exit_status = ORTE_SUCCESS;
|
2009-09-22 22:26:12 +04:00
|
|
|
int len = 0, tmp_resp, opt_rep;
|
2007-10-09 00:53:02 +04:00
|
|
|
char *tmp_str = NULL;
|
|
|
|
ssize_t tmp_size = 0;
|
|
|
|
|
|
|
|
/*
|
2009-09-22 22:26:12 +04:00
|
|
|
* Get the initial handshake command:
|
|
|
|
* - Term argument
|
|
|
|
* - Stop argument
|
2007-10-09 00:53:02 +04:00
|
|
|
*/
|
2009-09-22 22:26:12 +04:00
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &opt_rep, sizeof(int))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2009-09-22 22:26:12 +04:00
|
|
|
"App) notify_response: Error: Unable to read the 'term' from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
2009-09-22 22:26:12 +04:00
|
|
|
options->term = OPAL_INT_TO_BOOL(opt_rep);
|
|
|
|
|
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &opt_rep, sizeof(int))) ) {
|
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Error: Unable to read the 'stop' from named pipe (%s). %d\n",
|
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
options->stop = OPAL_INT_TO_BOOL(opt_rep);
|
2007-10-09 00:53:02 +04:00
|
|
|
|
|
|
|
tmp_resp = (int)resp;
|
|
|
|
if( sizeof(int) != (ret = write(app_comm_pipe_w_fd, &tmp_resp, sizeof(int)) ) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: %d: Error: Unable to write to pipe (%s) ret = %d [Line %d]\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
tmp_resp, app_comm_pipe_w, ret, __LINE__);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Respond that the checkpoint is currently in progress
|
|
|
|
*/
|
|
|
|
if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == resp ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Checkpoint in progress, cannot start (%d)",
|
|
|
|
getpid()));
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Respond that the application is unable to be checkpointed
|
|
|
|
*/
|
|
|
|
else if( OPAL_CHECKPOINT_CMD_NULL == resp ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Non-checkpointable application, cannot start (%d)",
|
|
|
|
getpid()));
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Respond that some error has occurred such that the application is
|
|
|
|
* not able to be checkpointed
|
|
|
|
*/
|
|
|
|
else if( OPAL_CHECKPOINT_CMD_ERROR == resp ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Error generated, cannot start (%d)",
|
|
|
|
getpid()));
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Respond signalng that we wish to respond to this request
|
|
|
|
*/
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Starting checkpoint request (%d)",
|
|
|
|
getpid()));
|
2007-10-09 00:53:02 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get Snapshot Handle argument
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &len, sizeof(int))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Error: Unable to read the snapshot_handle len from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
tmp_size = sizeof(char) * len;
|
|
|
|
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
|
|
if( tmp_size != (ret = read(app_comm_pipe_r_fd, tmp_str, (sizeof(char) * len))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Error: Unable to read the snapshot_handle from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If they didn't send anything of meaning then use the defaults
|
|
|
|
*/
|
|
|
|
local_snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
|
|
|
|
|
|
|
if( 1 < strlen(tmp_str) ) {
|
|
|
|
if( NULL != local_snapshot->reference_name)
|
|
|
|
free( local_snapshot->reference_name );
|
|
|
|
local_snapshot->reference_name = strdup(tmp_str);
|
|
|
|
|
|
|
|
if( NULL != local_snapshot->local_location )
|
|
|
|
free( local_snapshot->local_location );
|
|
|
|
local_snapshot->local_location = opal_crs_base_get_snapshot_directory(local_snapshot->reference_name);
|
|
|
|
|
|
|
|
if( NULL != local_snapshot->remote_location )
|
|
|
|
free( local_snapshot->remote_location );
|
|
|
|
local_snapshot->remote_location = strdup(local_snapshot->local_location);
|
|
|
|
}
|
|
|
|
if( NULL != tmp_str ) {
|
|
|
|
free(tmp_str);
|
|
|
|
tmp_str = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get Snapshot location argument
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &len, sizeof(int))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Error: Unable to read the snapshot_location len from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
|
|
tmp_size = sizeof(char) * len;
|
|
|
|
if( tmp_size != (ret = read(app_comm_pipe_r_fd, tmp_str, (sizeof(char) * len))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2008-02-28 04:57:57 +03:00
|
|
|
"App) notify_response: Error: Unable to read the snapshot_location from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If they didn't send anything of meaning then use the defaults
|
|
|
|
*/
|
|
|
|
if( 1 < strlen(tmp_str) ) {
|
|
|
|
if( NULL != local_snapshot->local_location)
|
|
|
|
free( local_snapshot->local_location );
|
|
|
|
asprintf(&(local_snapshot->local_location), "%s/%s", tmp_str, local_snapshot->reference_name);
|
|
|
|
|
|
|
|
if( NULL != local_snapshot->remote_location)
|
|
|
|
free( local_snapshot->remote_location );
|
|
|
|
local_snapshot->remote_location = strdup(local_snapshot->local_location);
|
|
|
|
}
|
2009-05-09 00:51:13 +04:00
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
if( NULL != tmp_str ) {
|
|
|
|
free(tmp_str);
|
|
|
|
tmp_str = NULL;
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
/*
|
|
|
|
* Get Global Snapshot Ref
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &len, sizeof(int))) ) {
|
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Error: Unable to read the global snapshot ref len from named pipe (%s). %d\n",
|
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
|
|
tmp_size = sizeof(char) * len;
|
|
|
|
if( tmp_size != (ret = read(app_comm_pipe_r_fd, tmp_str, (sizeof(char) * len))) ) {
|
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Error: Unable to read the global snapshot ref from named pipe (%s). %d\n",
|
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
if( NULL != app_cur_global_ref ) {
|
|
|
|
free(app_cur_global_ref);
|
|
|
|
app_cur_global_ref = NULL;
|
|
|
|
}
|
|
|
|
app_cur_global_ref = strdup(tmp_str);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the Seq. Number
|
|
|
|
*/
|
|
|
|
if( sizeof(size_t) != (ret = read(app_comm_pipe_r_fd, &tmp_size, sizeof(size_t))) ) {
|
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) notify_response: Error: Unable to read the global snapshot seq number from named pipe (%s). %d\n",
|
|
|
|
app_comm_pipe_r, ret);
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
app_cur_epoch = (int)tmp_size;
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
cleanup:
|
|
|
|
if( NULL != tmp_str ) {
|
|
|
|
free(tmp_str);
|
|
|
|
tmp_str = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int snapc_full_app_ckpt_handshake_end(int cr_state)
|
|
|
|
{
|
|
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
int last_cmd = 0;
|
2009-09-22 22:26:12 +04:00
|
|
|
int err;
|
2007-10-09 00:53:02 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the final checkpoint state to the local coordinator
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = write(app_comm_pipe_w_fd, &cr_state, sizeof(int))) ) {
|
2009-09-22 22:26:12 +04:00
|
|
|
err = errno;
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2009-09-22 22:26:12 +04:00
|
|
|
"App) notify_response: Error: Unable to write cr_state to named pipe (%s). %d/%d/%s\n",
|
|
|
|
app_comm_pipe_w, ret, err, strerror(err));
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
exit_status = ret;
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) handshake_end: Waiting for release (%d)",
|
|
|
|
getpid()));
|
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
/*
|
|
|
|
* Wait for the local coordinator to release us
|
|
|
|
*/
|
|
|
|
if( sizeof(int) != (ret = read(app_comm_pipe_r_fd, &last_cmd, sizeof(int))) ) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(mca_snapc_full_component.super.output_handle,
|
2009-09-22 22:26:12 +04:00
|
|
|
"App) notify_response: Error: Unable to read the 'last_cmd' from named pipe (%s). %d\n",
|
2007-10-09 00:53:02 +04:00
|
|
|
app_comm_pipe_r, ret);
|
2009-09-22 22:26:12 +04:00
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
exit_status = ret;
|
2007-10-09 00:53:02 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2009-09-22 22:26:12 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) handshake_end: Released... (%d)",
|
|
|
|
getpid()));
|
2007-12-13 17:37:17 +03:00
|
|
|
|
2007-10-09 00:53:02 +04:00
|
|
|
cleanup:
|
|
|
|
return exit_status;
|
|
|
|
}
|
2008-09-22 20:49:56 +04:00
|
|
|
|
|
|
|
int app_coord_ft_event(int state) {
|
|
|
|
int exit_status = ORTE_SUCCESS;
|
2009-09-22 22:26:12 +04:00
|
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) In ft_event(%d)", state));
|
2008-09-22 20:49:56 +04:00
|
|
|
|
|
|
|
/******** Checkpoint Prep ********/
|
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
|
|
|
; /* Nothing */
|
|
|
|
}
|
|
|
|
/******** Continue Recovery ********/
|
|
|
|
else if (OPAL_CRS_CONTINUE == state ) {
|
|
|
|
; /* Nothing */
|
|
|
|
}
|
|
|
|
/******** Restart Pre-Recovery ********/
|
|
|
|
else if (OPAL_CRS_RESTART_PRE == state ) {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
/******** Restart Recovery ********/
|
|
|
|
else if (OPAL_CRS_RESTART == state ) {
|
2009-09-22 22:26:12 +04:00
|
|
|
; /* Nothing */
|
2008-09-22 20:49:56 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
|
|
|
|
"App) Initalized for Application %s (Restart)\n",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
2009-09-22 22:26:12 +04:00
|
|
|
}
|
|
|
|
/******** Termination ********/
|
|
|
|
else if (OPAL_CRS_TERM == state ) {
|
|
|
|
; /* Nothing */
|
|
|
|
}
|
|
|
|
/******** Error State ********/
|
|
|
|
else {
|
|
|
|
; /* Nothing */
|
|
|
|
}
|
|
|
|
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Begin a split (application-driven) checkpoint.
 *
 * Marks the checkpoint as split. The process with vpid 0 sends a
 * START_CKPT command plus its jobid to the HNP. Every process then
 * drives progress until a checkpoint epoch has been assigned
 * (app_cur_epoch >= 0) and the notification has been processed. Finally
 * the quiesce datum is filled in and the INC stack is prepared.
 *
 * @param datum  Quiesce descriptor; on return its epoch, handle and
 *               target_dir fields are set. handle and target_dir are
 *               heap allocated here (asprintf/strdup).
 *
 * @return ORTE_SUCCESS on success; ORTE_ERROR if the HNP message could
 *         not be packed/sent or a string allocation failed; the error
 *         code of opal_cr_inc_core_prep() if stack preparation failed.
 */
int app_coord_start_ckpt(orte_snapc_base_quiesce_t *datum)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_START_CKPT_CMD;
    opal_buffer_t buffer;

    /*
     * Identify this as a split checkpoint
     */
    app_split_ckpt = true;

    /*
     * Rank 0: Contact HNP to start checkpoint
     * Rank *: Wait for HNP to xcast epoch
     */
    if( 0 == ORTE_PROC_MY_NAME->vpid ) {
        /*
         * Send request to HNP
         */
        OBJ_CONSTRUCT(&buffer, opal_buffer_t);

        if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&buffer);
            return ORTE_ERROR;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&buffer);
            return ORTE_ERROR;
        }

        if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&buffer);
            return ORTE_ERROR;
        }

        OBJ_DESTRUCT(&buffer);
    }

    /* Spin on progress until the epoch is known and the notification handled */
    while( app_cur_epoch < 0 || !app_notif_processed ) {
        opal_progress();
        opal_event_loop(OPAL_EVLOOP_NONBLOCK);
        OPAL_CR_TEST_CHECKPOINT_READY();
    }

    datum->epoch = app_cur_epoch;

    /* On failure asprintf leaves the output pointer undefined: reset it */
    if( 0 > asprintf(&(datum->handle), "[%s:%s:%d]", app_cur_global_ref, local_snapshot->reference_name, app_cur_epoch) ) {
        datum->handle = NULL;
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    datum->target_dir = strdup(local_snapshot->local_location);
    if( NULL == datum->target_dir ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    /*
     * INC: Prepare the stack
     */
    if(OPAL_SUCCESS != (ret = opal_cr_inc_core_prep() ) ) {
        ORTE_ERROR_LOG(ret);
        /* Remember the failure so it is reported to the caller below */
        exit_status = ret;
    }

    opal_cr_checkpointing_state = OPAL_CR_STATUS_RUNNING;

    return exit_status;
}
|
|
|
|
|
|
|
|
/*
 * Complete a split (application-driven) checkpoint.
 *
 * Chooses the recovery state from datum->restarting, recovers the INC
 * stack, and runs the notify-response stages. On the continue path the
 * process with vpid 0 additionally sends an END_CKPT command, its jobid
 * and the checkpoint epoch to the HNP, after which the per-checkpoint
 * epoch bookkeeping is reset. Both paths clear the split-checkpoint
 * flags before returning.
 *
 * @param datum  Quiesce descriptor; cr_state is written, restarting and
 *               epoch are read.
 *
 * @return ORTE_SUCCESS on success; the failing call's code if stack
 *         recovery or a notify-response stage fails; ORTE_ERROR if the
 *         HNP message could not be packed or sent.
 */
int app_coord_end_ckpt(orte_snapc_base_quiesce_t *datum)
{
    int rc;
    orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_END_CKPT_CMD;
    opal_buffer_t buffer;

    datum->cr_state = datum->restarting ? OPAL_CRS_RESTART : OPAL_CRS_CONTINUE;

    /*
     * INC: Recover the stack
     */
    rc = opal_cr_inc_core_recover(datum->cr_state);
    if( OPAL_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Anything other than a plain continue only needs the final stage */
    if( OPAL_CRS_CONTINUE != datum->cr_state ) {
        rc = app_notify_resp_stage_3(datum->cr_state);
        if( ORTE_SUCCESS != rc ) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        goto cleanup;
    }

    rc = app_notify_resp_stage_2(datum->cr_state);
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = app_notify_resp_stage_3(datum->cr_state);
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Rank 0: Contact HNP to let them know we are done
     * Then return to application
     */
    if( 0 == ORTE_PROC_MY_NAME->vpid ) {
        /*
         * Send request to HNP
         */
        OBJ_CONSTRUCT(&buffer, opal_buffer_t);

        rc = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_FULL_CMD);
        if( ORTE_SUCCESS != rc ) {
            ORTE_ERROR_LOG(rc);
            goto abort_send;
        }

        rc = opal_dss.pack(&buffer, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID);
        if( ORTE_SUCCESS != rc ) {
            ORTE_ERROR_LOG(rc);
            goto abort_send;
        }

        rc = opal_dss.pack(&buffer, &(datum->epoch), 1, OPAL_INT);
        if( ORTE_SUCCESS != rc ) {
            ORTE_ERROR_LOG(rc);
            goto abort_send;
        }

        rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0);
        if( 0 > rc ) {
            ORTE_ERROR_LOG(rc);
            goto abort_send;
        }

        OBJ_DESTRUCT(&buffer);
    }

    /* Retire the finished epoch and drop its global reference string */
    app_last_epoch = datum->epoch;
    app_cur_epoch  = -1;
    free(app_cur_global_ref);   /* free(NULL) is a no-op */
    app_cur_global_ref = NULL;

 cleanup:
    /*
     * Split checkpoint complete
     */
    app_split_ckpt = false;
    app_notif_processed = false;

    return ORTE_SUCCESS;

 abort_send:
    /* Shared exit for a failed HNP message: release the buffer and bail */
    OBJ_DESTRUCT(&buffer);
    return ORTE_ERROR;
}
|