39a6057fc6
* General TCP cleanup for OPAL / ORTE * Simplifying the OOB by moving much of the logic into the RML * Allowing the OOB RML component to do routing of messages * Adding a component framework for handling routing tables * Moving the xcast functionality from the OOB base to its own framework Includes merge from tmp/bwb-oob-rml-merge revisions: r15506, r15507, r15508, r15510, r15511, r15512, r15513 This commit was SVN r15528. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r15506 r15507 r15508 r15510 r15511 r15512 r15513
869 строки
27 KiB
C
869 строки
27 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* ORTE Checkpoint Tool for checkpointing a multiprocess job
|
|
*
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include <stdio.h>
|
|
#include <errno.h>
|
|
#ifdef HAVE_STDLIB_H
|
|
#include <stdlib.h>
|
|
#endif /* HAVE_STDLIB_H */
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#ifdef HAVE_FCNTL_H
|
|
#include <fcntl.h>
|
|
#endif /* HAVE_FCNTL_H */
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h> /* for mkfifo */
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
#include <sys/wait.h>
|
|
#endif /* HAVE_SYS_WAIT_H */
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif /* HAVE_STRING_H */
|
|
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "opal/util/os_dirpath.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
#include "opal/runtime/opal.h"
|
|
#include "opal/runtime/opal_cr.h"
|
|
|
|
#include "orte/orte_constants.h"
|
|
#include "orte/runtime/runtime.h"
|
|
#include "orte/runtime/orte_cr.h"
|
|
#include "orte/mca/gpr/gpr.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/dss/dss.h"
|
|
#include "orte/util/univ_info.h"
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/mca/snapc/snapc.h"
|
|
#include "orte/mca/snapc/base/base.h"
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
static int ckpt_init(int argc, char *argv[]); /* Initalization routine */
|
|
static int ckpt_finalize(void); /* Finalization routine */
|
|
static int parse_args(int argc, char *argv[]);
|
|
static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term);
|
|
static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term);
|
|
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num);
|
|
static int find_universe(void);
|
|
static int pretty_print_status(int state, char * snapshot_ref);
|
|
static int pretty_print_reference(int seq, char * snapshot_ref);
|
|
|
|
/*****************************************
|
|
* Global Vars for Command line Arguments
|
|
*****************************************/
|
|
enum orte_checkpoint_stage_t {
|
|
ORTE_CKPT_STAGE_NULL,
|
|
ORTE_CKPT_STAGE_INIT_OPAL_UTIL,
|
|
ORTE_CKPT_STAGE_INIT_OPAL,
|
|
ORTE_CKPT_STAGE_INIT_ORTE,
|
|
ORTE_CKPT_STAGE_FINALIZE
|
|
|
|
};
|
|
typedef enum orte_checkpoint_stage_t orte_checkpoint_stage_t;
|
|
|
|
typedef struct {
|
|
bool help;
|
|
int pid;
|
|
bool term;
|
|
bool verbose;
|
|
char *req_universe_name; /**< User Requested Universe */
|
|
int stage; /* Has completed init fully */
|
|
bool nowait; /* Do not wait for checkpoint to complete before returning */
|
|
bool status; /* Display status messages while checkpoint is progressing */
|
|
int output;
|
|
int ckpt_status;
|
|
} orte_checkpoint_globals_t;
|
|
|
|
orte_checkpoint_globals_t orte_checkpoint_globals;
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
{ NULL, NULL, NULL,
|
|
'h', NULL, "help",
|
|
0,
|
|
&orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"This help message" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'v', NULL, "verbose",
|
|
0,
|
|
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Be Verbose" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'\0', NULL, "term",
|
|
0,
|
|
&orte_checkpoint_globals.term, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Terminate the application after checkpoint" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'w', NULL, "nowait",
|
|
0,
|
|
&orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Do not wait for the application to finish checkpointing before returning" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
's', NULL, "status",
|
|
0,
|
|
&orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Display status messages describing the progression of the checkpoint" },
|
|
|
|
{ "universe", NULL, NULL,
|
|
'\0', NULL, "universe",
|
|
1,
|
|
&orte_checkpoint_globals.req_universe_name, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Set the universe name as username@hostname:universe_name for this "
|
|
"application. This should be the universe of the application that you wish "
|
|
"to checkpoint." },
|
|
|
|
/* End of list */
|
|
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
NULL }
|
|
};
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *global_snapshot_handle;
|
|
int seq_num = -1;
|
|
|
|
/***************
|
|
* Initialize
|
|
***************/
|
|
if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*******************************
|
|
* Checkpoint the requested PID
|
|
*******************************/
|
|
if( orte_checkpoint_globals.verbose ) {
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"orte_checkpoint: Checkpointing...");
|
|
if(0 < orte_checkpoint_globals.pid) {
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"\t PID %d",
|
|
orte_checkpoint_globals.pid);
|
|
}
|
|
|
|
if(NULL != orte_checkpoint_globals.req_universe_name) {
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"\t Universe (%s)",
|
|
orte_checkpoint_globals.req_universe_name);
|
|
}
|
|
else {
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"\t Connected to Universe (%s)",
|
|
orte_universe_info.name);
|
|
}
|
|
|
|
if(orte_checkpoint_globals.term) {
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"\t Terminating after checkpoint\n");
|
|
}
|
|
}
|
|
|
|
if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint(&global_snapshot_handle,
|
|
&seq_num,
|
|
orte_checkpoint_globals.term)) ) {
|
|
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
|
|
orte_checkpoint_globals.pid, ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) {
|
|
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
|
|
orte_checkpoint_globals.pid, ORTE_ERROR);
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( orte_checkpoint_globals.status ) {
|
|
pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle);
|
|
}
|
|
|
|
if(!orte_checkpoint_globals.nowait) {
|
|
pretty_print_reference(seq_num, global_snapshot_handle);
|
|
}
|
|
|
|
cleanup:
|
|
/***************
|
|
* Cleanup
|
|
***************/
|
|
if (ORTE_SUCCESS != (ret = ckpt_finalize())) {
|
|
return ret;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int parse_args(int argc, char *argv[]) {
|
|
int i, ret, len, exit_status = ORTE_SUCCESS ;
|
|
opal_cmd_line_t cmd_line;
|
|
char **app_env = NULL, **global_env = NULL;
|
|
|
|
/* Init structure */
|
|
memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
|
|
orte_checkpoint_globals.help = false;
|
|
orte_checkpoint_globals.pid = -1;
|
|
orte_checkpoint_globals.term = false;
|
|
orte_checkpoint_globals.verbose = false;
|
|
orte_checkpoint_globals.req_universe_name = NULL;
|
|
orte_checkpoint_globals.stage = ORTE_CKPT_STAGE_NULL;
|
|
orte_checkpoint_globals.nowait = false;
|
|
orte_checkpoint_globals.status = false;
|
|
orte_checkpoint_globals.output = -1;
|
|
orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
|
|
/* Parse the command line options */
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
mca_base_open();
|
|
mca_base_cmd_line_setup(&cmd_line);
|
|
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
|
|
|
/**
|
|
* Put all of the MCA arguments in the environment
|
|
*/
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
len = opal_argv_count(app_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(app_env[i]);
|
|
}
|
|
|
|
len = opal_argv_count(global_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(global_env[i]);
|
|
}
|
|
|
|
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
|
"1",
|
|
true, &environ);
|
|
|
|
/**
|
|
* Now start parsing our specific arguments
|
|
*/
|
|
/* get the remaining bits */
|
|
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
|
|
|
if (OPAL_SUCCESS != ret ||
|
|
orte_checkpoint_globals.help ||
|
|
(0 >= argc && NULL == orte_checkpoint_globals.req_universe_name)) {
|
|
char *args = NULL;
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
opal_show_help("help-orte-checkpoint.txt", "usage", true,
|
|
args);
|
|
free(args);
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* If the user did not supply a universe, then they must
|
|
* supply the PID of MPIRUN
|
|
*/
|
|
if(0 >= argc &&
|
|
NULL != orte_checkpoint_globals.req_universe_name) {
|
|
exit_status = ORTE_SUCCESS;
|
|
goto cleanup;
|
|
}
|
|
|
|
orte_checkpoint_globals.pid = atoi(argv[0]);
|
|
if ( 0 >= orte_checkpoint_globals.pid ) {
|
|
opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
|
|
orte_checkpoint_globals.pid);
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* JJH: No wait is currently not implemented or tested
|
|
*/
|
|
if(orte_checkpoint_globals.nowait) {
|
|
orte_checkpoint_globals.nowait = false;
|
|
opal_show_help("help-orte-checkpoint.txt", "not_impl",
|
|
true,
|
|
"Disconnected checkpoint");
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int
|
|
notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_process_name_t peer;
|
|
|
|
peer = *ORTE_PROC_MY_HNP;
|
|
|
|
/*
|
|
* Contact HNP via RML
|
|
* The notification will be received by the Global Snapshot Coordinator [SnapC]
|
|
* in the HNP(s)
|
|
* See orte_snapc(7) for more information.
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = contact_hnp(&peer, *global_snapshot_handle, term)) ) {
|
|
opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
|
|
orte_checkpoint_globals.pid);
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Wait for the global_snapshot_coordinator to notify us (via the RML)
|
|
* of the completion of the checkpoint.
|
|
* Unless the user wants us to return immediately
|
|
*/
|
|
if(!orte_checkpoint_globals.nowait) {
|
|
/*
|
|
* Wait for progress updates, stop waiting when 'Finished' status
|
|
*/
|
|
do {
|
|
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num) ) ) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* If process said that it cannot checkpoint at this time return a
|
|
* pretty message.
|
|
*/
|
|
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ) {
|
|
opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
|
|
true,
|
|
orte_checkpoint_globals.pid);
|
|
exit_status = ORTE_ERROR;
|
|
break;
|
|
}
|
|
/*
|
|
* If we are to display the status progression
|
|
*/
|
|
if( orte_checkpoint_globals.status ) {
|
|
if(ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status)
|
|
pretty_print_status(orte_checkpoint_globals.ckpt_status, *global_snapshot_handle);
|
|
}
|
|
/*
|
|
* Otherwise only display it if we are going to be terminated soon
|
|
*/
|
|
else {
|
|
/* Since ORTE kills us before we get the Finished message,
|
|
* print out the global snapshot handle when we start running
|
|
*/
|
|
if(orte_checkpoint_globals.term &&
|
|
ORTE_SNAPC_CKPT_STATE_RUNNING == orte_checkpoint_globals.ckpt_status ) {
|
|
pretty_print_status(orte_checkpoint_globals.ckpt_status, *global_snapshot_handle);
|
|
}
|
|
}
|
|
} while(ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status &&
|
|
ORTE_SNAPC_CKPT_STATE_ERROR != orte_checkpoint_globals.ckpt_status );
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
/*
|
|
* This function attempts to:
|
|
* 1. Find the universe that matches one or both of the following:
|
|
* - --universe specified by the user (if any)
|
|
* - PID specified by the user (if any)
|
|
* 2. Attach orte_checkpoint to that universe, so we can talk to
|
|
* it's GPR.
|
|
*/
|
|
static int find_universe(void) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *fulldirpath = NULL,
|
|
*prefix = NULL,
|
|
*frontend = NULL,
|
|
*univ_name = NULL,
|
|
*full_univ = NULL;
|
|
|
|
/*
|
|
* If the user specified a universe, trust it as correct
|
|
*/
|
|
if( NULL != orte_checkpoint_globals.req_universe_name ) {
|
|
exit_status = ORTE_SUCCESS;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Otherwise check to see if we can find it by looking for the
|
|
* default constructed with the PID
|
|
*/
|
|
asprintf(&univ_name, "%s-%d", ORTE_DEFAULT_UNIVERSE, orte_checkpoint_globals.pid);
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_session_dir_get_name(&fulldirpath,
|
|
&prefix,
|
|
&frontend,
|
|
orte_system_info.user,
|
|
orte_system_info.nodename,
|
|
NULL, /* Unknown batchid */
|
|
univ_name,
|
|
NULL, /* Unknown Jobid */
|
|
NULL) /* Unknown process ID */
|
|
)) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"orte_checkpoint: find_universe: Trying to find the session directory\n\t\t(%s)\n",
|
|
fulldirpath);
|
|
|
|
/*
|
|
* Check that the directory is accessable.
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(fulldirpath, 0) )) {
|
|
opal_show_help("help-orte-checkpoint.txt", "no_universe", true,
|
|
orte_checkpoint_globals.pid, fulldirpath);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Set the MCA parameter
|
|
* username@hostname:universe_name
|
|
*/
|
|
asprintf(&full_univ, "%s@%s:%s",
|
|
orte_system_info.user,
|
|
orte_system_info.nodename,
|
|
univ_name);
|
|
opal_setenv(mca_base_param_env_var("universe"),
|
|
full_univ,
|
|
true, &environ);
|
|
|
|
cleanup:
|
|
if(NULL != full_univ)
|
|
free(full_univ);
|
|
if(NULL != fulldirpath)
|
|
free(fulldirpath);
|
|
if(NULL != prefix)
|
|
free(prefix);
|
|
if(NULL != frontend)
|
|
free(frontend);
|
|
if(NULL != univ_name)
|
|
free(univ_name);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int ckpt_init(int argc, char *argv[]) {
|
|
int exit_status = ORTE_SUCCESS, ret;
|
|
|
|
/*
|
|
* Make sure to init util before parse_args
|
|
* to ensure installdirs is setup properly
|
|
* before calling mca_base_open();
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = opal_init_util()) ) {
|
|
return ret;
|
|
}
|
|
orte_checkpoint_globals.stage = ORTE_CKPT_STAGE_INIT_OPAL_UTIL;
|
|
|
|
/*
|
|
* Parse Command Line Arguments
|
|
*/
|
|
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Setup OPAL Output handle from the verbose argument
|
|
*/
|
|
if( orte_checkpoint_globals.verbose ) {
|
|
orte_checkpoint_globals.output = opal_output_open(NULL);
|
|
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
|
|
} else {
|
|
orte_checkpoint_globals.output = 0; /* Default=STDOUT */
|
|
}
|
|
|
|
/*
|
|
* We are trying to attach to another process' GPR so we need to
|
|
* attach no matter if it is identified as private or not.
|
|
*/
|
|
opal_setenv(mca_base_param_env_var("universe_console"),
|
|
"1",
|
|
true, &environ);
|
|
|
|
/* Disable the checkpoint notification routine for this
|
|
* tool. As we will never need to checkpoint this tool.
|
|
* Note: This must happen before opal_init().
|
|
*/
|
|
opal_cr_set_enabled(false);
|
|
|
|
/* Select the none component, since we don't actually use a checkpointer */
|
|
opal_setenv(mca_base_param_env_var("crs"),
|
|
"none",
|
|
true, &environ);
|
|
|
|
/***************************
|
|
* We need all of OPAL
|
|
***************************/
|
|
if (ORTE_SUCCESS != (ret = opal_init())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
orte_checkpoint_globals.stage = ORTE_CKPT_STAGE_INIT_OPAL;
|
|
|
|
/***************************
|
|
* And ORTE, but need to do a bit of a dance first
|
|
***************************/
|
|
/* register handler for errnum -> string converstion */
|
|
opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);
|
|
|
|
/* Register all MCA Params */
|
|
if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the system_info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the process info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/***************************
|
|
* Find the universe that we need to connect to, if it exists
|
|
***************************/
|
|
if (ORTE_SUCCESS != (ret = find_universe())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/* JJH XXX
|
|
* JJH XXX In actuality, we only want to setup upto the RML
|
|
* JJH XXX so we can talk to the HNP over the wire, but don't
|
|
* JJH XXX become a job of the universe.
|
|
* JJH XXX This is a bandaid until we do it right.
|
|
* JJH XXX
|
|
*/
|
|
if (ORTE_SUCCESS != (ret = orte_system_init(ORTE_INFRASTRUCTURE, ORTE_NON_BARRIER)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
orte_checkpoint_globals.stage = ORTE_CKPT_STAGE_INIT_ORTE;
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int ckpt_finalize(void) {
|
|
int exit_status = ORTE_SUCCESS, ret;
|
|
|
|
if( ORTE_CKPT_STAGE_INIT_OPAL_UTIL == orte_checkpoint_globals.stage) {
|
|
if (ORTE_SUCCESS != (ret = opal_finalize_util())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
else if( ORTE_CKPT_STAGE_INIT_OPAL == orte_checkpoint_globals.stage) {
|
|
if (ORTE_SUCCESS != (ret = opal_finalize())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
else {
|
|
if (ORTE_SUCCESS != (ret = orte_finalize())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_buffer_t *buffer;
|
|
size_t command;
|
|
pid_t hnp_pid;
|
|
orte_std_cntr_t n;
|
|
/* JJH XXX currently we assume jobid = 1, don't do this in the future */
|
|
orte_jobid_t jobid = 1;
|
|
|
|
if (NULL == (buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/***********************************
|
|
* Notify HNP that we want to chat about a checkpoint
|
|
***********************************/
|
|
command = ORTE_SNAPC_GLOBAL_INIT_CMD;
|
|
if (ORTE_SUCCESS != (ret = orte_dss.pack(buffer, &command, 1, ORTE_CKPT_CMD)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if ( 0 > (ret = orte_rml.send_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/********************
|
|
* Receive the PID of the HNP, just to be doubly sure we are talking to
|
|
* the right HNP.
|
|
********************/
|
|
if( 0 > (ret = orte_rml.recv_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
n = 1;
|
|
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &hnp_pid, &n, ORTE_PID)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"orte_checkpoint: contact_hnp: Head Node Process PID = %d\n",
|
|
hnp_pid);
|
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
|
"orte_checkpoint: contact_hnp: Expected PID = %d\n",
|
|
orte_checkpoint_globals.pid);
|
|
|
|
if(hnp_pid != orte_checkpoint_globals.pid) {
|
|
opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
|
|
orte_checkpoint_globals.pid);
|
|
exit_status = ORTE_ERROR;
|
|
orte_snapc_base_global_coord_send_ack(peer, false);
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/**************************
|
|
* Send over the term flag
|
|
**************************/
|
|
OBJ_RELEASE(buffer);
|
|
if (NULL == (buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_dss.pack(buffer, &term, 1, ORTE_BOOL))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (0 > (ret = orte_rml.send_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/**************************
|
|
* Send over the jobid flag
|
|
**************************/
|
|
OBJ_RELEASE(buffer);
|
|
if (NULL == (buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_dss.pack(buffer, &jobid, 1, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (0 > (ret = orte_rml.send_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
OBJ_RELEASE(buffer);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_buffer_t *loc_buffer;
|
|
orte_std_cntr_t n = 1;
|
|
size_t str_len = 0;
|
|
int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
|
|
if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
/******************
|
|
* Receive the checkpoint status
|
|
******************/
|
|
OBJ_RELEASE(loc_buffer);
|
|
if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
n = 1;
|
|
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, &ckpt_status, &n, ORTE_INT)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
orte_checkpoint_globals.ckpt_status = ckpt_status;
|
|
|
|
/* ACK */
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* If we cannot checkpoint, then just skip to the end */
|
|
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == ckpt_status) {
|
|
*global_snapshot_handle = NULL;
|
|
goto cleanup;
|
|
}
|
|
|
|
/******************
|
|
* Receive the size of the global snapshot handle
|
|
******************/
|
|
OBJ_RELEASE(loc_buffer);
|
|
if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
n = 1;
|
|
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, &str_len, &n, ORTE_SIZE)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* ACK */
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/******************
|
|
* Receive the global snapshot handle
|
|
******************/
|
|
OBJ_RELEASE(loc_buffer);
|
|
if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
n = 1;
|
|
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, global_snapshot_handle, &n, ORTE_STRING)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
n = 1;
|
|
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, seq_num, &n, ORTE_INT)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* ACK */
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
OBJ_RELEASE(loc_buffer);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int pretty_print_status(int state, char * snapshot_ref) {
|
|
char * state_str = NULL;
|
|
|
|
state_str = orte_snapc_ckpt_state_str(state);
|
|
|
|
opal_output(orte_checkpoint_globals.output,
|
|
"%*s - Global Snapshot Reference: %s\n",
|
|
25, state_str, snapshot_ref);
|
|
if( NULL != state_str) {
|
|
free(state_str);
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int pretty_print_reference(int seq, char * snapshot_ref) {
|
|
|
|
printf("Snapshot Ref.: %3d %s\n",
|
|
seq, snapshot_ref);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|