1
1

Merge pull request #4223 from rhc54/topic/stale

Remove stale tools
Этот коммит содержится в:
Ralph Castain 2017-09-18 09:43:06 -07:00 коммит произвёл GitHub
родитель 252be7ffb0 ed508010b4
Коммит 08c93091f7
15 изменённых файлов: 0 добавлений и 3416 удалений

Просмотреть файл

@ -25,12 +25,9 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
orte/tools/wrappers/Makefile
orte/tools/wrappers/ortecc-wrapper-data.txt
orte/tools/wrappers/orte.pc
orte/tools/orte-checkpoint/Makefile
orte/tools/orte-restart/Makefile
orte/tools/orte-ps/Makefile
orte/tools/orte-clean/Makefile
orte/tools/orte-top/Makefile
orte/tools/orte-migrate/Makefile
orte/tools/orte-info/Makefile
orte/tools/orte-server/Makefile
orte/tools/orte-dvm/Makefile

Просмотреть файл

@ -84,24 +84,6 @@ $(top_builddir)/orte/tools/orte-clean/orte-clean.1:
ompi-clean.1: $(top_builddir)/orte/tools/orte-clean/orte-clean.1
cp -f $(top_builddir)/orte/tools/orte-clean/orte-clean.1 ompi-clean.1
$(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1:
(cd $(top_builddir)/orte/tools/orte-checkpoint && $(MAKE) $(AM_MAKEFLAGS) orte-checkpoint.1)
ompi-checkpoint.1: $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1
cp -f $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 ompi-checkpoint.1
$(top_builddir)/orte/tools/orte-restart/orte-restart.1:
(cd $(top_builddir)/orte/tools/orte-restart && $(MAKE) $(AM_MAKEFLAGS) orte-restart.1)
ompi-restart.1: $(top_builddir)/orte/tools/orte-restart/orte-restart.1
cp -f $(top_builddir)/orte/tools/orte-restart/orte-restart.1 ompi-restart.1
$(top_builddir)/orte/tools/orte-migrate/orte-migrate.1:
(cd $(top_builddir)/orte/tools/orte-migrate && $(MAKE) $(AM_MAKEFLAGS) orte-migrate.1)
ompi-migrate.1: $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1
cp -f $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1 ompi-migrate.1
$(top_builddir)/orte/tools/orte-top/orte-top.1:
(cd $(top_builddir)/orte/tools/orte-top && $(MAKE) $(AM_MAKEFLAGS) orte-top.1)

Просмотреть файл

@ -25,29 +25,23 @@
# orte/Makefile.am
SUBDIRS += \
tools/orte-checkpoint \
tools/orte-clean \
tools/orte-ps \
tools/orte-restart \
tools/orted \
tools/orterun \
tools/wrappers \
tools/orte-top \
tools/orte-info \
tools/orte-migrate \
tools/orte-server
DIST_SUBDIRS += \
tools/orte-checkpoint \
tools/orte-clean \
tools/orte-ps \
tools/orte-restart \
tools/orted \
tools/orterun \
tools/wrappers \
tools/orte-top \
tools/orte-info \
tools/orte-migrate \
tools/orte-server \
tools/orte-dvm \
tools/prun

Просмотреть файл

@ -1,51 +0,0 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
include $(top_srcdir)/Makefile.ompi-rules
man_pages = orte-checkpoint.1
EXTRA_DIST = orte-checkpoint.1in
if WANT_FT_CR
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-checkpoint
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_ortedata_DATA = help-orte-checkpoint.txt
endif # OPAL_INSTALL_BINARIES
orte_checkpoint_SOURCES = orte-checkpoint.c
orte_checkpoint_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
endif # WANT_FT_CR
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,113 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI checkpoint tool
#
[usage]
ompi-checkpoint PID_OF_MPIRUN
Open MPI Checkpoint Tool
%s
#
[usage-no-cr]
This build of Open MPI does *not* include Checkpoint/Restart functionality.
If you require this functionality re-configure Open MPI with the proper
Checkpoint/Restart options.
ompi-checkpoint PID_OF_MPIRUN
Open MPI Checkpoint Tool
%s
#
[invalid_pid]
Error: The PID (%d) is invalid because either you have not provided a PID
or provided an invalid PID.
Please see --help for usage.
#
[ckpt_failure]
Error: The application (PID = %d) failed to checkpoint properly.
Returned %d.
#
[pid_does_not_exist]
Error: The process with PID %d is not checkpointable.
This could be due to one of the following:
- An application with this PID doesn't currently exist
- The application with this PID isn't checkpointable
- The application with this PID isn't an Open MPI application.
We were looking for the named file:
%s
#
[no_hnps]
Error: Unable to find a list of active MPIRUN processes on this machine.
This could be due to one of the following:
- The PID specified (%d) is not that of an active MPIRUN.
- The session directory location could not be found/parsed.
ompi-checkpoint attempted to find the session directory:
%s/%s
Check to make sure that this directory exists while the MPIRUN
process is running.
Return Code: %d (%s)
#
[no_universe]
Error: Unable to find the requested, active MPIRUN process on this machine.
This could be due to one of the following:
- The jobid specified by the '--hnp-jobid' option is not
correct.
- The PID specified (%d) is not that of an active MPIRUN.
- The application with this PID is not checkpointable
- The application with this PID is not an Open MPI application.
- The session directory location could not be parsed.
ompi-checkpoint attempted to use the session directory:
%s/%s
#
[unable_to_connect]
Error: Unable to connect to the Head Node Process to initiate the
checkpoint of the application.
This could be due to one of the following:
- The universe specified by the '--hnp-jobid' option is not
correct.
- The PID is not that of an active MPIRUN.
- The application with this PID isn't checkpointable
- The application with this PID isn't an Open MPI application.
#
[non-ckptable]
Error: The job with pid %d is not checkpointable.
This could be caused by one of the following:
- The application is using unsupported components.
- Your application did not select to be checkpointable
To enable checkpointing in an application use the following AMCA parameter
argument to mpirun:
-am ft-enable-cr
#
[not_impl]
The following feature was requested, but is not currently implemented.
%s
If you require this feature contact the Open MPI development group.
[pid_not_found]
Error: The process with PID %d is not checkpointable.
This could be due to one of the following:
- An application with this PID doesn't currently exist
- The application with this PID isn't an Open MPI application.
#
[hnp_not_found]
Error: The jobid specified by the '--hnp-jobid' option does not exist.

Просмотреть файл

@ -1,103 +0,0 @@
.\"
.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
.\" University Research and Technology
.\" Corporation. All rights reserved.
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
.\"
.\" Man page for OMPI's ompi-checkpoint command
.\"
.\" .TH name section center-footer left-footer center-header
.TH OMPI-CHECKPOINT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
ompi-checkpoint, orte-checkpoint \- Checkpoint a running parallel process using the Open MPI
Checkpoint/Restart Service (CRS)
.
.PP
.
\fBNOTE:\fP \fIompi-checkpoint\fP and \fIorte-checkpoint\fP are exact
synonyms for each other. Using either name will result in exactly
identical behavior.
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.B ompi-checkpoint
.B [ options ]
.B <PID_OF_MPIRUN>
.
.\" **************************
.\" Options Section
.\" **************************
.SH Options
.
\fIorte-checkpoint\fR will attempt to notify a running parallel job (identified
by \fImpirun\fP) that it has been requested that the job checkpoint itself. A
global snapshot handle reference is presented to the user, which is used in
\fIompi-restart\fP to restart the job.
.
.TP 10
.B <PID_OF_MPIRUN>
Process ID of the \fImpirun\fP process.
.
.
.TP
.B -h | --help
Display help for this command
.
.
.TP
.B -w | --nowait
Do not wait for the application to finish checkpointing before returning.
.
.
.TP
.B -s | --status
Display status messages regarding the progression of the checkpoint request.
.
.
.TP
.B --term
After checkpointing the running job, terminate it.
.
.
.TP
.B -v | --verbose
Enable verbose output for debugging.
.
.
.TP
.B -gmca | --gmca \fR<key> <value>\fP
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
the parameter name; \fI<value>\fP is the parameter value.
.
.
.TP
.B -mca | --mca <key> <value>
Send arguments to various MCA modules.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
.PP
\fIorte-checkpoint\fR can be invoked multiple, non-overlapping times.
It is convenient to note that the user does not need to specify
the checkpointer to be used here, as that is determined completely by each of
the running processes in the job being checkpointed.
.
.
.\" **************************
.\" See Also Section
.\" **************************
.
.SH SEE ALSO
orte-ps(1), orte-clean(1), ompi-restart(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
.

Просмотреть файл

@ -1,985 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Checkpoint Tool for checkpointing a multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#include <string.h>
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
static int ckpt_init(int argc, char *argv[]); /* Initialization routine */
static int ckpt_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]);
static int find_hnp(void);
static int start_listener(void);
static int stop_listener(void);
/* Receive callback registered by start_listener() on ORTE_RML_TAG_CKPT */
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options);
static int pretty_print_status(void);
static int pretty_print_reference(void);
static int list_all_snapshots(void);
/* Contact record of the mpirun (HNP) chosen by find_hnp(); destination of the checkpoint request */
static orte_hnp_contact_t *orterun_hnp = NULL;
/* Snapshot handle and sequence number unpacked from the HNP's status updates */
static char * global_snapshot_handle = NULL;
static int global_sequence_num = 0;
/*****************************************
* Run-time state of the tool
* (the command line arguments themselves live in orte_checkpoint_globals)
*****************************************/
/* True between start_listener() and stop_listener() */
static bool listener_started = false;
/* Set by process_ckpt_update_cmd(); main() polls this flag while waiting */
static bool is_checkpoint_finished = false;
/* Record that the ESTABLISHED / RECOVERED state updates have been seen */
static bool is_checkpoint_established = false;
static bool is_checkpoint_recovered = false;
/* Timestamps in seconds, as returned by get_time(): when the request was sent, and the last status line printed */
static double timer_start = 0;
static double timer_last = 0;
static double get_time(void);
/*
 * Settings and state of the orte-checkpoint tool. parse_args() zeroes the
 * structure and assigns defaults; most members are then filled in through
 * the destinations listed in cmd_line_opts.
 */
typedef struct {
bool help; /* -h/--help: print the usage message and exit */
int pid; /* PID of the mpirun to checkpoint; -1 until parsed */
opal_crs_base_ckpt_options_t *options; /* Options object packed into the checkpoint request */
bool term; /* --term: copied into options->term by parse_args() */
bool stop; /* --stop: copied into options->stop by parse_args() */
bool verbose; /* -v/--verbose; also forced on when verbose_level > 0 */
int verbose_level; /* -V <level>; negative values are clamped to 0 */
orte_jobid_t req_hnp; /**< User Requested HNP */
bool nowait; /* Do not wait for checkpoint to complete before returning */
bool status; /* Display status messages while checkpoint is progressing */
int output; /* opal_output stream id used for verbose messages; -1 until ckpt_init() assigns it */
int ckpt_status; /* Last checkpoint state unpacked by process_ckpt_update_cmd() */
bool list_only; /* List available checkpoints only */
#if OPAL_ENABLE_CRDEBUG == 1
bool enable_crdebug; /* Enable C/R Debugging */
bool attach_debugger; /* --attach: copied into options->attach_debugger */
bool detach_debugger; /* --detach: copied into options->detach_debugger */
#endif
} orte_checkpoint_globals_t;
/* The single instance used throughout this file */
orte_checkpoint_globals_t orte_checkpoint_globals;
/*
 * Command line option table handed to opal_cmd_line_create() in parse_args().
 *
 * Each entry names an option (short character and/or long name), the number
 * of parameters it takes, the address of the orte_checkpoint_globals member
 * that is its destination, the destination's type, and the help text.
 * NOTE(review): the exact meaning of the first and third fields (presumably
 * the MCA parameter name and the single-dash name) is defined by
 * opal_cmd_line_init_t in opal/util/cmd_line.h -- confirm there.
 * NOTE(review): "hnp-jobid" stores into req_hnp (an orte_jobid_t) through
 * OPAL_CMD_LINE_TYPE_INT -- confirm the two types have the same size.
 *
 * The table is terminated by the all-NULL entry at the end.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL,
'h', NULL, "help",
0,
&orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL,
'v', NULL, "verbose",
0,
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL,
'V', NULL, NULL,
1,
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
{ NULL,
'\0', NULL, "term",
0,
&(orte_checkpoint_globals.term), OPAL_CMD_LINE_TYPE_BOOL,
"Terminate the application after checkpoint (Cannot be used with --stop)" },
{ NULL,
'\0', NULL, "stop",
0,
&(orte_checkpoint_globals.stop), OPAL_CMD_LINE_TYPE_BOOL,
"Send SIGSTOP to application just after checkpoint (checkpoint will not finish until SIGCONT is sent) (Cannot be used with --term)" },
{ NULL,
'w', NULL, "nowait",
0,
&orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL,
"Do not wait for the application to finish checkpointing before returning" },
{ NULL,
's', NULL, "status",
0,
&orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL,
"Display status messages describing the progression of the checkpoint" },
{ "hnp-jobid",
'\0', NULL, "hnp-jobid",
1,
&orte_checkpoint_globals.req_hnp, OPAL_CMD_LINE_TYPE_INT,
"This should be the jobid of the HNP whose applications you wish "
"to checkpoint." },
{ "hnp-pid",
'\0', NULL, "hnp-pid",
1,
&orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish "
"to checkpoint." },
{ NULL,
'l', NULL, "list",
0,
&orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display a list of checkpoint files available on this machine" },
#if OPAL_ENABLE_CRDEBUG == 1
/* Options only available when C/R debugging support is compiled in */
{ NULL,
'\0', "crdebug", "crdebug",
0,
&orte_checkpoint_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
"Enable C/R Enhanced Debugging" },
{ NULL,
'\0', "attach", "attach",
0,
&(orte_checkpoint_globals.attach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
"Wait for the debugger to attach directly after taking the checkpoint." },
{ NULL,
'\0', "detach", "detach",
0,
&(orte_checkpoint_globals.detach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
"Do not wait for the debugger to reattach after taking the checkpoint." },
#endif
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/*
 * Emit the verbose description of the checkpoint request that is about to be
 * sent: which mpirun was asked for, which one we connected to, and whether
 * the job will be terminated or stopped afterwards.
 */
static void announce_checkpoint_request(void)
{
    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: Checkpointing...");

    if (orte_checkpoint_globals.pid > 0) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t PID %d",
                            orte_checkpoint_globals.pid);
    } else if (orte_checkpoint_globals.req_hnp != ORTE_JOBID_INVALID) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Mpirun (%s)",
                            ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "\t Connected to Mpirun %s",
                        ORTE_NAME_PRINT(&orterun_hnp->name));

    if (orte_checkpoint_globals.options->term) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Terminating after checkpoint\n");
    }
    if (orte_checkpoint_globals.options->stop) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Stopping after checkpoint\n");
    }
}

/*
 * Tool entry point.
 *
 * Initializes the tool, then either lists the available snapshots (--list)
 * or locates the requested mpirun, sends it a checkpoint request, and spins
 * on opal_progress() until the status updates report completion.
 *
 * Returns the code from ckpt_finalize() if finalization fails, otherwise
 * ORTE_SUCCESS or the error code of the step that failed.
 */
int
main(int argc, char *argv[])
{
    int rc;
    int exit_status = ORTE_SUCCESS;

    /* Initialize */
    rc = ckpt_init(argc, argv);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    /* Listing only: the outcome of the listing is the outcome of the tool */
    if (orte_checkpoint_globals.list_only) {
        exit_status = list_all_snapshots();
        goto cleanup;
    }

    /* Find the HNP that we want to connect to (errors are printed by find_hnp) */
    rc = find_hnp();
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    /* Reset the progress flags before issuing the request */
    is_checkpoint_finished    = false;
    is_checkpoint_recovered   = false;
    is_checkpoint_established = false;

    if (orte_checkpoint_globals.verbose) {
        announce_checkpoint_request();
    }

    /* Checkpoint the requested PID */
    rc = notify_process_for_checkpoint(orte_checkpoint_globals.options);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, rc);
        ORTE_ERROR_LOG(rc);
        exit_status = rc;
        goto cleanup;
    }

    /* Wait for the checkpoint to complete */
    if (!orte_checkpoint_globals.nowait) {
        while (!is_checkpoint_finished) {
            opal_progress();
        }
    }

    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT != orte_checkpoint_globals.ckpt_status &&
        ORTE_SNAPC_CKPT_STATE_ERROR   != orte_checkpoint_globals.ckpt_status) {
        /* Success: show the snapshot reference, unless we did not wait for it */
        if (!orte_checkpoint_globals.nowait) {
            pretty_print_reference();
        }
    } else {
        exit_status = ORTE_ERROR;
    }

 cleanup:
    /* A finalization failure takes precedence over the earlier status */
    rc = ckpt_finalize();
    return (ORTE_SUCCESS != rc) ? rc : exit_status;
}
/*
 * Parse the command line into orte_checkpoint_globals.
 *
 * Resets orte_checkpoint_globals to its defaults, parses the options in
 * cmd_line_opts (plus the MCA options), exports the MCA arguments to the
 * environment, and reads the trailing argument as the PID of mpirun.
 *
 * Returns ORTE_SUCCESS on success; 1 for a command line error or when
 * neither a PID nor --hnp-jobid was given; ORTE_ERROR for an invalid PID or
 * when the build lacks Checkpoint/Restart support. Calls exit(0) after
 * printing the usage message for --help.
 */
static int parse_args(int argc, char *argv[]) {
    int i, ret, len, exit_status = ORTE_SUCCESS ;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char * tmp_env_var = NULL;
    char *argv0 = NULL;

    /* Init structure */
    memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
    orte_checkpoint_globals.help = false;
    orte_checkpoint_globals.pid = -1;
    orte_checkpoint_globals.verbose = false;
    orte_checkpoint_globals.verbose_level = 0;
    orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
    orte_checkpoint_globals.nowait = false;
    orte_checkpoint_globals.status = false;
    orte_checkpoint_globals.output = -1;
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
    orte_checkpoint_globals.list_only = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.enable_crdebug = false;
#endif
    orte_checkpoint_globals.options = OBJ_NEW(opal_crs_base_ckpt_options_t);
    orte_checkpoint_globals.term = false;
    orte_checkpoint_globals.stop = false;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.attach_debugger = false;
    orte_checkpoint_globals.detach_debugger = false;
#endif

    /*
     * Construct the command line object before anything touches it: the
     * no-C/R branch below asks it for the usage message, which previously
     * happened on an uninitialized object.
     */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

#if OPAL_ENABLE_FT_CR == 0
    /* Warn and exit if not configured with Checkpoint/Restart */
    {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage-no-cr",
                                    true, args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
#endif

    /* Parse the command line options */
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        exit_status = 1;
        goto cleanup;
    }

    if (orte_checkpoint_globals.help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-checkpoint.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment.
     * The parsed command line (not argc) is what carries the MCA arguments.
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        /* putenv keeps the pointer, so app_env must not be freed */
        putenv(app_env[i]);
    }
    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    /* Export the opal_cr_is_tool MCA variable as "1" for this process */
    (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */
    /* get the remaining bits; argv is replaced, so keep the program name */
    argv0 = strdup(argv[0]);
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    if(orte_checkpoint_globals.list_only ) {
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    if (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp) {
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        exit_status = 1;
        goto cleanup;
    }

    orte_checkpoint_globals.options->term = orte_checkpoint_globals.term;
    orte_checkpoint_globals.options->stop = orte_checkpoint_globals.stop;
#if OPAL_ENABLE_CRDEBUG == 1
    orte_checkpoint_globals.options->attach_debugger = orte_checkpoint_globals.attach_debugger;
    orte_checkpoint_globals.options->detach_debugger = orte_checkpoint_globals.detach_debugger;
#endif

    if(orte_checkpoint_globals.verbose_level < 0 ) {
        orte_checkpoint_globals.verbose_level = 0;
    }
    if(orte_checkpoint_globals.verbose_level > 0) {
        orte_checkpoint_globals.verbose = true;
    }

    /*
     * If the user did not supply an hnp jobid, then they must
     * supply the PID of MPIRUN
     */
    if(0 >= argc &&
       ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp) {
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    orte_checkpoint_globals.pid = atoi(argv[0]);
    if ( 0 >= orte_checkpoint_globals.pid ) {
        opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
                       orte_checkpoint_globals.pid);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * JJH: No wait is currently not implemented or tested
     */
    if(orte_checkpoint_globals.nowait) {
        orte_checkpoint_globals.nowait = false;
        opal_show_help("help-orte-checkpoint.txt", "not_impl",
                       true,
                       "Disconnected checkpoint");
    }

    /* Verbose mode implies status messages */
    if(orte_checkpoint_globals.verbose) {
        orte_checkpoint_globals.status = true;
    }

 cleanup:
    if (NULL != argv0) {
        free(argv0);
    }
    return exit_status;
}
/*
 * This function attempts to find an HNP to connect to.
 *
 * It obtains the list of local HNPs and searches it for the entry whose
 * jobid equals the requested --hnp-jobid or whose PID equals the requested
 * PID. The matching entry is stored in orterun_hnp; it is not released here.
 *
 * Returns ORTE_SUCCESS when a matching HNP was found, the error from
 * orte_list_local_hnps() when the list could not be built, and ORTE_ERROR
 * when orterun_hnp is still NULL on exit. A help message is shown for each
 * failure.
 */
static int find_hnp(void) {
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t hnp_list;
    opal_list_item_t *item;
    orte_hnp_contact_t *hnpcandidate;

    /* get the list of local hnp's available to us and setup
     * contact info for them into the RML
     */
    OBJ_CONSTRUCT(&hnp_list, opal_list_t);
    if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
        opal_show_help("help-orte-checkpoint.txt", "no_hnps", true,
                       orte_checkpoint_globals.pid,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir,
                       ret, ORTE_ERROR_NAME(ret));
        exit_status = ret;
        goto cleanup;
    }

    /* search the list for the desired hnp */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        hnpcandidate = (orte_hnp_contact_t*)item;
        if (hnpcandidate->name.jobid == orte_checkpoint_globals.req_hnp ||
            hnpcandidate->pid == orte_checkpoint_globals.pid) {
            /* this is the one we want */
            orterun_hnp = hnpcandidate;
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /* Not the one we want: it has left the list, so release it here
         * or nobody will. */
        OBJ_RELEASE(item);
    }

    /* If no match was found, error out */
    opal_show_help("help-orte-checkpoint.txt", "no_universe", true,
                   orte_checkpoint_globals.pid,
                   orte_process_info.tmpdir_base,
                   orte_process_info.top_session_dir);

 cleanup:
    /* Release whatever is still on the list (entries after the match) */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&hnp_list);

    if( NULL == orterun_hnp ) {
        return ORTE_ERROR;
    } else {
        return exit_status;
    }
}
/*
 * Bring the tool up: initialize the OPAL utilities, parse the command line,
 * initialize ORTE as a tool, open the output stream and start the listener.
 *
 * Returns ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int ckpt_init(int argc, char *argv[]) {
    int rc;
    char *crs_env_name = NULL;

    listener_started = false;

    /*
     * The utilities must be initialized before parse_args() so that
     * installdirs is set up properly before mca_base_open() is called.
     */
    rc = opal_init_util(&argc, &argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Parse Command Line Arguments */
    rc = parse_args(argc, argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /*
     * This tool itself is never checkpointed, so turn the checkpoint
     * notification routine off. Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the none component, since we don't actually use a checkpointer */
    (void) mca_base_var_env_name("crs", &crs_env_name);
    opal_setenv(crs_env_name, "none", true, &environ);
    free(crs_env_name);
    crs_env_name = NULL;

    /* we are never allowed to operate as a distributed tool,
     * so insist on the ess/tool component */
    opal_setenv("OMPI_MCA_ess", "tool", true, &environ);

    /*
     * We need all of OPAL and the TOOLS portion of ORTE - this
     * sets us up so we can talk to any HNP over the wire
     */
    rc = orte_init(&argc, &argv, ORTE_PROC_TOOL);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Setup ORTE Output handle from the verbose argument */
    if (!orte_checkpoint_globals.verbose) {
        orte_checkpoint_globals.output = 0; /* Default=STDERR */
    } else {
        orte_checkpoint_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_checkpoint_globals.output,
                                  orte_checkpoint_globals.verbose_level);
    }

    /* Start the listener; its status is the status of the whole init */
    return start_listener();
}
/*
 * Shut the tool down: stop the listener, then finalize ORTE.
 *
 * Both steps always run. A failure of orte_finalize() takes precedence;
 * otherwise the status of stop_listener() is returned.
 */
static int ckpt_finalize(void) {
    const int listener_rc = stop_listener();
    const int finalize_rc = orte_finalize();

    return (ORTE_SUCCESS != finalize_rc) ? finalize_rc : listener_rc;
}
/*
 * Post a persistent, non-blocking receive on ORTE_RML_TAG_CKPT from any
 * sender, with hnp_receiver() as the callback, and record that the listener
 * is running. Always returns ORTE_SUCCESS.
 */
static int start_listener(void)
{
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                            ORTE_RML_TAG_CKPT,
                            ORTE_RML_PERSISTENT,
                            hnp_receiver,
                            NULL);

    listener_started = true;

    return ORTE_SUCCESS;
}
/*
 * Cancel the receive posted by start_listener().
 *
 * Returns ORTE_SUCCESS when a running listener was cancelled, and
 * ORTE_ERROR when no listener had been started.
 */
static int stop_listener(void)
{
    if (listener_started) {
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT);
        listener_started = false;
        return ORTE_SUCCESS;
    }

    return ORTE_ERROR;
}
/*
 * Receive callback for messages on ORTE_RML_TAG_CKPT.
 *
 * Unpacks the leading command flag from the buffer. Status updates are
 * handed to process_ckpt_update_cmd(); init and term commands are ignored;
 * any other command is logged as ORTE_ERR_VALUE_OUT_OF_BOUNDS. The status,
 * tag and cbdata arguments are not used.
 */
static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata)
{
    orte_snapc_cmd_flag_t command;
    orte_std_cntr_t n_vals = 1;
    int ret;

    opal_output_verbose(5, orte_checkpoint_globals.output,
                        "orte_checkpoint: hnp_receiver: Receive a command message.");

    /* The command flag is the first item in the message */
    ret = opal_dss.unpack(buffer, &command, &n_vals, ORTE_SNAPC_CMD);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        return;
    }

    if (ORTE_SNAPC_GLOBAL_UPDATE_CMD == command) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "orte_checkpoint: hnp_receiver: Status Update.");
        process_ckpt_update_cmd(sender, buffer);
    } else if (ORTE_SNAPC_GLOBAL_INIT_CMD == command ||
               ORTE_SNAPC_GLOBAL_TERM_CMD == command) {
        /* Do Nothing */
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
    }
}
/*
 * Handle a checkpoint status update received from the HNP.
 *
 * Unpacks the checkpoint state and, for the RECOVERED, ESTABLISHED, STOPPED
 * and ERROR states, the global snapshot handle and sequence number. It then
 * prints the status line (when requested) and sets is_checkpoint_finished
 * once the checkpoint failed, stopped, or both the ESTABLISHED and RECOVERED
 * states have been seen.
 *
 * The sender argument is not used. Unpack failures are logged and the update
 * is dropped.
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
    char *new_handle = NULL;

    /*
     * Receive the data:
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number (upon finish only)
     */
    count = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
        ORTE_ERROR_LOG(ret);
        return;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == orte_checkpoint_globals.ckpt_status ) {
        /*
         * Unpacking a string hands back a newly allocated copy. Unpack into
         * a temporary and free the handle from the previous update, so that
         * repeated updates do not leak and a failed unpack leaves the old
         * handle untouched.
         */
        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &new_handle, &count, OPAL_STRING)) ) {
            ORTE_ERROR_LOG(ret);
            return;
        }
        free(global_snapshot_handle);
        global_snapshot_handle = new_handle;

        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) {
            ORTE_ERROR_LOG(ret);
            return;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
                       true,
                       orte_checkpoint_globals.pid);
        is_checkpoint_finished = true;
        return;
    }

    if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ORTE_ERROR);
        is_checkpoint_finished = true;
        return;
    }

    /* Status progression */
    if( orte_checkpoint_globals.status ) {
        pretty_print_status();
    }

    if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_finished = true;
        return;
    }

    /*
     * Normal termination check: the checkpoint is finished once both the
     * ESTABLISHED and the RECOVERED updates have arrived, in either order.
     */
    if( (ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) ||
        (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){
        is_checkpoint_finished = true;
        return;
    }
    else if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_recovered = true;
    }
    else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_established = true;
    }
}
/*
 * Send the checkpoint request to the HNP stored in orterun_hnp.
 *
 * The message carries the ORTE_SNAPC_GLOBAL_INIT_CMD command, the packed
 * checkpoint options, and a jobid (always ORTE_JOBID_INVALID here). It also
 * records the request time in timer_start.
 *
 * options: checkpoint options to pack into the request.
 *
 * Returns ORTE_SUCCESS once the send has been queued; otherwise the error
 * code of the failing step, after showing the "unable_to_connect" help
 * message.
 */
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_INIT_CMD;
    orte_jobid_t jobid = ORTE_JOBID_INVALID;

    if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_checkpoint_globals.pid);

    timer_start = get_time();

    /***********************************
     * Notify HNP of checkpoint request
     * Send:
     * - Command
     * - options
     * - jobid
     ***********************************/
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if( ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer,
                                                       ORTE_RML_TAG_CKPT, orte_rml_send_callback,
                                                       NULL))) {
        exit_status = ret;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
                        ORTE_JOBID_PRINT(jobid));

 cleanup:
    if( ORTE_SUCCESS != exit_status ) {
        opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
                       orte_checkpoint_globals.pid);
        /*
         * The buffer was never accepted for sending, so it is still ours
         * and must be released here. After a successful send_buffer_nb()
         * the send callback owns it.
         */
        if (NULL != buffer) {
            OBJ_RELEASE(buffer);
        }
    }

    return exit_status;
}
/***************
* Pretty Print
***************/
/*
 * Return the current time in seconds as a double.
 *
 * Uses the native OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is
 * set, and gettimeofday() otherwise.
 */
static double get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);
    seconds = now.tv_sec;                        /* whole seconds */
    seconds += (double)now.tv_usec / 1000000.0;  /* fractional part */
    return seconds;
#endif
}
/*
 * Print one status line for the current checkpoint state.
 *
 * The line shows the time since the previous status line, the time since
 * timer_start, the state name and, when known, the global snapshot handle.
 * timer_last is updated to the time of this call.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_status(void)
{
    char *status_name = NULL;
    const double now = get_time();
    double since_last, since_start;

    /* First report: there is no previous line, so the delta is zero. */
    if (0 == timer_last) {
        timer_last = now;
    }
    since_last = now - timer_last;
    since_start = now - timer_start;

    orte_snapc_ckpt_state_str(&status_name, orte_checkpoint_globals.ckpt_status);

    if (NULL == global_snapshot_handle) {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - ...\n",
                    since_last, since_start,
                    25, status_name);
    } else {
        opal_output(0,
                    "[%6.2f / %6.2f] %*s - %s\n",
                    since_last, since_start,
                    25, status_name, global_snapshot_handle);
    }

    /* free(NULL) is a no-op, so no guard is needed. */
    free(status_name);

    timer_last = now;
    return ORTE_SUCCESS;
}
/*
 * Print the sequence number and handle of the global snapshot on stdout.
 *
 * When built with OPAL_ENABLE_CRDEBUG == 1 and the enable_crdebug option
 * is set, the "Checkpoint handle" layout is used; otherwise the regular
 * "Snapshot Ref." layout is printed.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_reference(void)
{
    int use_crdebug_layout = 0;

#if OPAL_ENABLE_CRDEBUG == 1
    use_crdebug_layout = orte_checkpoint_globals.enable_crdebug ? 1 : 0;
#endif

    if (use_crdebug_layout) {
        printf("Checkpoint handle: -s %3d %s\n",
               global_sequence_num,
               global_snapshot_handle);
    } else {
        printf("Snapshot Ref.: %3d %s\n",
               global_sequence_num,
               global_snapshot_handle);
    }

    return ORTE_SUCCESS;
}
/*
 * Print every global snapshot reference found by the sstore framework,
 * together with the sequence numbers available for each one.
 *
 * @return ORTE_SUCCESS, or the error code returned while listing the
 *         snapshots or while collecting the sequence numbers of one.
 */
static int list_all_snapshots(void)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t *all_snapshots = NULL;
    opal_list_item_t *item = NULL;
    orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;

    all_snapshots = OBJ_NEW(opal_list_t);

    if (ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL))) {
        opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
                    orte_sstore_base_global_snapshot_dir);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * For each reference
     */
    for (item = opal_list_get_first(all_snapshots);
         item != opal_list_get_end(all_snapshots);
         item = opal_list_get_next(item)) {
        global_snapshot = (orte_sstore_base_global_snapshot_info_t *)item;

        /*
         * Get a list of valid sequence numbers
         */
        if (ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot,
                                                                      &(global_snapshot->num_seqs),
                                                                      &(global_snapshot->all_seqs)))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

#if OPAL_ENABLE_CRDEBUG == 1
        /* Pretty print the result - C/R Debug version */
        if (orte_checkpoint_globals.enable_crdebug) {
            int s;
            for (s = 0; s < global_snapshot->num_seqs; ++s) {
                printf("-s %s %s\n", global_snapshot->all_seqs[s], global_snapshot->reference);
            }
        }
        else
#endif
        {
            /* Pretty print the result */
            printf("Snapshot Ref.: %s\t[",
                   global_snapshot->reference);
            if (0 >= global_snapshot->num_seqs) {
                printf("No Valid Checkpoints");
            } else {
                /* opal_argv_join() hands back an allocated string that the
                 * caller owns: print it, then free it instead of leaking
                 * one string per snapshot. */
                char *seq_list = opal_argv_join(global_snapshot->all_seqs, ',');
                if (NULL != seq_list) {
                    printf("%s", seq_list);
                    free(seq_list);
                }
            }
            printf("]\n");
        }
    }

 cleanup:
    while (NULL != (item = opal_list_remove_first(all_snapshots))) {
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(all_snapshots);

    return exit_status;
}

Просмотреть файл

@ -1,44 +0,0 @@
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
#
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Shared make rules taken from the top of the source tree.
include $(top_srcdir)/Makefile.ompi-rules
# The man page is a build product; only its .1in template is distributed.
man_pages = orte-migrate.1
EXTRA_DIST = orte-migrate.1in
# The tool is only built when WANT_FT_CR is set (presumably the
# checkpoint/restart configure option -- confirm in configure).
if WANT_FT_CR
# The program, its man page and its help file are only installed when
# binaries are being installed.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-migrate
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_ortedata_DATA = help-orte-migrate.txt
endif # OPAL_INSTALL_BINARIES
# Single-source program linked against the ORTE and OPAL libraries.
orte_migrate_SOURCES = orte-migrate.c
orte_migrate_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
endif # WANT_FT_CR
# Remove the generated man page on distclean.
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,81 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI migrate tool
#
[usage]
ompi-migrate PID_OF_MPIRUN
Open MPI Process Migration Tool
%s
#
[usage-no-cr]
This build of Open MPI does *not* include Checkpoint/Restart functionality.
If you require this functionality re-configure Open MPI with the proper
Checkpoint/Restart options.
ompi-migrate PID_OF_MPIRUN
Open MPI Process Migration Tool
%s
#
[invalid_pid]
Error: The PID (%d) is invalid because either you have not provided a PID
or provided an invalid PID.
Please see --help for usage.
#
[no_universe]
Error: Unable to find the contact information for PID %d.
This could be due to one of the following:
- The PID is not that of an active MPIRUN.
- The application with this PID isn't migratable
- The application with this PID isn't an Open MPI application.
ompi-migrate attempted to find the session directory:
%s
#
[unable_to_connect]
Error: Unable to connect to the Head Node Process to initiate the
migration of the application.
This could be due to one of the following:
- The PID is not that of an active MPIRUN.
- The application with this PID isn't migratable
- The application with this PID isn't an Open MPI application.
#
[non-ckptable]
Error: The job with pid %d is not checkpointable.
This could be caused by one of the following:
- The application is using unsupported components.
- Your application did not select to be checkpointable
To enable checkpointing in an application use the following AMCA parameter
argument to mpirun:
-am ft-enable-cr
#
[not_impl]
The following feature was requested, but is not currently implemented.
%s
If you require this feature contact the Open MPI development group.
#
[err-inprogress]
Error: The Job identified by PID (%d) is currently migrating other processes.
Only one migration request can be processed at a time. Please try again
later.
#
[err-other]
Error: The Job identified by PID (%d) was not able to migrate processes in this
job. This could be caused by any of the following:
- Invalid node or rank specified
- No processes on the indicated node can be migrated
- Process migration was not enabled for this job. Make sure to indicate
the proper AMCA file: "-am ft-enable-cr-recovery".

Просмотреть файл

@ -1,81 +0,0 @@
.\"
.\" Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
.\" University Research and Technology
.\" Corporation. All rights reserved.
.\"
.\" Man page for OMPI's ompi-migrate command
.\"
.\" .TH name section center-footer left-footer center-header
.TH OMPI-MIGRATE 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
ompi-migrate, orte-migrate \- Migrate processes among resources in Open MPI.
.
.PP
.
\fBNOTE:\fP \fIompi-migrate\fP and \fIorte-migrate\fP are exact
synonyms for each other. Using either name will result in exactly
identical behavior.
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.B ompi-migrate
[ options ]
.B <PID_OF_MPIRUN>
.
.\" **************************
.\" Options Section
.\" **************************
.SH Options
.
\fIorte-migrate\fR will attempt to notify a running parallel job (identified
by \fImpirun\fP) that a migration has been requested.
.
.TP 10
.B <PID_OF_MPIRUN>
Process ID of the \fImpirun\fP process.
.
.
.TP
.B -h | --help
Display help for this command
.
.
.TP
.B -v | --verbose
Enable verbose output for debugging.
.
.
.TP
.B -gmca | --gmca \fR<key> <value>\fP
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
the parameter name; \fI<value>\fP is the parameter value.
.
.
.TP
.B -mca | --mca <key> <value>
Send arguments to various MCA modules.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
.PP
\fIorte-migrate\fR can be invoked multiple, non-overlapping times.
.
.
.\" **************************
.\" See Also Section
.\" **************************
.
.SH SEE ALSO
orte-ps(1), orte-clean(1), ompi-restart(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
.

Просмотреть файл

@ -1,791 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Process Migration Tool for migrating processes in a multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#include <string.h>
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
/* Tool lifecycle: set-up, tear-down and command-line handling. */
static int tool_init(int argc, char *argv[]); /* Initialization routine */
static int tool_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]);
/* Locate the HNP whose pid matches the one requested by the user. */
static int find_hnp(void);
/* Post / cancel the persistent receive on ORTE_RML_TAG_MIGRATE. */
static int start_listener(void);
static int stop_listener(void);
/* Receive callback for messages arriving on ORTE_RML_TAG_MIGRATE. */
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
/* Handle a status-update command: record the new state and report it. */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
/* Send the migration request to the HNP. */
static int notify_hnp(void);
/* Console reporting helpers. */
static int pretty_print_status(void);
static int pretty_print_migration(void);
/* Contact record of the HNP selected by find_hnp(); NULL until one is found. */
static orte_hnp_contact_t *orterun_hnp = NULL;
/* Most recent migration state unpacked from an HNP status update. */
static int orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
/* True while the receive posted by start_listener() is active. */
static bool listener_started = false;
/* Timestamps in seconds: timer_start is set by notify_hnp(), timer_last
 * by each call to pretty_print_status(). */
static double timer_start = 0;
static double timer_last = 0;
/* Current time in seconds. */
static double get_time(void);
/*
 * Parsed command-line state of the tool.  The structure is reset at the
 * top of parse_args() and then filled in through the cmd_line_opts table.
 */
typedef struct {
/* -h / --help: print the usage message and exit */
bool help;
/* PID of the mpirun to contact (--hnp-pid, or the first trailing argument) */
int pid;
/* -v / --verbose; also switched on when verbose_level > 0 */
bool verbose;
/* -V <level>: verbosity level applied to the output stream */
int verbose_level;
/* print status progression lines; enabled by parse_args() when verbose */
bool status;
/* opal_output stream id used for verbose messages */
int output;
/* -x / --off: comma separated list of nodes to migrate off of */
char *off_nodes;
/* -r / --ranks: comma separated list of ranks to migrate */
char *off_procs;
/* -t / --onto: comma separated list of nodes to migrate onto */
char *onto_nodes;
} orte_migrate_globals_t;
/* The single instance of the tool's command-line state. */
orte_migrate_globals_t orte_migrate_globals;
/*
 * Command-line option table handed to opal_cmd_line_create() in
 * parse_args().  Each entry gives a short option character and/or long
 * option name, the number of parameters the option takes, the
 * orte_migrate_globals field and value type that receive the parsed
 * value, and the help text.  (Field meanings are read off the values
 * used here -- confirm against the opal_cmd_line_init_t definition.)
 * The table ends with an OPAL_CMD_LINE_TYPE_NULL entry.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL,
'h', NULL, "help",
0,
&orte_migrate_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL,
'v', NULL, "verbose",
0,
&orte_migrate_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL,
'V', NULL, NULL,
1,
&orte_migrate_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
/* The only entry whose first field is set rather than NULL. */
{ "hnp-pid",
'\0', NULL, "hnp-pid",
1,
&orte_migrate_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish "
"to migrate." },
{ NULL,
'x', NULL, "off",
1,
&orte_migrate_globals.off_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate off of (comma separated)" },
{ NULL,
'r', NULL, "ranks",
1,
&orte_migrate_globals.off_procs, OPAL_CMD_LINE_TYPE_STRING,
"List of MPI_COMM_WORLD ranks to migrate (comma separated)" },
{ NULL,
't', NULL, "onto",
1,
&orte_migrate_globals.onto_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate onto (comma separated)" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
int
main(int argc, char *argv[])
{
int ret, exit_status = ORTE_SUCCESS;
/***************
* Initialize
***************/
if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) {
exit_status = ret;
goto cleanup;
}
/***************************
* Find the HNP that we want to connect to, if it exists
***************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Finding HNP...");
}
if (ORTE_SUCCESS != (ret = find_hnp())) {
opal_show_help("help-orte-migrate.txt", "invalid_pid",
true, orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Send migration information to HNP
*******************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Sending info to HNP...");
}
if (ORTE_SUCCESS != (ret = notify_hnp())) {
opal_output(0,
"HNP with PID %d Not found!",
orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Wait for migration to complete
*******************************/
while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) {
opal_progress();
}
if( orte_migrate_globals.status ) {
orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
pretty_print_status();
}
cleanup:
/***************
* Cleanup
***************/
if (ORTE_SUCCESS != (ret = tool_finalize())) {
return ret;
}
return exit_status;
}
/*
 * Parse the command line into orte_migrate_globals.
 *
 * Resets the globals, parses the options described by cmd_line_opts plus
 * the MCA options, exports the MCA parameters into the environment and
 * takes the pid of mpirun from the first trailing argument.  With --help
 * the usage text is printed and the process exits.
 *
 * @param argc  Argument count as received by the tool.
 * @param argv  Argument vector as received by the tool.
 * @return ORTE_SUCCESS on success; ORTE_ERROR or 1 when the arguments are
 *         unusable or the build has no checkpoint/restart support.
 */
static int parse_args(int argc, char *argv[])
{
    int i, ret, len, exit_status = ORTE_SUCCESS;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char *tmp_env_var = NULL;
    char *argv0 = NULL;

    /* Init structure */
    memset(&orte_migrate_globals, 0, sizeof(orte_migrate_globals_t));
    orte_migrate_globals.help = false;
    orte_migrate_globals.pid = -1;
    orte_migrate_globals.verbose = false;
    orte_migrate_globals.verbose_level = 0;
    orte_migrate_globals.status = false;
    orte_migrate_globals.output = -1;
    orte_migrate_globals.off_nodes = NULL;
    orte_migrate_globals.off_procs = NULL;
    orte_migrate_globals.onto_nodes = NULL;

    /*
     * Build the command line object first: the no-CR branch below asks it
     * for the usage message, which must not be done on an uninitialized
     * object.
     */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

#if OPAL_ENABLE_FT_CR == 0
    /* Warn and exit if not configured with Migrate/Restart */
    {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-migrate.txt", "usage-no-cr",
                                    true, args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
#endif

    /* Parse the command line options */
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);

    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        exit_status = 1;
        goto cleanup;
    }

    if (orte_migrate_globals.help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orte-migrate.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(argv, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for (i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for (i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */

    /*
     * Keep the program name: from here on argc/argv describe only the
     * trailing (non-option) arguments.
     */
    argv0 = strdup(argv[0]);
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    if (NULL == orte_migrate_globals.off_nodes &&
        NULL == orte_migrate_globals.off_procs) {
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        exit_status = 1;
        goto cleanup;
    }

    if (orte_migrate_globals.verbose_level < 0) {
        orte_migrate_globals.verbose_level = 0;
    }

    if (orte_migrate_globals.verbose_level > 0) {
        orte_migrate_globals.verbose = true;
    }

    /*
     * If the user did not supply an hnp jobid, then they must
     * supply the PID of MPIRUN
     */
    if (0 >= argc) {
        /* The tail is empty here, so argv[0] must not be read; use the
         * saved program name instead. */
        fprintf(stderr, "%s: Nothing to do\n", argv0);
        fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    orte_migrate_globals.pid = atoi(argv[0]);
    if (0 >= orte_migrate_globals.pid) {
        opal_show_help("help-orte-migrate.txt", "invalid_pid", true,
                       orte_migrate_globals.pid);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* Verbose runs also print the status progression and the request. */
    if (orte_migrate_globals.verbose) {
        orte_migrate_globals.status = true;
    }

    if (orte_migrate_globals.verbose) {
        pretty_print_migration();
    }

 cleanup:
    if (NULL != argv0) {
        free(argv0);
    }

    return exit_status;
}
/*
* This function attempts to find an HNP to connect to.
*/
/*
 * Find the HNP whose pid equals orte_migrate_globals.pid among the local
 * HNPs and remember its contact record in orterun_hnp.
 *
 * @return ORTE_SUCCESS when a matching HNP was found; ORTE_ERROR when the
 *         local HNPs could not be listed or none of them matches.
 */
static int find_hnp(void)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t hnp_list;
    opal_list_item_t *item;
    orte_hnp_contact_t *hnpcandidate;

    /* get the list of local hnp's available to us and setup
     * contact info for them into the RML
     */
    OBJ_CONSTRUCT(&hnp_list, opal_list_t);
    if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /* search the list for the desired hnp */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        hnpcandidate = (orte_hnp_contact_t *)item;
        if (hnpcandidate->pid == orte_migrate_globals.pid) {
            /* this is the one we want; it stays alive in orterun_hnp */
            orterun_hnp = hnpcandidate;
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /* Not a match: the item has already left the list, so release it
         * here or it would be leaked. */
        OBJ_RELEASE(item);
    }

 cleanup:
    /* Drop whatever candidates are still on the list. */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&hnp_list);

    if (NULL == orterun_hnp) {
        return ORTE_ERROR;
    } else {
        return exit_status;
    }
}
/*
 * Bring the tool up: OPAL utilities, command-line parsing, environment
 * set-up, ORTE initialization as a tool, output stream and listener.
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int tool_init(int argc, char *argv[])
{
    int rc;
    char *crs_env_name = NULL;

    listener_started = false;

    /*
     * The utility layer has to be up before parse_args() so that
     * installdirs is set up properly before mca_base_open() runs.
     */
    rc = opal_init_util(&argc, &argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Command-line arguments. */
    rc = parse_args(argc, argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /*
     * This tool never needs to be checkpointed or migrated itself, so
     * switch the notification routine off.  This must happen before
     * opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the "none" crs component: no checkpointer is used here. */
    (void) mca_base_var_env_name("crs", &crs_env_name);
    opal_setenv(crs_env_name,
                "none",
                true, &environ);
    free(crs_env_name);

    /* We never operate as a distributed tool: insist on ess/tool. */
    opal_setenv("OMPI_MCA_ess", "tool", true, &environ);

    /*
     * All of OPAL plus the tools portion of ORTE, so that we can talk to
     * any HNP over the wire.
     */
    rc = orte_init(&argc, &argv, ORTE_PROC_TOOL);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Output handle derived from the verbose argument. */
    if (orte_migrate_globals.verbose) {
        orte_migrate_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_migrate_globals.output, orte_migrate_globals.verbose_level);
    } else {
        orte_migrate_globals.output = 0; /* Default=STDERR */
    }

    /* Finally start the listener; its result is the result of init. */
    return start_listener();
}
/*
 * Shut the tool down: cancel the listener, then finalize ORTE.
 *
 * Both steps always run.  A failure of orte_finalize() takes precedence
 * in the return value over a failure to stop the listener.
 *
 * @return ORTE_SUCCESS when both steps succeed, otherwise the error code.
 */
static int tool_finalize(void)
{
    const int listener_rc = stop_listener();
    const int finalize_rc = orte_finalize();

    if (ORTE_SUCCESS != finalize_rc) {
        return finalize_rc;
    }
    if (ORTE_SUCCESS != listener_rc) {
        return listener_rc;
    }
    return ORTE_SUCCESS;
}
/*
 * Post a persistent, non-blocking receive for ORTE_RML_TAG_MIGRATE
 * messages from any sender; hnp_receiver() handles each one.
 *
 * @return ORTE_SUCCESS always.
 */
static int start_listener(void)
{
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                            ORTE_RML_TAG_MIGRATE,
                            ORTE_RML_PERSISTENT,
                            hnp_receiver,
                            NULL);

    /* Remember that there is a receive to cancel at shutdown. */
    listener_started = true;

    return ORTE_SUCCESS;
}
/*
 * Cancel the receive posted by start_listener().
 *
 * @return ORTE_SUCCESS when a listener was active and has been cancelled,
 *         ORTE_ERROR when no listener had been started.
 */
static int stop_listener(void)
{
    if (listener_started) {
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
        listener_started = false;
        return ORTE_SUCCESS;
    }

    return ORTE_ERROR;
}
/*
 * Receive callback for ORTE_RML_TAG_MIGRATE messages.
 *
 * Unpacks the command flag at the front of the buffer.  Status updates
 * are passed on to process_ckpt_update_cmd(), the init command is
 * ignored, and any other value is logged as out of bounds.
 *
 * @param status  Unused.
 * @param sender  Name of the sending process.
 * @param buffer  Message to unpack.
 * @param tag     Unused.
 * @param cbdata  Unused.
 */
static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata)
{
    orte_errmgr_tool_cmd_flag_t cmd;
    orte_std_cntr_t n_items = 1;
    int rc;

    opal_output_verbose(5, orte_migrate_globals.output,
                        "orte_migrate: hnp_receiver: Receive a command message.");

    /* The command flag is the first item in every message. */
    rc = opal_dss.unpack(buffer, &cmd, &n_items, ORTE_ERRMGR_MIGRATE_TOOL_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    if (ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD == cmd) {
        opal_output_verbose(10, orte_migrate_globals.output,
                            "orte_migrate: hnp_receiver: Status Update.");
        process_ckpt_update_cmd(sender, buffer);
    } else if (ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD != cmd) {
        /* Neither an update nor the (ignored) init command. */
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
    }
}
/*
 * Handle a status update from the HNP.
 *
 * Unpacks the new state into orte_migrate_ckpt_status.  For the
 * "not checkpointable", "already in progress" and "error" states the
 * matching help message is shown.  Otherwise, when status reporting is
 * on, every state except FINISH is printed with pretty_print_status().
 *
 * @param sender  Name of the sending process (unused).
 * @param buffer  Message holding the state as an OPAL_INT.
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    orte_std_cntr_t n_items = 1;
    int new_state = ORTE_ERRMGR_MIGRATE_STATE_NONE;
    const char *help_topic = NULL;

    /* Payload: a single integer carrying the state. */
    if (ORTE_SUCCESS != opal_dss.unpack(buffer, &new_state, &n_items, OPAL_INT)) {
        return;
    }
    orte_migrate_ckpt_status = new_state;

    /* States that end the request with a message for the user. */
    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT == new_state) {
        /* the job cannot be migrated at all */
        help_topic = "non-ckptable";
    } else if (ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == new_state) {
        /* another migration is running: the user has to retry later */
        help_topic = "err-inprogress";
    } else if (ORTE_ERRMGR_MIGRATE_STATE_ERROR == new_state) {
        help_topic = "err-other";
    }

    if (NULL != help_topic) {
        opal_show_help("help-orte-migrate.txt", help_topic,
                       true,
                       orte_migrate_globals.pid);
        return;
    }

    /* Status progression; the FINISH state is not printed here. */
    if (orte_migrate_globals.status &&
        ORTE_ERRMGR_MIGRATE_STATE_FINISH != new_state) {
        pretty_print_status();
    }
}
/*
 * Send the migration request to the HNP found by find_hnp().
 *
 * The request is posted on ORTE_RML_TAG_MIGRATE with a non-blocking send.
 * The global timer_start is stamped just before the request is built.
 *
 * @return ORTE_SUCCESS once the send has been posted, otherwise the error
 *         code of the step that failed.  On failure the
 *         "unable_to_connect" help message is shown.
 */
static int notify_hnp(void)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD;

    if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, orte_migrate_globals.output,
                        "orte_migrate: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_migrate_globals.pid);

    timer_start = get_time();

    /***********************************
     * Notify HNP of migrate request
     * Send (in this order):
     * - Command
     * - Off Procs
     * - Off Nodes
     * - Onto Nodes
     ***********************************/
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_procs), 1, OPAL_STRING))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_nodes), 1, OPAL_STRING))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.onto_nodes), 1, OPAL_STRING))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer,
                                                       ORTE_RML_TAG_MIGRATE, orte_rml_send_callback,
                                                       NULL))) {
        exit_status = ret;
        goto cleanup;
    }

 cleanup:
    if (ORTE_SUCCESS != exit_status) {
        /*
         * The send was never posted, so the buffer is still ours to drop.
         * After a successful post it belongs to the pending non-blocking
         * send and must not be released here (NOTE(review):
         * orte_rml_send_callback is expected to release it on completion
         * -- confirm).
         */
        if (NULL != buffer) {
            OBJ_RELEASE(buffer);
            buffer = NULL;
        }
        opal_show_help("help-orte-migrate.txt", "unable_to_connect", true,
                       orte_migrate_globals.pid);
    }

    return exit_status;
}
/***************
* Pretty Print
***************/
/*
 * Return the current time in seconds as a double.
 *
 * Uses the native OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is
 * set, and gettimeofday() otherwise.
 */
static double get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);
    seconds = now.tv_sec;                        /* whole seconds */
    seconds += (double)now.tv_usec / 1000000.0;  /* fractional part */
    return seconds;
#endif
}
/*
 * Print one status line for the current migration state.
 *
 * The line shows the time since the previous status line, the time since
 * timer_start and the state name.  timer_last is updated to the time of
 * this call.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_status(void)
{
    char *status_name = NULL;
    const double now = get_time();
    double since_last, since_start;

    /* First report: there is no previous line, so the delta is zero. */
    if (0 == timer_last) {
        timer_last = now;
    }
    since_last = now - timer_last;
    since_start = now - timer_start;

    orte_errmgr_base_migrate_state_str(&status_name, orte_migrate_ckpt_status);

    opal_output(0,
                "[%6.2f / %6.2f] %*s - ...\n",
                since_last, since_start,
                25, status_name);

    /* free(NULL) is a no-op, so no guard is needed. */
    free(status_name);

    timer_last = now;
    return ORTE_SUCCESS;
}
/*
 * Print the migration request as given on the command line: the nodes to
 * migrate off of, the ranks to migrate and the nodes to migrate onto.
 * Each comma separated option is split and printed one entry per line;
 * an option that was not given is reported with a count of 0.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_migration(void)
{
    char **loc_off_nodes = NULL;
    char **loc_off_procs = NULL;
    char **loc_onto_nodes = NULL;
    int loc_off_nodes_cnt = 0;
    int loc_off_procs_cnt = 0;
    int loc_onto_cnt = 0;
    int i;

    if (NULL != orte_migrate_globals.off_nodes) {
        loc_off_nodes = opal_argv_split(orte_migrate_globals.off_nodes, ',');
        loc_off_nodes_cnt = opal_argv_count(loc_off_nodes);
    }

    if (NULL != orte_migrate_globals.off_procs) {
        loc_off_procs = opal_argv_split(orte_migrate_globals.off_procs, ',');
        loc_off_procs_cnt = opal_argv_count(loc_off_procs);
    }

    if (NULL != orte_migrate_globals.onto_nodes) {
        loc_onto_nodes = opal_argv_split(orte_migrate_globals.onto_nodes, ',');
        loc_onto_cnt = opal_argv_count(loc_onto_nodes);
    }

    printf("Migrate Nodes: (%d nodes)\n", loc_off_nodes_cnt);
    for (i = 0; i < loc_off_nodes_cnt; ++i) {
        printf("\t\"%s\"\n", loc_off_nodes[i]);
    }

    printf("Migrate Ranks: (%d ranks)\n", loc_off_procs_cnt);
    for (i = 0; i < loc_off_procs_cnt; ++i) {
        printf("\t\"%s\"\n", loc_off_procs[i]);
    }

    printf("Migrate Onto : (%d nodes)\n", loc_onto_cnt);
    for (i = 0; i < loc_onto_cnt; ++i) {
        printf("\t\"%s\"\n", loc_onto_nodes[i]);
    }

    /*
     * The split arrays were allocated for this printout only; free them
     * so they are not leaked.  Lists that were never split are skipped.
     */
    if (NULL != loc_off_nodes) {
        opal_argv_free(loc_off_nodes);
    }
    if (NULL != loc_off_procs) {
        opal_argv_free(loc_off_procs);
    }
    if (NULL != loc_onto_nodes) {
        opal_argv_free(loc_onto_nodes);
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,51 +0,0 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Shared make rules taken from the top of the source tree.
include $(top_srcdir)/Makefile.ompi-rules
# The man page is a build product; only its .1in template is distributed.
man_pages = orte-restart.1
EXTRA_DIST = orte-restart.1in
# The tool is only built when WANT_FT_CR is set (presumably the
# checkpoint/restart configure option -- confirm in configure).
if WANT_FT_CR
# The program, its man page and its help file are only installed when
# binaries are being installed.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-restart
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_ortedata_DATA = help-orte-restart.txt
endif # OPAL_INSTALL_BINARIES
# Single-source program linked against the ORTE and OPAL libraries.
orte_restart_SOURCES = orte-restart.c
orte_restart_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
endif # WANT_FT_CR
# Remove the generated man page on distclean.
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,77 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI restart tool
#
[usage]
ompi-restart GLOBAL_SNAPSHOT_REF
Open MPI Parallel Job Restart Tool
%s
#
[usage-no-cr]
This build of Open MPI does *not* include Checkpoint/Restart functionality.
If you require this functionality re-configure Open MPI with the proper
Checkpoint/Restart options.
ompi-restart GLOBAL_SNAPSHOT_REF
Open MPI Parallel Job Restart Tool
%s
#
[invalid_filename]
Error: The filename provided (referenced below) could not be used for
restarting the job. This could be for a variety of reasons:
- The filename/handle is invalid,
- The snapshot directory no longer exists, or
- There are no stable checkpoint sequences in this global snapshot.
Please see --help for usage.
Filename: %s
#
[restart_cmd_failure]
Error: Unable to obtain the proper restart command to restart from the
checkpoint file (%s). Returned %d.
#
[comp_select_failure]
Error: Unable to select the %s component needed to restart this
application. (Returned %d)
This likely indicates that the checkpointer needed is not
available on this machine. You should move to a machine that
has this checkpointer enabled.
#
[restart_failure]
Error: The restart command:
shell$ %s
returned an error code %d, and was unable to restart properly.
#
[invalid_seq_num]
Error: The filename (%s) and sequence number (%d) could not be used.
This may be caused by an invalid sequence number. Try using the
'-i' option to determine a correct value.
#
[amca_param_not_found]
Warning: Unable to find the AMCA parameter in the checkpoint metadata.
This is the option supplied to mpirun as '-am '. Restart will
assume this value to be '%s'.
#
[tune_param_not_found]
Warning: Unable to find the TUNE parameter in the checkpoint metadata.
This is the option supplied to mpirun as '-tune '. Restart will
assume this value to be '%s'.

Просмотреть файл

@ -1,115 +0,0 @@
.\"
.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
.\" University Research and Technology
.\" Corporation. All rights reserved.
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
.\"
.\" Man page for OMPI's ompi-restart command
.\"
.\" .TH name section center-footer left-footer center-header
.TH OMPI-RESTART 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
ompi-restart, orte-restart \- Restart a previously checkpointed parallel job
using the Open PAL Checkpoint/Restart Service (CRS)
.
.PP
.
\fBNOTE:\fP \fIompi-restart\fP and \fIorte-restart\fP are exact
synonyms for each other. Using either name will result in exactly
identical behavior.
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.B ompi-restart
.B [ options ]
.B <GLOBAL SNAPSHOT HANDLE>
.
.\" **************************
.\" Options Section
.\" **************************
.SH Options
.
\fIompi-restart\fR will attempt to restart a previously checkpointed parallel
job from the global snapshot handle reference returned by \fIompi-checkpoint\fP.
.
.TP 10
.B <GLOBAL SNAPSHOT HANDLE>
The global snapshot handle reference returned by \fIompi-checkpoint\fP, used to
restart the job. This is required to be the last argument to this command.
.
.
.TP
.B -h | --help
Display help for this command
.
.
.TP
.B -p | --preload
Preload the checkpoint files on the remote systems before restarting the
application. Disabled by default.
.
.
.TP
.B --fork
Fork off a new process, which is the restarted process. By default, the
restarted process will replace \fIompi-restart\fR.
.
.
.TP
.B -s | --seq
The sequence number of the checkpoint to restart from. By default, the most
recent sequence number is used (specified by -1).
.
.
.TP
.B -hostfile | --hostfile
The hostfile from which to restart the application. Useful in unscheduled
environments. (Same behavior as --machinefile option)
.
.
.TP
.B -machinefile | --machinefile
The machinefile from which to restart the application. Useful in unscheduled
environments. (Same behavior as --hostfile option)
.
.
.TP
.B -v | --verbose
Enable verbose output for debugging.
.
.
.TP
.B -gmca | --gmca \fR<key> <value>\fP
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
the parameter name; \fI<value>\fP is the parameter value.
.
.
.TP
.B -mca | --mca <key> <value>
Send arguments to various MCA modules.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
.PP
\fIompi-restart\fR can be invoked multiple, non-overlapping times. This
allows the user to restart a previously running parallel job.
.
.
.\" **************************
.\" See Also Section
.\" **************************
.
.SH SEE ALSO
orte-ps(1), orte-clean(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
.

Просмотреть файл

@ -1,897 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Restart Tool for restarting a previously checkpointed multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <stdlib.h>
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#include <string.h>
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/error.h"
#include "opal/util/path.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"
#include "orte/mca/filem/base/base.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
/******************
* Local Functions
******************/
static int initialize(int argc, char *argv[]);
static int finalize(void);
static int parse_args(int argc, char *argv[]);
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot);
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid);
static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot);
static int snapshot_sort_compare_fn(opal_list_item_t **a,
opal_list_item_t **b);
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
/*
 * Command line state of the restart tool.
 *
 * Most fields are written by the command line parser through the pointers
 * registered in the cmd_line_opts table. parse_args() resets the structure
 * with a positional initializer, so the order of the fields below must stay
 * in sync with that initializer.
 */
typedef struct {
/* -h / --help: print the usage message and exit */
bool help;
/* Global snapshot reference taken from the tail of the command line */
char *snapshot_ref;
/* Path of the generated appfile; set by create_appfile() */
char *appfile;
/* -v / --verbose */
bool verbose;
/* --fork: exec mpirun in a forked child instead of in this process */
bool forked;
/* -s / --seq: checkpoint sequence number; -1 selects the most recent */
int seq_number;
/* hostfile / machinefile option: hostfile to use for the launch */
char *hostfile;
/* opal_output handle used for this tool's messages */
int output;
/* -i / --info: only display information about the checkpoint */
bool info_only;
/* -a / --apponly: only create the appfile, do not restart from it */
bool app_only;
/* --showme: print the command line instead of exec'ing it */
bool showme;
/* mpirun_opts option: extra options handed to mpirun */
char *mpirun_opts;
#if OPAL_ENABLE_CRDEBUG == 1
/* crdebug option: enable C/R enhanced debugging */
bool enable_crdebug;
#endif
} orte_restart_globals_t;
/* The single instance shared by every function in this file. */
orte_restart_globals_t orte_restart_globals;
/*
 * Table of the command line options understood by this tool.
 *
 * Each entry is a positional initializer. As used below, the columns are:
 * a leading NULL (meaning not visible in this file), the single-character
 * short option ('\0' when there is none), the single-dash name, the
 * double-dash long name, a count that is 0 for flags and 1 for options
 * taking a value (presumably the number of parameters -- TODO confirm),
 * the address of the orte_restart_globals field receiving the value, the
 * value type, and the help text shown in the usage message.
 *
 * The table ends with an all-NULL entry of type OPAL_CMD_LINE_TYPE_NULL.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL,
'h', NULL, "help",
0,
&orte_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL,
'v', NULL, "verbose",
0,
&orte_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL,
'\0', NULL, "fork",
0,
&orte_restart_globals.forked, OPAL_CMD_LINE_TYPE_BOOL,
"Fork off a new process which is the restarted process instead of "
"replacing orte_restart" },
{ NULL,
's', NULL, "seq",
1,
&orte_restart_globals.seq_number, OPAL_CMD_LINE_TYPE_INT,
"The sequence number of the checkpoint to start from. "
"(Default: -1, or most recent)" },
/* hostfile and machinefile are aliases: both store into .hostfile */
{ NULL,
'\0', "hostfile", "hostfile",
1,
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile to use for launch" },
{ NULL,
'\0', "machinefile", "machinefile",
1,
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile to use for launch" },
{ NULL,
'i', NULL, "info",
0,
&orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display information about the checkpoint" },
{ NULL,
'a', NULL, "apponly",
0,
&orte_restart_globals.app_only, OPAL_CMD_LINE_TYPE_BOOL,
"Only create the app context file, do not restart from it" },
{ NULL,
'\0', NULL, "showme",
0,
&orte_restart_globals.showme, OPAL_CMD_LINE_TYPE_BOOL,
"Display the full command line that would have been exec'ed." },
{ NULL,
'\0', "mpirun_opts", "mpirun_opts",
1,
&orte_restart_globals.mpirun_opts, OPAL_CMD_LINE_TYPE_STRING,
"Command line options to pass directly to mpirun (be sure to quote long strings, and escape internal quotes)" },
#if OPAL_ENABLE_CRDEBUG == 1
/* Only available when the build enables C/R debugging support */
{ NULL,
'\0', "crdebug", "crdebug",
0,
&orte_restart_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
"Enable C/R Enhanced Debugging" },
#endif
/* End of list */
{ NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/**
 * Entry point of the restart tool.
 *
 * Sequence: initialize the tool, split the snapshot reference into a base
 * directory and a handle, request a restart handle from the SStore
 * framework, then either print checkpoint information (--info), only write
 * the appfile (--apponly), or hand the appfile to mpirun through
 * spawn_children().
 *
 * @return the status of finalize() when it fails, otherwise the status of
 *         the first step that failed, or ORTE_SUCCESS.
 */
int
main(int argc, char *argv[])
{
    int rc;
    int exit_status = ORTE_SUCCESS;
    pid_t child_pid = 0;
    orte_sstore_base_global_snapshot_info_t *snapshot = NULL;
    char *basedir = NULL;
    char *tmp_str = NULL;

    /* Bring up the runtime and parse the command line */
    rc = initialize(argc, argv);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto out;
    }

    snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);

    /*
     * A reference that is an absolute path, or that contains a directory
     * separator, is split into its directory (used as the snapshot base
     * location) and its last component (the actual handle). Otherwise
     * basedir stays NULL and the MCA parameter decides the base location.
     */
    if (opal_path_is_absolute(orte_restart_globals.snapshot_ref) ||
        NULL != strchr(orte_restart_globals.snapshot_ref, '/')) {
        basedir = opal_dirname(orte_restart_globals.snapshot_ref);
        tmp_str = opal_basename(orte_restart_globals.snapshot_ref);
        free(orte_restart_globals.snapshot_ref);
        orte_restart_globals.snapshot_ref = strdup(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    }

    /*
     * A sequence number of -1 selects the largest available one; any other
     * value selects exactly that sequence number if it is available.
     */
    rc = orte_sstore.request_restart_handle(&(snapshot->ss_handle),
                                            basedir,
                                            orte_restart_globals.snapshot_ref,
                                            orte_restart_globals.seq_number,
                                            snapshot);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-restart.txt", "invalid_filename", true,
                       orte_restart_globals.snapshot_ref);
        exit_status = rc;
        goto out;
    }
    orte_restart_globals.seq_number = snapshot->seq_num;

    /* --info: report on the checkpoint and stop */
    if (orte_restart_globals.info_only) {
        exit_status = snapshot_info(snapshot);
        goto out;
    }

    /* Write the appfile that mpirun/orterun will consume */
    rc = create_appfile(snapshot);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto out;
    }

    /* --apponly: the appfile is all the user asked for */
    if (orte_restart_globals.app_only) {
        printf("Created Appfile:\n\t%s\n", orte_restart_globals.appfile);
        exit_status = ORTE_SUCCESS;
        goto out;
    }

    /* Restart through mpirun/orterun */
    if (orte_restart_globals.verbose) {
        opal_output_verbose(10, orte_restart_globals.output,
                            "Restarting from file (%s)",
                            orte_restart_globals.snapshot_ref);
        if (!orte_restart_globals.forked) {
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t Exec in self");
        } else {
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t Forking off a child");
        }
    }

    rc = spawn_children(snapshot, &child_pid);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true,
                       orte_restart_globals.snapshot_ref, rc);
        exit_status = rc;
        goto out;
    }

 out:
    /* free(NULL) is a no-op, so the strings need no guard */
    free(basedir);
    basedir = NULL;
    free(tmp_str);
    tmp_str = NULL;

    if (NULL != snapshot) {
        OBJ_RELEASE(snapshot);
        snapshot = NULL;
    }

    rc = finalize();
    if (OPAL_SUCCESS != rc) {
        return rc;
    }
    return exit_status;
}
/**
 * Initialize the tool: OPAL utilities, command line, output stream and the
 * ORTE runtime (as a tool process).
 *
 * @param argc  argument count as passed to main()
 * @param argv  argument vector as passed to main()
 *
 * @return ORTE_SUCCESS, or the error returned by opal_init_util(),
 *         parse_args() or orte_init().
 */
static int initialize(int argc, char *argv[]) {
    int ret, exit_status = ORTE_SUCCESS;
    char *tmp_env_var = NULL;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        return ret;
    }

    /* Parse command line arguments */
    if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /* Setup OPAL Output handle from the verbose argument */
    if (orte_restart_globals.verbose) {
        orte_restart_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_restart_globals.output, 10);
    } else {
        orte_restart_globals.output = 0; /* Default=STDERR */
    }

    /* Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the none component, since we don't actually use a checkpointer */
    (void) mca_base_var_env_name("crs", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "none",
                true, &environ);
    /* Keep the variable name: it is needed to unset the variable below */

    /* we are never allowed to operate as a distributed tool,
     * so insist on the ess/tool component */
    opal_setenv("OMPI_MCA_ess", "tool", true, &environ);

    /* Setup any ORTE stuff we might need */
    if (OPAL_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
        exit_status = ret;
        goto cleanup;
    }

    /* Unset these now that we no longer need them */
    opal_unsetenv(tmp_env_var, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
    opal_unsetenv(tmp_env_var, &environ);

 cleanup:
    /* Released on every path, including an orte_init() failure */
    free(tmp_env_var);
    tmp_env_var = NULL;

    return exit_status;
}
/**
 * Shut down the ORTE runtime.
 *
 * @return the error reported by orte_finalize(), or ORTE_SUCCESS.
 */
static int finalize(void)
{
    int rc = orte_finalize();

    return (OPAL_SUCCESS != rc) ? rc : ORTE_SUCCESS;
}
static int parse_args(int argc, char *argv[])
{
int i, ret, len;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
char *argv0 = NULL;
orte_restart_globals_t tmp = { false, /* help */
NULL, /* filename */
NULL, /* appfile */
false, /* verbose */
false, /* forked */
-1, /* seq_number */
NULL, /* hostfile */
-1, /* output*/
false, /* info only */
false, /* app only */
false, /* showme */
NULL}; /* mpirun_opts */
orte_restart_globals = tmp;
#if OPAL_ENABLE_CRDEBUG == 1
orte_restart_globals.enable_crdebug = false;
#endif
#if OPAL_ENABLE_FT_CR == 0
/* Warn and exit if not configured with Checkpoint/Restart */
{
char *str, *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orte-restart.txt", "usage-no-cr",
true, args);
if (NULL != str) {
printf("%s", str);
free(str);
}
free(args);
return ORTE_ERROR;
}
#endif
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
if (OPAL_SUCCESS != ret) {
if (OPAL_ERR_SILENT != ret) {
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
opal_strerror(ret));
}
return 1;
}
if (orte_restart_globals.help) {
char *str, *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orte-restart.txt", "usage", true,
args);
if (NULL != str) {
printf("%s", str);
free(str);
}
free(args);
/* If we show the help message, that should be all we do */
exit(0);
}
/**
* Put all of the MCA arguments in the environment
*/
mca_base_cmd_line_process_args(argv, &app_env, &global_env);
len = opal_argv_count(app_env);
for(i = 0; i < len; ++i) {
putenv(app_env[i]);
}
len = opal_argv_count(global_env);
for(i = 0; i < len; ++i) {
putenv(global_env[i]);
}
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
*/
/* get the remaining bits */
argv0 = strdup(argv[0]);
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
if (0 == argc) {
fprintf(stderr, "%s: Nothing to do\n", argv0);
fprintf(stderr, "Type '%s --help' for usge.\n", argv0);
free(argv0);
return ORTE_ERROR;
}
free(argv0);
orte_restart_globals.snapshot_ref = strdup(argv[0]);
if ( NULL == orte_restart_globals.snapshot_ref ||
0 >= strlen(orte_restart_globals.snapshot_ref) ) {
opal_show_help("help-orte-restart.txt", "invalid_filename", true,
"<none provided>");
return ORTE_ERROR;
}
/* If we have arguments after the command, then assume they
* need to be grouped together.
*/
if(argc > 1) {
orte_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
}
return ORTE_SUCCESS;
}
/**
 * Write the appfile that mpirun uses to restart the job.
 *
 * The file is created as "restart-appfile" inside the absolute global
 * snapshot location and its path is stored in orte_restart_globals.appfile.
 * It receives one "-np 1 ... opal-restart ..." line per local snapshot,
 * ordered by vpid.
 *
 * @param snapshot  global snapshot whose local snapshots are listed
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR when the appfile path cannot be
 *         built, the file cannot be written, or required snapshot metadata
 *         is missing.
 */
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int exit_status = ORTE_SUCCESS;
    FILE *appfile = NULL;
    opal_list_item_t *item = NULL;
    char *tmp_str = NULL;
    char *amca_param = NULL;
    char *tune_param = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *ref_location_fmt_str = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Create the appfile
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS,
                         &tmp_str);
    if (NULL == tmp_str) {
        /* Without the snapshot location there is nowhere to put the file */
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    if (0 > asprintf(&orte_restart_globals.appfile, "%s/restart-appfile",
                     tmp_str)) {
        /* The output pointer is unspecified after a failed asprintf() */
        orte_restart_globals.appfile = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    free(tmp_str);
    tmp_str = NULL;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_TUNE_PARAM,
                         &tune_param);

    if (NULL == (appfile = fopen(orte_restart_globals.appfile, "w"))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* This will give a format string that we can use */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);

    /* Both strings are dereferenced by fprintf() below */
    if (NULL == reference_fmt_str || NULL == location_str) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Sort the snapshots so that they are in order
     */
    opal_list_sort(&snapshot->local_snapshots, snapshot_sort_compare_fn);

    /*
     * Construct the appfile
     */
    for (item = opal_list_get_first(&snapshot->local_snapshots);
         item != opal_list_get_end(&snapshot->local_snapshots);
         item = opal_list_get_next(item)) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t *) item;

        fprintf(appfile, "#\n");
        fprintf(appfile, "# Old Process Name: %u.%u\n",
                vpid_snapshot->process_name.jobid,
                vpid_snapshot->process_name.vpid);
        fprintf(appfile, "#\n");
        fprintf(appfile, "-np 1 ");

        fprintf(appfile, "--sstore-load ");
        /* loc:ref:postfix:seq */
        fprintf(appfile, "%s:%s:",
                location_str,
                orte_restart_globals.snapshot_ref);
        /* NOTE(review): reference_fmt_str comes from the snapshot metadata
         * and is used as a printf format taking the vpid; it is presumably
         * trusted input -- confirm against the SStore implementation. */
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);
        fprintf(appfile, ":%s:%s:%d ",
                (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                orte_restart_globals.seq_number);

        /* The defaults are filled in (and warned about) on first use only */
        if (NULL == amca_param) {
            amca_param = strdup("ft-enable-cr");
            opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                           amca_param);
        }
        fprintf(appfile, "-am %s ", amca_param);

        if (NULL == tune_param) {
            tune_param = strdup("ft-enable-cr");
            opal_show_help("help-orte-restart.txt", "tune_param_not_found", true,
                           tune_param);
        }
        fprintf(appfile, "-tune %s ", tune_param);

        fprintf(appfile, " opal-restart ");

        /*
         * By default, point to the central storage location of the checkpoint.
         * The active SStore module at restart time will determine if files
         * need to be preloaded.
         */
        fprintf(appfile, "-l %s", location_str);
        fprintf(appfile, " -m %s ", orte_sstore_base_local_metadata_filename);

        fprintf(appfile, "-r ");
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);
        fprintf(appfile, "\n");
    }

 cleanup:
    if (NULL != appfile) {
        /* fclose() flushes: a failure means the appfile is incomplete */
        if (0 != fclose(appfile) && ORTE_SUCCESS == exit_status) {
            exit_status = ORTE_ERROR;
        }
        appfile = NULL;
    }

    free(tmp_str);
    free(amca_param);
    free(tune_param);
    free(location_str);
    free(reference_fmt_str);
    free(ref_location_fmt_str);

    return exit_status;
}
/**
 * Build the mpirun command line for the restart and run it.
 *
 * The command is "mpirun -am <amca> -tune <tune> [--default-hostfile <file>]
 * [<mpirun_opts>] [--crdebug] --app <appfile>". With --showme the command is
 * only printed. With --fork it is exec'ed in a forked child and the parent
 * returns; otherwise the runtime is finalized and the command replaces this
 * process.
 *
 * @param snapshot   global snapshot providing the AMCA / TUNE parameters
 * @param child_pid  receives the result of fork() when --fork is used
 *
 * @return ORTE_SUCCESS, the error of a failed argv append, or the (negative)
 *         result of a failed fork()/execvp().
 */
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *amca_param = NULL;
    char *tune_param = NULL;
    char **argv = NULL;
    int argc = 0, i;
    int status;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_TUNE_PARAM,
                         &tune_param);

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "mpirun"))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "-am"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (NULL == amca_param) {
        /* Not recorded in the metadata: warn and fall back to the default */
        amca_param = strdup("ft-enable-cr");
        opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                       amca_param);
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, amca_param))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "-tune"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (NULL == tune_param) {
        /* Not recorded in the metadata: warn and fall back to the default */
        tune_param = strdup("ft-enable-cr");
        opal_show_help("help-orte-restart.txt", "tune_param_not_found", true,
                       tune_param);
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, tune_param))) {
        exit_status = ret;
        goto cleanup;
    }

    if (NULL != orte_restart_globals.hostfile) {
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--default-hostfile"))) {
            exit_status = ret;
            goto cleanup;
        }
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.hostfile))) {
            exit_status = ret;
            goto cleanup;
        }
    }

    if (orte_restart_globals.mpirun_opts) {
        /* The whole option string is appended as one argv element */
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.mpirun_opts))) {
            exit_status = ret;
            goto cleanup;
        }
    }

#if OPAL_ENABLE_CRDEBUG == 1
    if (orte_restart_globals.enable_crdebug) {
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--crdebug"))) {
            exit_status = ret;
            goto cleanup;
        }
    }
#endif

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--app"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.appfile))) {
        exit_status = ret;
        goto cleanup;
    }

    /* --showme: print the command instead of running it */
    if (orte_restart_globals.showme) {
        for (i = 0; i < argc; ++i) {
            printf("%s ", argv[i]);
        }
        printf("\n");
        /* Leave through cleanup so the argument vector is released */
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /* To fork off a child */
    if (orte_restart_globals.forked) {
        *child_pid = fork();

        if (0 == *child_pid) {
            /* Child Process: execvp() only returns on failure */
            status = execvp(argv[0], argv);
            if (0 > status) {
                opal_output(orte_restart_globals.output,
                            "orte_restart: execv failed with status = %d\n",
                            status);
            }
            /* NOTE(review): on failure the child returns to the caller and
             * runs the same cleanup path as the parent -- confirm that this
             * is intended. */
            exit_status = status;
            goto cleanup;
        }
        else if (0 < *child_pid) {
            /* Parent is done once it is started */
            ;
        }
        else {
            opal_output(orte_restart_globals.output,
                        "orte_restart: fork failed: This should never happen!");
            /* Fork failed :( */
            exit_status = *child_pid;
            goto cleanup;
        }
    }
    /* ... or not to fork off a child */
    else {
        /* Make sure to finalize so we don't leave our session directory */
        orte_finalize();

        status = execvp(argv[0], argv);
        if (0 > status) {
            /* The runtime is already finalized, so report through stdio */
            fprintf(stderr, "orte_restart: execvp of %s failed: %s\n",
                    argv[0], strerror(errno));
        }
        exit_status = status;
        goto cleanup;
    }

    opal_output_verbose(10, orte_restart_globals.output,
                        "orte_restart: Restarted Child with PID = %d\n",
                        (int) *child_pid);

 cleanup:
    free(amca_param);
    free(tune_param);
    if (NULL != argv) {
        opal_argv_free(argv);
    }

    return exit_status;
}
/**
 * Print information about the checkpoint sequences of a global snapshot.
 *
 * When no sequence number was requested (seq_number < 0) the number of
 * sequences is printed and every sequence is described; otherwise only the
 * requested sequence is described. Per-process details are emitted at
 * verbosity level 10.
 *
 * Note: snapshot->seq_num is overwritten while iterating.
 *
 * @param snapshot  global snapshot to describe
 *
 * @return ORTE_SUCCESS, or the error returned by
 *         orte_sstore_base_extract_global_metadata().
 */
static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    int num_seqs = 0, processes, i;
    char **snapshot_ref_seqs = NULL;
    opal_list_item_t *item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *tmp_str = NULL;

    /*
     * Find all sequence numbers
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        num_seqs = atoi(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    }

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        snapshot_ref_seqs = opal_argv_split(tmp_str, ',');
        free(tmp_str);
        tmp_str = NULL;
    }

    if (0 > orte_restart_globals.seq_number) {
        opal_output(orte_restart_globals.output,
                    "Sequences: %d\n",
                    num_seqs);
    }

    /*
     * Stop at the end of the split list as well as at num_seqs, so that a
     * count larger than the list cannot index past the array.
     */
    for (i = 0;
         NULL != snapshot_ref_seqs && i < num_seqs && NULL != snapshot_ref_seqs[i];
         ++i) {
        snapshot->seq_num = atoi(snapshot_ref_seqs[i]);

        /* A specific sequence was requested: skip all the others */
        if (0 <= orte_restart_globals.seq_number &&
            snapshot->seq_num != orte_restart_globals.seq_number) {
            continue;
        }

        if (ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata(snapshot))) {
            exit_status = ret;
            goto cleanup;
        }

        opal_output(orte_restart_globals.output,
                    "Seq: %d\n",
                    snapshot->seq_num);

        if (NULL != snapshot->start_time) {
            opal_output(orte_restart_globals.output,
                        "\tBegin Timestamp: %s\n",
                        snapshot->start_time);
        }
        if (NULL != snapshot->end_time) {
            opal_output(orte_restart_globals.output,
                        "\tEnd Timestamp : %s\n",
                        snapshot->end_time);
        }

        processes = (int) opal_list_get_size(&snapshot->local_snapshots);
        opal_output(orte_restart_globals.output,
                    "\tProcesses: %d\n",
                    processes);

        for (item = opal_list_get_first(&snapshot->local_snapshots);
             item != opal_list_get_end(&snapshot->local_snapshots);
             item = opal_list_get_next(item)) {
            vpid_snapshot = (orte_sstore_base_local_snapshot_info_t *) item;

            /* Never hand a NULL pointer to a %s conversion */
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t\tProcess: %u.%u \t CRS: %s \t Compress: %s (%s)",
                                vpid_snapshot->process_name.jobid,
                                vpid_snapshot->process_name.vpid,
                                (vpid_snapshot->crs_comp == NULL ? "" : vpid_snapshot->crs_comp),
                                (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                                (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix));
        }
    }

 cleanup:
    return exit_status;
}
/**
 * Ordering callback for opal_list_sort(): orders local snapshots by
 * ascending vpid.
 *
 * @param a  address of the first list item
 * @param b  address of the second list item
 *
 * @return 1 when a's vpid is larger than b's, 0 when both are equal,
 *         -1 otherwise.
 */
static int snapshot_sort_compare_fn(opal_list_item_t **a,
                                    opal_list_item_t **b)
{
    orte_sstore_base_local_snapshot_info_t *lhs =
        (orte_sstore_base_local_snapshot_info_t *) (*a);
    orte_sstore_base_local_snapshot_info_t *rhs =
        (orte_sstore_base_local_snapshot_info_t *) (*b);

    if (lhs->process_name.vpid == rhs->process_name.vpid) {
        return 0;
    }

    return (lhs->process_name.vpid > rhs->process_name.vpid) ? 1 : -1;
}