Коммит
08c93091f7
@ -25,12 +25,9 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
|
||||
orte/tools/wrappers/Makefile
|
||||
orte/tools/wrappers/ortecc-wrapper-data.txt
|
||||
orte/tools/wrappers/orte.pc
|
||||
orte/tools/orte-checkpoint/Makefile
|
||||
orte/tools/orte-restart/Makefile
|
||||
orte/tools/orte-ps/Makefile
|
||||
orte/tools/orte-clean/Makefile
|
||||
orte/tools/orte-top/Makefile
|
||||
orte/tools/orte-migrate/Makefile
|
||||
orte/tools/orte-info/Makefile
|
||||
orte/tools/orte-server/Makefile
|
||||
orte/tools/orte-dvm/Makefile
|
||||
|
@ -84,24 +84,6 @@ $(top_builddir)/orte/tools/orte-clean/orte-clean.1:
|
||||
ompi-clean.1: $(top_builddir)/orte/tools/orte-clean/orte-clean.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-clean/orte-clean.1 ompi-clean.1
|
||||
|
||||
$(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1:
|
||||
(cd $(top_builddir)/orte/tools/orte-checkpoint && $(MAKE) $(AM_MAKEFLAGS) orte-checkpoint.1)
|
||||
|
||||
ompi-checkpoint.1: $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 ompi-checkpoint.1
|
||||
|
||||
$(top_builddir)/orte/tools/orte-restart/orte-restart.1:
|
||||
(cd $(top_builddir)/orte/tools/orte-restart && $(MAKE) $(AM_MAKEFLAGS) orte-restart.1)
|
||||
|
||||
ompi-restart.1: $(top_builddir)/orte/tools/orte-restart/orte-restart.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-restart/orte-restart.1 ompi-restart.1
|
||||
|
||||
$(top_builddir)/orte/tools/orte-migrate/orte-migrate.1:
|
||||
(cd $(top_builddir)/orte/tools/orte-migrate && $(MAKE) $(AM_MAKEFLAGS) orte-migrate.1)
|
||||
|
||||
ompi-migrate.1: $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1 ompi-migrate.1
|
||||
|
||||
$(top_builddir)/orte/tools/orte-top/orte-top.1:
|
||||
(cd $(top_builddir)/orte/tools/orte-top && $(MAKE) $(AM_MAKEFLAGS) orte-top.1)
|
||||
|
||||
|
@ -25,29 +25,23 @@
|
||||
# orte/Makefile.am
|
||||
|
||||
SUBDIRS += \
|
||||
tools/orte-checkpoint \
|
||||
tools/orte-clean \
|
||||
tools/orte-ps \
|
||||
tools/orte-restart \
|
||||
tools/orted \
|
||||
tools/orterun \
|
||||
tools/wrappers \
|
||||
tools/orte-top \
|
||||
tools/orte-info \
|
||||
tools/orte-migrate \
|
||||
tools/orte-server
|
||||
|
||||
DIST_SUBDIRS += \
|
||||
tools/orte-checkpoint \
|
||||
tools/orte-clean \
|
||||
tools/orte-ps \
|
||||
tools/orte-restart \
|
||||
tools/orted \
|
||||
tools/orterun \
|
||||
tools/wrappers \
|
||||
tools/orte-top \
|
||||
tools/orte-info \
|
||||
tools/orte-migrate \
|
||||
tools/orte-server \
|
||||
tools/orte-dvm \
|
||||
tools/prun
|
||||
|
@ -1,51 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
include $(top_srcdir)/Makefile.ompi-rules
|
||||
|
||||
man_pages = orte-checkpoint.1
|
||||
EXTRA_DIST = orte-checkpoint.1in
|
||||
|
||||
if WANT_FT_CR
|
||||
if OPAL_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = orte-checkpoint
|
||||
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
||||
# changes; a "good enough" way to know if configure was run again (and
|
||||
# therefore the release date or version may have changed)
|
||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||
|
||||
dist_ortedata_DATA = help-orte-checkpoint.txt
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
orte_checkpoint_SOURCES = orte-checkpoint.c
|
||||
orte_checkpoint_LDADD = \
|
||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||
|
||||
endif # WANT_FT_CR
|
||||
|
||||
distclean-local:
|
||||
rm -f $(man_pages)
|
@ -1,113 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI checkpoint tool
|
||||
#
|
||||
[usage]
|
||||
ompi-checkpoint PID_OF_MPIRUN
|
||||
Open MPI Checkpoint Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[usage-no-cr]
|
||||
This build of Open MPI does *not* include Checkpoint/Restart functionality.
|
||||
If you require this functionality re-configure Open MPI with the proper
|
||||
Checkpoint/Restart options.
|
||||
|
||||
ompi-checkpoint PID_OF_MPIRUN
|
||||
Open MPI Checkpoint Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[invalid_pid]
|
||||
Error: The PID (%d) is invalid because either you have not provided a PID
|
||||
or provided an invalid PID.
|
||||
Please see --help for usage.
|
||||
#
|
||||
[ckpt_failure]
|
||||
Error: The application (PID = %d) failed to checkpoint properly.
|
||||
Returned %d.
|
||||
#
|
||||
[pid_does_not_exist]
|
||||
Error: The process with PID %d is not checkpointable.
|
||||
This could be due to one of the following:
|
||||
- An application with this PID doesn't currently exist
|
||||
- The application with this PID isn't checkpointable
|
||||
- The application with this PID isn't an Open MPI application.
|
||||
We were looking for the named file:
|
||||
%s
|
||||
#
|
||||
[no_hnps]
|
||||
Error: Unable to find a list of active MPIRUN processes on this machine.
|
||||
This could be due to one of the following:
|
||||
- The PID specified (%d) is not that of an active MPIRUN.
|
||||
- The session directory location could not be found/parsed.
|
||||
|
||||
ompi-checkpoint attempted to find the session directory:
|
||||
%s/%s
|
||||
Check to make sure that this directory exists while the MPIRUN
|
||||
process is running.
|
||||
|
||||
Return Code: %d (%s)
|
||||
#
|
||||
[no_universe]
|
||||
Error: Unable to find the requested, active MPIRUN process on this machine.
|
||||
This could be due to one of the following:
|
||||
- The jobid specified by the '--hnp-jobid' option is not
|
||||
correct.
|
||||
- The PID specified (%d) is not that of an active MPIRUN.
|
||||
- The application with this PID is not checkpointable
|
||||
- The application with this PID is not an Open MPI application.
|
||||
- The session directory location could not be parsed.
|
||||
|
||||
ompi-checkpoint attempted to use the session directory:
|
||||
%s/%s
|
||||
#
|
||||
[unable_to_connect]
|
||||
Error: Unable to connect to the Head Node Process to initiate the
|
||||
checkpoint of the application.
|
||||
This could be due to one of the following:
|
||||
- The jobid specified by the '--hnp-jobid' option is not
|
||||
correct.
|
||||
- The PID is not that of an active MPIRUN.
|
||||
- The application with this PID isn't checkpointable
|
||||
- The application with this PID isn't an Open MPI application.
|
||||
#
|
||||
[non-ckptable]
|
||||
Error: The job with pid %d is not checkpointable.
|
||||
This could be caused by one of the following:
|
||||
- The application is using unsupported components.
|
||||
- Your application did not select to be checkpointable
|
||||
To enable checkpointing in an application use the following AMCA parameter
|
||||
argument to mpirun:
|
||||
-am ft-enable-cr
|
||||
#
|
||||
[not_impl]
|
||||
The following feature was requested, but is not currently implemented.
|
||||
%s
|
||||
If you require this feature contact the Open MPI development group.
|
||||
|
||||
[pid_not_found]
|
||||
Error: The process with PID %d is not checkpointable.
|
||||
This could be due to one of the following:
|
||||
- An application with this PID doesn't currently exist
|
||||
- The application with this PID isn't an Open MPI application.
|
||||
#
|
||||
[hnp_not_found]
|
||||
Error: The jobid specified by the '--hnp-jobid' option does not exist.
|
@ -1,103 +0,0 @@
|
||||
.\"
|
||||
.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
.\" University Research and Technology
|
||||
.\" Corporation. All rights reserved.
|
||||
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
.\"
|
||||
.\" Man page for OMPI's ompi-checkpoint command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH OMPI-CHECKPOINT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
ompi-checkpoint, orte-checkpoint \- Checkpoint a running parallel process using the Open MPI
|
||||
Checkpoint/Restart Service (CRS)
|
||||
.
|
||||
.PP
|
||||
.
|
||||
\fBNOTE:\fP \fIompi-checkpoint\fP and \fIorte-checkpoint\fP are exact
|
||||
synonyms for each other. Using any of the names will result in exactly
|
||||
identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.B ompi-checkpoint
|
||||
.B [ options ]
|
||||
.B <PID_OF_MPIRUN>
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH Options
|
||||
.
|
||||
\fIorte-checkpoint\fR will attempt to notify a running parallel job (identified
|
||||
by \fImpirun\fP) that it has been requested that the job checkpoint itself. A
|
||||
global snapshot handle reference is presented to the user, which is used in
|
||||
\fIompi-restart\fP to restart the job.
|
||||
.
|
||||
.TP 10
|
||||
.B <PID_OF_MPIRUN>
|
||||
Process ID of the \fImpirun\fP process.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h | --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -w | --nowait
|
||||
Do not wait for the application to finish checkpointing before returning.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -s | --status
|
||||
Display status messages regarding the progression of the checkpoint request.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --term
|
||||
After checkpointing the running job, terminate it.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v | --verbose
|
||||
Enable verbose output for debugging.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca | --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca | --mca <key> <value>
|
||||
Send arguments to various MCA modules.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
.PP
|
||||
\fIorte-checkpoint\fR can be invoked multiple, non-overlapping times.
|
||||
It is convenient to note that the user does not need to specify
|
||||
the checkpointer to be used here, as that is determined completely by each of
|
||||
the running processes in the job being checkpointed.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH SEE ALSO
|
||||
orte-ps(1), orte-clean(1), ompi-restart(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
|
||||
.
|
@ -1,985 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* ORTE Checkpoint Tool for checkpointing a multiprocess job
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h> /* for mkfifo */
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/util/hnp_contact.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/sstore.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
/******************
|
||||
* Local Functions
|
||||
******************/
|
||||
static int ckpt_init(int argc, char *argv[]); /* Initalization routine */
|
||||
static int ckpt_finalize(void); /* Finalization routine */
|
||||
static int parse_args(int argc, char *argv[]);
|
||||
static int find_hnp(void);
|
||||
|
||||
static int start_listener(void);
|
||||
static int stop_listener(void);
|
||||
static void hnp_receiver(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
static void process_ckpt_update_cmd(orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer);
|
||||
|
||||
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options);
|
||||
static int pretty_print_status(void);
|
||||
static int pretty_print_reference(void);
|
||||
|
||||
static int list_all_snapshots(void);
|
||||
|
||||
static orte_hnp_contact_t *orterun_hnp = NULL;
|
||||
static char * global_snapshot_handle = NULL;
|
||||
static int global_sequence_num = 0;
|
||||
|
||||
/*****************************************
|
||||
* Global Vars for Command line Arguments
|
||||
*****************************************/
|
||||
static bool listener_started = false;
|
||||
static bool is_checkpoint_finished = false;
|
||||
static bool is_checkpoint_established = false;
|
||||
static bool is_checkpoint_recovered = false;
|
||||
|
||||
static double timer_start = 0;
|
||||
static double timer_last = 0;
|
||||
static double get_time(void);
|
||||
|
||||
typedef struct {
|
||||
bool help;
|
||||
int pid;
|
||||
opal_crs_base_ckpt_options_t *options;
|
||||
bool term;
|
||||
bool stop;
|
||||
bool verbose;
|
||||
int verbose_level;
|
||||
orte_jobid_t req_hnp; /**< User Requested HNP */
|
||||
bool nowait; /* Do not wait for checkpoint to complete before returning */
|
||||
bool status; /* Display status messages while checkpoint is progressing */
|
||||
int output;
|
||||
int ckpt_status;
|
||||
bool list_only; /* List available checkpoints only */
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
bool enable_crdebug; /* Enable C/R Debugging */
|
||||
bool attach_debugger;
|
||||
bool detach_debugger;
|
||||
#endif
|
||||
} orte_checkpoint_globals_t;
|
||||
|
||||
orte_checkpoint_globals_t orte_checkpoint_globals;
|
||||
|
||||
opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
{ NULL,
|
||||
'h', NULL, "help",
|
||||
0,
|
||||
&orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
|
||||
{ NULL,
|
||||
'v', NULL, "verbose",
|
||||
0,
|
||||
&orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Verbose" },
|
||||
|
||||
{ NULL,
|
||||
'V', NULL, NULL,
|
||||
1,
|
||||
&orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Set the verbosity level (For additional debugging information)" },
|
||||
|
||||
{ NULL,
|
||||
'\0', NULL, "term",
|
||||
0,
|
||||
&(orte_checkpoint_globals.term), OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Terminate the application after checkpoint (Cannot be used with --stop)" },
|
||||
|
||||
{ NULL,
|
||||
'\0', NULL, "stop",
|
||||
0,
|
||||
&(orte_checkpoint_globals.stop), OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Send SIGSTOP to application just after checkpoint (checkpoint will not finish until SIGCONT is sent) (Cannot be used with --term)" },
|
||||
|
||||
{ NULL,
|
||||
'w', NULL, "nowait",
|
||||
0,
|
||||
&orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not wait for the application to finish checkpointing before returning" },
|
||||
|
||||
{ NULL,
|
||||
's', NULL, "status",
|
||||
0,
|
||||
&orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display status messages describing the progression of the checkpoint" },
|
||||
|
||||
{ "hnp-jobid",
|
||||
'\0', NULL, "hnp-jobid",
|
||||
1,
|
||||
&orte_checkpoint_globals.req_hnp, OPAL_CMD_LINE_TYPE_INT,
|
||||
"This should be the jobid of the HNP whose applications you wish "
|
||||
"to checkpoint." },
|
||||
|
||||
{ "hnp-pid",
|
||||
'\0', NULL, "hnp-pid",
|
||||
1,
|
||||
&orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||
"This should be the pid of the mpirun whose applications you wish "
|
||||
"to checkpoint." },
|
||||
|
||||
{ NULL,
|
||||
'l', NULL, "list",
|
||||
0,
|
||||
&orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display a list of checkpoint files available on this machine" },
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
{ NULL,
|
||||
'\0', "crdebug", "crdebug",
|
||||
0,
|
||||
&orte_checkpoint_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable C/R Enhanced Debugging" },
|
||||
|
||||
{ NULL,
|
||||
'\0', "attach", "attach",
|
||||
0,
|
||||
&(orte_checkpoint_globals.attach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Wait for the debugger to attach directly after taking the checkpoint." },
|
||||
|
||||
{ NULL,
|
||||
'\0', "detach", "detach",
|
||||
0,
|
||||
&(orte_checkpoint_globals.detach_debugger), OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not wait for the debugger to reattach after taking the checkpoint." },
|
||||
#endif
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||
NULL }
|
||||
};
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
***************/
|
||||
if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*************************************
|
||||
* Listing only Checkpoint References
|
||||
*************************************/
|
||||
if( orte_checkpoint_globals.list_only ) {
|
||||
if (ORTE_SUCCESS != (ret = list_all_snapshots())) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/***************************
|
||||
* Find the HNP that we want to connect to, if it exists
|
||||
***************************/
|
||||
if (ORTE_SUCCESS != (ret = find_hnp())) {
|
||||
/* Error printed by called function */
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*******************************
|
||||
* Checkpoint the requested PID
|
||||
*******************************/
|
||||
is_checkpoint_finished = false;
|
||||
is_checkpoint_recovered = false;
|
||||
is_checkpoint_established = false;
|
||||
|
||||
if( orte_checkpoint_globals.verbose ) {
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"orte_checkpoint: Checkpointing...");
|
||||
if (0 < orte_checkpoint_globals.pid) {
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"\t PID %d",
|
||||
orte_checkpoint_globals.pid);
|
||||
} else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"\t Mpirun (%s)",
|
||||
ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
|
||||
}
|
||||
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"\t Connected to Mpirun %s",
|
||||
ORTE_NAME_PRINT(&orterun_hnp->name));
|
||||
|
||||
if(orte_checkpoint_globals.options->term) {
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"\t Terminating after checkpoint\n");
|
||||
}
|
||||
if(orte_checkpoint_globals.options->stop) {
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"\t Stopping after checkpoint\n");
|
||||
}
|
||||
}
|
||||
|
||||
if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.options)) ) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
|
||||
orte_checkpoint_globals.pid, ret);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for the checkpoint to complete
|
||||
*/
|
||||
if(!orte_checkpoint_globals.nowait) {
|
||||
while( !is_checkpoint_finished ) {
|
||||
opal_progress();
|
||||
}
|
||||
}
|
||||
|
||||
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(!orte_checkpoint_globals.nowait) {
|
||||
pretty_print_reference();
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/***************
|
||||
* Cleanup
|
||||
***************/
|
||||
if (ORTE_SUCCESS != (ret = ckpt_finalize())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int parse_args(int argc, char *argv[]) {
|
||||
int i, ret, len, exit_status = ORTE_SUCCESS ;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
char *argv0 = NULL;
|
||||
|
||||
/* Init structure */
|
||||
memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t));
|
||||
orte_checkpoint_globals.help = false;
|
||||
orte_checkpoint_globals.pid = -1;
|
||||
orte_checkpoint_globals.verbose = false;
|
||||
orte_checkpoint_globals.verbose_level = 0;
|
||||
orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID;
|
||||
orte_checkpoint_globals.nowait = false;
|
||||
orte_checkpoint_globals.status = false;
|
||||
orte_checkpoint_globals.output = -1;
|
||||
orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
orte_checkpoint_globals.list_only = false;
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
orte_checkpoint_globals.enable_crdebug = false;
|
||||
#endif
|
||||
|
||||
orte_checkpoint_globals.options = OBJ_NEW(opal_crs_base_ckpt_options_t);
|
||||
orte_checkpoint_globals.term = false;
|
||||
orte_checkpoint_globals.stop = false;
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
orte_checkpoint_globals.attach_debugger = false;
|
||||
orte_checkpoint_globals.detach_debugger = false;
|
||||
#endif
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 0
|
||||
/* Warn and exit if not configured with Checkpoint/Restart */
|
||||
{
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-checkpoint.txt", "usage-no-cr",
|
||||
true, args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Parse the command line options */
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||
mca_base_open();
|
||||
mca_base_cmd_line_setup(&cmd_line);
|
||||
ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
|
||||
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
if (OPAL_ERR_SILENT != ret) {
|
||||
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
||||
opal_strerror(ret));
|
||||
}
|
||||
exit_status = 1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (orte_checkpoint_globals.help) {
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-checkpoint.txt", "usage", true,
|
||||
args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
/* If we show the help message, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Put all of the MCA arguments in the environment
|
||||
*/
|
||||
mca_base_cmd_line_process_args(argc, &app_env, &global_env);
|
||||
|
||||
len = opal_argv_count(app_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(app_env[i]);
|
||||
}
|
||||
|
||||
len = opal_argv_count(global_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
/* get the remaining bits */
|
||||
argv0 = strdup(argv[0]);
|
||||
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
||||
|
||||
if(orte_checkpoint_globals.list_only ) {
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp) {
|
||||
fprintf(stderr, "%s: Nothing to do\n", argv0);
|
||||
fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
|
||||
exit_status = 1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
orte_checkpoint_globals.options->term = orte_checkpoint_globals.term;
|
||||
orte_checkpoint_globals.options->stop = orte_checkpoint_globals.stop;
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
orte_checkpoint_globals.options->attach_debugger = orte_checkpoint_globals.attach_debugger;
|
||||
orte_checkpoint_globals.options->detach_debugger = orte_checkpoint_globals.detach_debugger;
|
||||
#endif
|
||||
|
||||
if(orte_checkpoint_globals.verbose_level < 0 ) {
|
||||
orte_checkpoint_globals.verbose_level = 0;
|
||||
}
|
||||
|
||||
if(orte_checkpoint_globals.verbose_level > 0) {
|
||||
orte_checkpoint_globals.verbose = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the user did not supply an hnp jobid, then they must
|
||||
* supply the PID of MPIRUN
|
||||
*/
|
||||
if(0 >= argc &&
|
||||
ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp) {
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
orte_checkpoint_globals.pid = atoi(argv[0]);
|
||||
if ( 0 >= orte_checkpoint_globals.pid ) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
|
||||
orte_checkpoint_globals.pid);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* JJH: No wait is currently not implemented or tested
|
||||
*/
|
||||
if(orte_checkpoint_globals.nowait) {
|
||||
orte_checkpoint_globals.nowait = false;
|
||||
opal_show_help("help-orte-checkpoint.txt", "not_impl",
|
||||
true,
|
||||
"Disconnected checkpoint");
|
||||
}
|
||||
|
||||
if(orte_checkpoint_globals.verbose) {
|
||||
orte_checkpoint_globals.status = true;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv0) {
|
||||
free(argv0);
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function attempts to find an HNP to connect to.
|
||||
*/
|
||||
static int find_hnp(void) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_list_t hnp_list;
|
||||
opal_list_item_t *item;
|
||||
orte_hnp_contact_t *hnpcandidate;
|
||||
|
||||
/* get the list of local hnp's available to us and setup
|
||||
* contact info for them into the RML
|
||||
*/
|
||||
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "no_hnps", true,
|
||||
orte_checkpoint_globals.pid,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
ret, ORTE_ERROR_NAME(ret));
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* search the list for the desired hnp */
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
hnpcandidate = (orte_hnp_contact_t*)item;
|
||||
if (hnpcandidate->name.jobid == orte_checkpoint_globals.req_hnp ||
|
||||
hnpcandidate->pid == orte_checkpoint_globals.pid) {
|
||||
/* this is the one we want */
|
||||
orterun_hnp = hnpcandidate;
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* If no match was found, error out */
|
||||
opal_show_help("help-orte-checkpoint.txt", "no_universe", true,
|
||||
orte_checkpoint_globals.pid,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir);
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
|
||||
if( NULL == orterun_hnp ) {
|
||||
return ORTE_ERROR;
|
||||
} else {
|
||||
return exit_status;
|
||||
}
|
||||
}
|
||||
|
||||
static int ckpt_init(int argc, char *argv[]) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
listener_started = false;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
* to ensure installdirs is setup properly
|
||||
* before calling mca_base_open();
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse Command Line Arguments
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Disable the checkpoint notification routine for this
|
||||
* tool. As we will never need to checkpoint this tool.
|
||||
* Note: This must happen before opal_init().
|
||||
*/
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
(void) mca_base_var_env_name("crs", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/* we are never allowed to operate as a distributed tool,
|
||||
* so insist on the ess/tool component */
|
||||
opal_setenv("OMPI_MCA_ess", "tool", true, &environ);
|
||||
|
||||
/***************************
|
||||
* We need all of OPAL and the TOOLS portion of ORTE - this
|
||||
* sets us up so we can talk to any HNP over the wire
|
||||
***************************/
|
||||
if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup ORTE Output handle from the verbose argument
|
||||
*/
|
||||
if( orte_checkpoint_globals.verbose ) {
|
||||
orte_checkpoint_globals.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level);
|
||||
} else {
|
||||
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
||||
}
|
||||
|
||||
/*
|
||||
* Start the listener
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = start_listener() ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int ckpt_finalize(void) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
|
||||
/*
|
||||
* Stop the listener
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = stop_listener() ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_finalize())) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int start_listener(void)
|
||||
{
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT,
|
||||
ORTE_RML_PERSISTENT, hnp_receiver, NULL);
|
||||
|
||||
listener_started = true;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int stop_listener(void)
|
||||
{
|
||||
if( !listener_started ) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT);
|
||||
|
||||
listener_started = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void hnp_receiver(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_snapc_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
opal_output_verbose(5, orte_checkpoint_globals.output,
|
||||
"orte_checkpoint: hnp_receiver: Receive a command message.");
|
||||
|
||||
/*
|
||||
* Otherwise this is an inter-coordinator command (usually updating state info).
|
||||
*/
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case ORTE_SNAPC_GLOBAL_UPDATE_CMD:
|
||||
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||
"orte_checkpoint: hnp_receiver: Status Update.");
|
||||
|
||||
process_ckpt_update_cmd(sender, buffer);
|
||||
break;
|
||||
|
||||
case ORTE_SNAPC_GLOBAL_INIT_CMD:
|
||||
case ORTE_SNAPC_GLOBAL_TERM_CMD:
|
||||
/* Do Nothing */
|
||||
break;
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
}
|
||||
}
|
||||
|
||||
static void process_ckpt_update_cmd(orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer)
|
||||
{
|
||||
int ret;
|
||||
orte_std_cntr_t count = 1;
|
||||
int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
/*
|
||||
* Receive the data:
|
||||
* - ckpt_state
|
||||
* - global snapshot handle (upon finish only)
|
||||
* - sequence number (upon finish only)
|
||||
*/
|
||||
count = 1;
|
||||
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
|
||||
return;
|
||||
}
|
||||
orte_checkpoint_globals.ckpt_status = ckpt_status;
|
||||
|
||||
if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) {
|
||||
count = 1;
|
||||
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING)) ) {
|
||||
return;
|
||||
}
|
||||
count = 1;
|
||||
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the job is not able to be checkpointed, then return
|
||||
*/
|
||||
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
|
||||
true,
|
||||
orte_checkpoint_globals.pid);
|
||||
is_checkpoint_finished = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
|
||||
orte_checkpoint_globals.pid, ORTE_ERROR);
|
||||
is_checkpoint_finished = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Status progression */
|
||||
if( orte_checkpoint_globals.status ) {
|
||||
pretty_print_status();
|
||||
}
|
||||
|
||||
if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
|
||||
is_checkpoint_finished = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Normal termination check */
|
||||
if( (ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) ||
|
||||
(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){
|
||||
is_checkpoint_finished = true;
|
||||
return;
|
||||
}
|
||||
else if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ) {
|
||||
is_checkpoint_recovered = true;
|
||||
}
|
||||
else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) {
|
||||
is_checkpoint_established = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Send a checkpoint request to the HNP selected by find_hnp().
 *
 * The message carries, in order: the ORTE_SNAPC_GLOBAL_INIT_CMD command flag,
 * the packed checkpoint options, and a jobid (ORTE_JOBID_INVALID).  It is
 * sent non-blocking on ORTE_RML_TAG_CKPT.  timer_start is set just before
 * the message is built.
 *
 * options: checkpoint options handed to orte_snapc_base_pack_options().
 *
 * Returns ORTE_SUCCESS once the send has been accepted; otherwise the failing
 * status, after showing the "unable_to_connect" help message.
 */
static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_INIT_CMD;
    orte_jobid_t jobid = ORTE_JOBID_INVALID;

    if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_checkpoint_globals.pid);

    timer_start = get_time();

    /***********************************
     * Notify HNP of checkpoint request
     * Send:
     * - Command
     * - options
     * - jobid
     ***********************************/
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if( ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options)) ) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer,
                                                       ORTE_RML_TAG_CKPT, orte_rml_send_callback,
                                                       NULL))) {
        exit_status = ret;
        goto cleanup;
    }

    /* The send was accepted, so the buffer is no longer ours to free
     * (presumably orte_rml_send_callback releases it on completion —
     * confirm against the RML contract). */
    buffer = NULL;

    opal_output_verbose(10, orte_checkpoint_globals.output,
                        "orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
                        ORTE_JOBID_PRINT(jobid));

 cleanup:
    /* Still owning the buffer here means a pack or the send failed:
     * release it rather than leak it. */
    if (NULL != buffer) {
        OBJ_RELEASE(buffer);
    }

    if( ORTE_SUCCESS != exit_status ) {
        opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
                       orte_checkpoint_globals.pid);
    }

    return exit_status;
}
|
||||
|
||||
/***************
|
||||
* Pretty Print
|
||||
***************/
|
||||
/*
 * Current time in seconds, as a double.
 *
 * Reads the OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is set,
 * and gettimeofday() otherwise.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;

    gettimeofday(&now, NULL);
    /* whole seconds plus the microsecond remainder scaled to seconds */
    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
#endif
}
|
||||
|
||||
static int pretty_print_status(void) {
|
||||
char * state_str = NULL;
|
||||
double cur_time;
|
||||
|
||||
cur_time = get_time();
|
||||
|
||||
if( timer_last == 0 ) {
|
||||
timer_last = cur_time;
|
||||
}
|
||||
|
||||
orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status);
|
||||
|
||||
if( NULL != global_snapshot_handle ) {
|
||||
opal_output(0,
|
||||
"[%6.2f / %6.2f] %*s - %s\n",
|
||||
(cur_time - timer_last), (cur_time - timer_start),
|
||||
25, state_str, global_snapshot_handle);
|
||||
} else {
|
||||
opal_output(0,
|
||||
"[%6.2f / %6.2f] %*s - ...\n",
|
||||
(cur_time - timer_last), (cur_time - timer_start),
|
||||
25, state_str);
|
||||
}
|
||||
|
||||
if( NULL != state_str) {
|
||||
free(state_str);
|
||||
}
|
||||
|
||||
timer_last = cur_time;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pretty_print_reference(void)
|
||||
{
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
if( orte_checkpoint_globals.enable_crdebug ) {
|
||||
printf("Checkpoint handle: -s %3d %s\n",
|
||||
global_sequence_num,
|
||||
global_snapshot_handle);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
printf("Snapshot Ref.: %3d %s\n",
|
||||
global_sequence_num,
|
||||
global_snapshot_handle);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int list_all_snapshots(void) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_list_t *all_snapshots = NULL;
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;
|
||||
|
||||
all_snapshots = OBJ_NEW(opal_list_t);
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL)) ) {
|
||||
opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
|
||||
orte_sstore_base_global_snapshot_dir);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each reference
|
||||
*/
|
||||
for(item = opal_list_get_first(all_snapshots);
|
||||
item != opal_list_get_end(all_snapshots);
|
||||
item = opal_list_get_next(item) ) {
|
||||
global_snapshot = (orte_sstore_base_global_snapshot_info_t*)item;
|
||||
|
||||
/*
|
||||
* Get a list of valid sequence numbers
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot,
|
||||
&(global_snapshot->num_seqs),
|
||||
&(global_snapshot->all_seqs)))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
int s;
|
||||
/* Pretty print the result - C/R Debug version */
|
||||
if( orte_checkpoint_globals.enable_crdebug ) {
|
||||
for(s = 0; s < global_snapshot->num_seqs; ++s) {
|
||||
printf("-s %s %s\n", global_snapshot->all_seqs[s], global_snapshot->reference);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* Pretty print the result */
|
||||
printf("Snapshot Ref.: %s\t[",
|
||||
global_snapshot->reference);
|
||||
if( 0 >= global_snapshot->num_seqs ) {
|
||||
printf("No Valid Checkpoints");
|
||||
} else {
|
||||
printf("%s",
|
||||
opal_argv_join(global_snapshot->all_seqs, ','));
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(all_snapshots))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_RELEASE(all_snapshots);
|
||||
|
||||
return exit_status;
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
include $(top_srcdir)/Makefile.ompi-rules
|
||||
|
||||
man_pages = orte-migrate.1
|
||||
EXTRA_DIST = orte-migrate.1in
|
||||
|
||||
if WANT_FT_CR
|
||||
if OPAL_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = orte-migrate
|
||||
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
||||
# changes; a "good enough" way to know if configure was run again (and
|
||||
# therefore the release date or version may have changed)
|
||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||
|
||||
dist_ortedata_DATA = help-orte-migrate.txt
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
orte_migrate_SOURCES = orte-migrate.c
|
||||
orte_migrate_LDADD = \
|
||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||
|
||||
endif # WANT_FT_CR
|
||||
|
||||
distclean-local:
|
||||
rm -f $(man_pages)
|
@ -1,81 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014 Research Organization for Information Science
|
||||
# and Technology (RIST). All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI migrate tool
|
||||
#
|
||||
[usage]
|
||||
ompi-migrate PID_OF_MPIRUN
|
||||
Open MPI Process Migration Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[usage-no-cr]
|
||||
This build of Open MPI does *not* include Checkpoint/Restart functionality.
|
||||
If you require this functionality re-configure Open MPI with the proper
|
||||
Checkpoint/Restart options.
|
||||
|
||||
ompi-migrate PID_OF_MPIRUN
|
||||
Open MPI Migrate Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[invalid_pid]
|
||||
Error: The PID (%d) is invalid because either you have not provided a PID
|
||||
or provided an invalid PID.
|
||||
Please see --help for usage.
|
||||
#
|
||||
[no_universe]
|
||||
Error: Unable to find the contact information for PID %d.
|
||||
This could be due to one of the following:
|
||||
- The PID is not that of an active MPIRUN.
|
||||
- The application with this PID isn't migratable
|
||||
- The application with this PID isn't an Open MPI application.
|
||||
ompi-migrate attempted to find the session directory:
|
||||
%s
|
||||
#
|
||||
[unable_to_connect]
|
||||
Error: Unable to connect to the Head Node Process to initiate the
|
||||
migration of the application.
|
||||
This could be due to one of the following:
|
||||
- The PID is not that of an active MPIRUN.
|
||||
- The application with this PID isn't migratable
|
||||
- The application with this PID isn't an Open MPI application.
|
||||
#
|
||||
[non-ckptable]
|
||||
Error: The job with pid %d is not checkpointable.
|
||||
This could be caused by one of the following:
|
||||
- The application is using unsupported components.
|
||||
- Your application did not select to be checkpointable
|
||||
To enable checkpointing in an application use the following AMCA parameter
|
||||
argument to mpirun:
|
||||
-am ft-enable-cr
|
||||
#
|
||||
[not_impl]
|
||||
The following feature was requested, but is not currently implemented.
|
||||
%s
|
||||
If you require this feature contact the Open MPI development group.
|
||||
#
|
||||
[err-inprogress]
|
||||
Error: The Job identified by PID (%d) is currently migrating other processes.
|
||||
Only one migration request can be processed at a time. Please try again
|
||||
later.
|
||||
#
|
||||
[err-other]
|
||||
Error: The Job identified by PID (%d) was not able to migrate processes in this
|
||||
job. This could be caused by any of the following:
|
||||
- Invalid node or rank specified
|
||||
- No processes on the indicated node can be migrated
|
||||
- Process migration was not enabled for this job. Make sure to indicate
|
||||
the proper AMCA file: "-am ft-enable-cr-recovery".
|
@ -1,81 +0,0 @@
|
||||
.\"
|
||||
.\" Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
.\" University Research and Technology
|
||||
.\" Corporation. All rights reserved.
|
||||
.\"
|
||||
.\" Man page for OMPI's ompi-migrate command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH OMPI-MIGRATE 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
ompi-migrate, orte-migrate \- Migrate processes among resources in Open MPI.
|
||||
.
|
||||
.PP
|
||||
.
|
||||
\fBNOTE:\fP \fIompi-migrate\fP and \fIorte-migrate\fP are exact
|
||||
synonyms for each other. Using any of the names will result in exactly
|
||||
identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.B ompi-migrate
|
||||
.R [ options ]
|
||||
.B <PID_OF_MPIRUN>
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH Options
|
||||
.
|
||||
\fIorte-migrate\fR will attempt to notify a running parallel job (identified
|
||||
by \fImpirun\fP) that a migration has been requested.
|
||||
.
|
||||
.TP 10
|
||||
.B <PID_OF_MPIRUN>
|
||||
Process ID of the \fImpirun\fP process.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h | --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v | --verbose
|
||||
Enable verbose output for debugging.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca | --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca | --mca <key> <value>
|
||||
Send arguments to various MCA modules.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
.PP
|
||||
\fIorte-migrate\fR can be invoked multiple, non-overlapping times.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH SEE ALSO
|
||||
orte-ps(1), orte-clean(1), ompi-restart(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
|
||||
.
|
@ -1,791 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* ORTE Process Migration Tool for migrating processes in a multiprocess job
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h> /* for mkfifo */
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/util/hnp_contact.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
/******************
|
||||
* Local Functions
|
||||
******************/
|
||||
static int tool_init(int argc, char *argv[]); /* Initalization routine */
|
||||
static int tool_finalize(void); /* Finalization routine */
|
||||
static int parse_args(int argc, char *argv[]);
|
||||
static int find_hnp(void);
|
||||
|
||||
static int start_listener(void);
|
||||
static int stop_listener(void);
|
||||
static void hnp_receiver(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
static void process_ckpt_update_cmd(orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer);
|
||||
|
||||
static int notify_hnp(void);
|
||||
static int pretty_print_status(void);
|
||||
static int pretty_print_migration(void);
|
||||
|
||||
static orte_hnp_contact_t *orterun_hnp = NULL;
|
||||
static int orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
|
||||
|
||||
/*****************************************
|
||||
* Global Vars for Command line Arguments
|
||||
*****************************************/
|
||||
static bool listener_started = false;
|
||||
|
||||
static double timer_start = 0;
|
||||
static double timer_last = 0;
|
||||
static double get_time(void);
|
||||
|
||||
/*
 * Settings of the migrate tool.  Filled with defaults by parse_args() and
 * then populated from the command line through cmd_line_opts.
 */
typedef struct {
    bool  help;           /* -h / --help: show usage */
    int   pid;            /* PID of the mpirun to contact */
    bool  verbose;        /* -v / --verbose */
    int   verbose_level;  /* -V <level> */
    bool  status;         /* print status progression */
    int   output;         /* output stream id used for verbose messages */
    char *off_nodes;      /* -x / --off: nodes to migrate off of (comma separated) */
    char *off_procs;      /* -r / --ranks: MPI_COMM_WORLD ranks to migrate (comma separated) */
    char *onto_nodes;     /* -t / --onto: nodes to migrate onto (comma separated) */
} orte_migrate_globals_t;

/* The single instance used throughout the tool */
orte_migrate_globals_t orte_migrate_globals;
|
||||
|
||||
opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
{ NULL,
|
||||
'h', NULL, "help",
|
||||
0,
|
||||
&orte_migrate_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
|
||||
{ NULL,
|
||||
'v', NULL, "verbose",
|
||||
0,
|
||||
&orte_migrate_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Verbose" },
|
||||
|
||||
{ NULL,
|
||||
'V', NULL, NULL,
|
||||
1,
|
||||
&orte_migrate_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Set the verbosity level (For additional debugging information)" },
|
||||
|
||||
{ "hnp-pid",
|
||||
'\0', NULL, "hnp-pid",
|
||||
1,
|
||||
&orte_migrate_globals.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||
"This should be the pid of the mpirun whose applications you wish "
|
||||
"to migrate." },
|
||||
|
||||
{ NULL,
|
||||
'x', NULL, "off",
|
||||
1,
|
||||
&orte_migrate_globals.off_nodes, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of nodes to migrate off of (comma separated)" },
|
||||
|
||||
{ NULL,
|
||||
'r', NULL, "ranks",
|
||||
1,
|
||||
&orte_migrate_globals.off_procs, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of MPI_COMM_WORLD ranks to migrate (comma separated)" },
|
||||
|
||||
{ NULL,
|
||||
't', NULL, "onto",
|
||||
1,
|
||||
&orte_migrate_globals.onto_nodes, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of nodes to migrate onto (comma separated)" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||
NULL }
|
||||
};
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
***************/
|
||||
if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/***************************
|
||||
* Find the HNP that we want to connect to, if it exists
|
||||
***************************/
|
||||
if( orte_migrate_globals.verbose ) {
|
||||
opal_output_verbose(10, orte_migrate_globals.output,
|
||||
"orte_migrate: Finding HNP...");
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = find_hnp())) {
|
||||
opal_show_help("help-orte-migrate.txt", "invalid_pid",
|
||||
true, orte_migrate_globals.pid);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*******************************
|
||||
* Send migration information to HNP
|
||||
*******************************/
|
||||
if( orte_migrate_globals.verbose ) {
|
||||
opal_output_verbose(10, orte_migrate_globals.output,
|
||||
"orte_migrate: Sending info to HNP...");
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = notify_hnp())) {
|
||||
opal_output(0,
|
||||
"HNP with PID %d Not found!",
|
||||
orte_migrate_globals.pid);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*******************************
|
||||
* Wait for migration to complete
|
||||
*******************************/
|
||||
while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status &&
|
||||
ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status &&
|
||||
ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) {
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
if( orte_migrate_globals.status ) {
|
||||
orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
|
||||
pretty_print_status();
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/***************
|
||||
* Cleanup
|
||||
***************/
|
||||
if (ORTE_SUCCESS != (ret = tool_finalize())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int parse_args(int argc, char *argv[]) {
|
||||
int i, ret, len, exit_status = ORTE_SUCCESS ;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
char *argv0 = NULL;
|
||||
|
||||
/* Init structure */
|
||||
memset(&orte_migrate_globals, 0, sizeof(orte_migrate_globals_t));
|
||||
orte_migrate_globals.help = false;
|
||||
orte_migrate_globals.pid = -1;
|
||||
orte_migrate_globals.verbose = false;
|
||||
orte_migrate_globals.verbose_level = 0;
|
||||
orte_migrate_globals.status = false;
|
||||
orte_migrate_globals.output = -1;
|
||||
orte_migrate_globals.off_nodes = NULL;
|
||||
orte_migrate_globals.off_procs = NULL;
|
||||
orte_migrate_globals.onto_nodes = NULL;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 0
|
||||
/* Warn and exit if not configured with Migrate/Restart */
|
||||
{
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-migrate.txt", "usage-no-cr",
|
||||
true, args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Parse the command line options */
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||
mca_base_open();
|
||||
mca_base_cmd_line_setup(&cmd_line);
|
||||
ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
|
||||
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
if (OPAL_ERR_SILENT != ret) {
|
||||
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
||||
opal_strerror(ret));
|
||||
}
|
||||
exit_status = 1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (orte_migrate_globals.help) {
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-migrate.txt", "usage", true,
|
||||
args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
/* If we show the help message, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Put all of the MCA arguments in the environment
|
||||
*/
|
||||
mca_base_cmd_line_process_args(argv, &app_env, &global_env);
|
||||
|
||||
len = opal_argv_count(app_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(app_env[i]);
|
||||
}
|
||||
|
||||
len = opal_argv_count(global_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
/* get the remaining bits */
|
||||
argv0 = strdup(argv[0]);
|
||||
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
||||
|
||||
if (NULL == orte_migrate_globals.off_nodes &&
|
||||
NULL == orte_migrate_globals.off_procs) {
|
||||
fprintf(stderr, "%s: Nothing to do\n", argv0);
|
||||
fprintf(stderr, "Type '%s --help' for usage.\n", argv0);
|
||||
exit_status = 1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(orte_migrate_globals.verbose_level < 0 ) {
|
||||
orte_migrate_globals.verbose_level = 0;
|
||||
}
|
||||
|
||||
if(orte_migrate_globals.verbose_level > 0) {
|
||||
orte_migrate_globals.verbose = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the user did not supply an hnp jobid, then they must
|
||||
* supply the PID of MPIRUN
|
||||
*/
|
||||
if(0 >= argc ) {
|
||||
fprintf(stderr, "%s: Nothing to do\n", argv[0]);
|
||||
fprintf(stderr, "Type '%s --help' for usage.\n", argv[0]);
|
||||
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
orte_migrate_globals.pid = atoi(argv[0]);
|
||||
if ( 0 >= orte_migrate_globals.pid ) {
|
||||
opal_show_help("help-orte-migrate.txt", "invalid_pid", true,
|
||||
orte_migrate_globals.pid);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(orte_migrate_globals.verbose) {
|
||||
orte_migrate_globals.status = true;
|
||||
}
|
||||
|
||||
if(orte_migrate_globals.verbose) {
|
||||
pretty_print_migration();
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != argv0) {
|
||||
free(argv0);
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function attempts to find an HNP to connect to.
|
||||
*/
|
||||
static int find_hnp(void) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_list_t hnp_list;
|
||||
opal_list_item_t *item;
|
||||
orte_hnp_contact_t *hnpcandidate;
|
||||
|
||||
/* get the list of local hnp's available to us and setup
|
||||
* contact info for them into the RML
|
||||
*/
|
||||
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* search the list for the desired hnp */
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
hnpcandidate = (orte_hnp_contact_t*)item;
|
||||
if( hnpcandidate->pid == orte_migrate_globals.pid) {
|
||||
/* this is the one we want */
|
||||
orterun_hnp = hnpcandidate;
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
|
||||
if( NULL == orterun_hnp ) {
|
||||
return ORTE_ERROR;
|
||||
} else {
|
||||
return exit_status;
|
||||
}
|
||||
}
|
||||
|
||||
static int tool_init(int argc, char *argv[]) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
listener_started = false;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
* to ensure installdirs is setup properly
|
||||
* before calling mca_base_open();
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse Command Line Arguments
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Disable the migrate notification routine for this
|
||||
* tool. As we will never need to migrate this tool.
|
||||
* Note: This must happen before opal_init().
|
||||
*/
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a migrateer */
|
||||
(void) mca_base_var_env_name("crs", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/* we are never allowed to operate as a distributed tool,
|
||||
* so insist on the ess/tool component */
|
||||
opal_setenv("OMPI_MCA_ess", "tool", true, &environ);
|
||||
|
||||
/***************************
|
||||
* We need all of OPAL and the TOOLS portion of ORTE - this
|
||||
* sets us up so we can talk to any HNP over the wire
|
||||
***************************/
|
||||
if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup ORTE Output handle from the verbose argument
|
||||
*/
|
||||
if( orte_migrate_globals.verbose ) {
|
||||
orte_migrate_globals.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(orte_migrate_globals.output, orte_migrate_globals.verbose_level);
|
||||
} else {
|
||||
orte_migrate_globals.output = 0; /* Default=STDERR */
|
||||
}
|
||||
|
||||
/*
|
||||
* Start the listener
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = start_listener() ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int tool_finalize(void) {
|
||||
int exit_status = ORTE_SUCCESS, ret;
|
||||
|
||||
/*
|
||||
* Stop the listener
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = stop_listener() ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_finalize())) {
|
||||
exit_status = ret;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int start_listener(void)
|
||||
{
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE,
|
||||
ORTE_RML_PERSISTENT, hnp_receiver, NULL);
|
||||
|
||||
listener_started = true;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int stop_listener(void)
|
||||
{
|
||||
if( !listener_started ) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
|
||||
|
||||
listener_started = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void hnp_receiver(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_errmgr_tool_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
opal_output_verbose(5, orte_migrate_globals.output,
|
||||
"orte_migrate: hnp_receiver: Receive a command message.");
|
||||
|
||||
/*
|
||||
* Otherwise this is an inter-coordinator command (usually updating state info).
|
||||
*/
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD:
|
||||
opal_output_verbose(10, orte_migrate_globals.output,
|
||||
"orte_migrate: hnp_receiver: Status Update.");
|
||||
|
||||
process_ckpt_update_cmd(sender, buffer);
|
||||
break;
|
||||
|
||||
case ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD:
|
||||
/* Do Nothing */
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
}
|
||||
}
|
||||
|
||||
static void process_ckpt_update_cmd(orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer)
|
||||
{
|
||||
int ret;
|
||||
orte_std_cntr_t count = 1;
|
||||
int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
|
||||
|
||||
/*
|
||||
* Receive the data:
|
||||
* - ckpt_state
|
||||
*/
|
||||
count = 1;
|
||||
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
|
||||
goto cleanup;
|
||||
}
|
||||
orte_migrate_ckpt_status = ckpt_status;
|
||||
|
||||
/*
|
||||
* If the job is not able to be migrateed, then return
|
||||
*/
|
||||
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) {
|
||||
opal_show_help("help-orte-migrate.txt", "non-ckptable",
|
||||
true,
|
||||
orte_migrate_globals.pid);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* If a migration is already in progress, then we must tell the user to
|
||||
* try again later.
|
||||
*/
|
||||
if( ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) {
|
||||
opal_show_help("help-orte-migrate.txt", "err-inprogress",
|
||||
true,
|
||||
orte_migrate_globals.pid);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there was an error, display a message and exit
|
||||
*/
|
||||
if( ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status ) {
|
||||
opal_show_help("help-orte-migrate.txt", "err-other",
|
||||
true,
|
||||
orte_migrate_globals.pid);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are to display the status progression
|
||||
*/
|
||||
if( orte_migrate_globals.status ) {
|
||||
if(ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status) {
|
||||
pretty_print_status();
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return;
|
||||
}
|
||||
|
||||
static int notify_hnp(void)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_buffer_t *buffer = NULL;
|
||||
orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD;
|
||||
|
||||
if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, orte_migrate_globals.output,
|
||||
"orte_migrate: notify_hnp: Contact Head Node Process PID %d\n",
|
||||
orte_migrate_globals.pid);
|
||||
|
||||
timer_start = get_time();
|
||||
|
||||
/***********************************
|
||||
* Notify HNP of migrate request
|
||||
* Send:
|
||||
* - Command
|
||||
* - Off Nodes
|
||||
* - Off Procs
|
||||
* - Onto Nodes
|
||||
***********************************/
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_procs), 1, OPAL_STRING)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_nodes), 1, OPAL_STRING)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.onto_nodes), 1, OPAL_STRING)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer,
|
||||
ORTE_RML_TAG_MIGRATE, orte_rml_send_callback,
|
||||
NULL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if( NULL != buffer) {
|
||||
OBJ_RELEASE(buffer);
|
||||
buffer = NULL;
|
||||
}
|
||||
|
||||
if( ORTE_SUCCESS != exit_status ) {
|
||||
opal_show_help("help-orte-migrate.txt", "unable_to_connect", true,
|
||||
orte_migrate_globals.pid);
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/***************
|
||||
* Pretty Print
|
||||
***************/
|
||||
/*
 * Current time in seconds, as a double.
 * Uses the OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is set,
 * and gettimeofday() otherwise.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);
    seconds = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
#endif
}
|
||||
|
||||
static int pretty_print_status(void) {
|
||||
char * state_str = NULL;
|
||||
double cur_time;
|
||||
|
||||
cur_time = get_time();
|
||||
|
||||
if( timer_last == 0 ) {
|
||||
timer_last = cur_time;
|
||||
}
|
||||
|
||||
orte_errmgr_base_migrate_state_str(&state_str, orte_migrate_ckpt_status);
|
||||
|
||||
opal_output(0,
|
||||
"[%6.2f / %6.2f] %*s - ...\n",
|
||||
(cur_time - timer_last), (cur_time - timer_start),
|
||||
25, state_str);
|
||||
|
||||
if( NULL != state_str) {
|
||||
free(state_str);
|
||||
}
|
||||
|
||||
timer_last = cur_time;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pretty_print_migration(void)
|
||||
{
|
||||
char **loc_off_nodes = NULL;
|
||||
char **loc_off_procs = NULL;
|
||||
char **loc_onto_nodes = NULL;
|
||||
int loc_off_nodes_cnt = 0;
|
||||
int loc_off_procs_cnt = 0;
|
||||
int loc_onto_cnt = 0;
|
||||
int i;
|
||||
|
||||
if( NULL != orte_migrate_globals.off_nodes ) {
|
||||
loc_off_nodes = opal_argv_split(orte_migrate_globals.off_nodes, ',');
|
||||
loc_off_nodes_cnt = opal_argv_count(loc_off_nodes);
|
||||
}
|
||||
|
||||
if( NULL != orte_migrate_globals.off_procs ) {
|
||||
loc_off_procs = opal_argv_split(orte_migrate_globals.off_procs, ',');
|
||||
loc_off_procs_cnt = opal_argv_count(loc_off_procs);
|
||||
}
|
||||
|
||||
if( NULL != orte_migrate_globals.onto_nodes ) {
|
||||
loc_onto_nodes = opal_argv_split(orte_migrate_globals.onto_nodes, ',');
|
||||
loc_onto_cnt = opal_argv_count(loc_onto_nodes);
|
||||
}
|
||||
|
||||
printf("Migrate Nodes: (%d nodes)\n", loc_off_nodes_cnt);
|
||||
for(i = 0; i < loc_off_nodes_cnt; ++i) {
|
||||
printf("\t\"%s\"\n", loc_off_nodes[i]);
|
||||
}
|
||||
|
||||
printf("Migrate Ranks: (%d ranks)\n", loc_off_procs_cnt);
|
||||
for(i = 0; i < loc_off_procs_cnt; ++i) {
|
||||
printf("\t\"%s\"\n", loc_off_procs[i]);
|
||||
}
|
||||
|
||||
printf("Migrate Onto : (%d nodes)\n", loc_onto_cnt);
|
||||
for(i = 0; i < loc_onto_cnt; ++i) {
|
||||
printf("\t\"%s\"\n", loc_onto_nodes[i]);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,51 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
include $(top_srcdir)/Makefile.ompi-rules
|
||||
|
||||
man_pages = orte-restart.1
|
||||
EXTRA_DIST = orte-restart.1in
|
||||
|
||||
if WANT_FT_CR
|
||||
if OPAL_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = orte-restart
|
||||
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
||||
# changes; a "good enough" way to know if configure was run again (and
|
||||
# therefore the release date or version may have changed)
|
||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||
|
||||
dist_ortedata_DATA = help-orte-restart.txt
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
orte_restart_SOURCES = orte-restart.c
|
||||
orte_restart_LDADD = \
|
||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||
|
||||
endif # WANT_FT_CR
|
||||
|
||||
distclean-local:
|
||||
rm -f $(man_pages)
|
@ -1,77 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for the Open MPI restart tool
|
||||
#
|
||||
[usage]
|
||||
ompi-restart GLOBAL_SNAPSHOT_REF
|
||||
Open MPI Parallel Job Restart Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[usage-no-cr]
|
||||
This build of Open MPI does *not* include Checkpoint/Restart functionality.
|
||||
If you require this functionality re-configure Open MPI with the proper
|
||||
Checkpoint/Restart options.
|
||||
|
||||
ompi-restart GLOBAL_SNAPSHOT_REF
|
||||
Open MPI Parallel Job Restart Tool
|
||||
|
||||
%s
|
||||
#
|
||||
[invalid_filename]
|
||||
Error: The filename provided (referenced below) could not be used for
|
||||
restarting the job. This could be for a variety of reasons:
|
||||
- The filename/handle is invalid,
|
||||
- The snapshot directory no longer exists, or
|
||||
- There are no stable checkpoint sequences in this global snapshot.
|
||||
Please see --help for usage.
|
||||
|
||||
Filename: %s
|
||||
#
|
||||
[restart_cmd_failure]
|
||||
Error: Unable to obtain the proper restart command to restart from the
|
||||
checkpoint file (%s). Returned %d.
|
||||
#
|
||||
[comp_select_failure]
|
||||
Error: Unable to select the %s component needed to restart this
|
||||
application. (Returned %d)
|
||||
This likely indicates that the checkpointer needed is not
|
||||
available on this machine. You should move to a machine that
|
||||
has this checkpointer enabled.
|
||||
#
|
||||
[restart_failure]
|
||||
Error: The restart command:
|
||||
shell$ %s
|
||||
returned an error code %d, and was unable to restart properly.
|
||||
#
|
||||
[invalid_seq_num]
|
||||
Error: The filename (%s) and sequence number (%d) could not be used.
|
||||
This may be caused by an invalid sequence number. Try using the
|
||||
'-i' option to determine a correct value.
|
||||
#
|
||||
[amca_param_not_found]
|
||||
Warning: Unable to find the AMCA parameter in the checkpoint metadata.
|
||||
This is the option supplied to mpirun as '-am '. Restart will
|
||||
assume this value to be '%s'.
|
||||
#
|
||||
[tune_param_not_found]
|
||||
Warning: Unable to find the TUNE parameter in the checkpoint metadata.
|
||||
This is the option supplied to mpirun as '-tune '. Restart will
|
||||
assume this value to be '%s'.
|
@ -1,115 +0,0 @@
|
||||
.\"
|
||||
.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
.\" University Research and Technology
|
||||
.\" Corporation. All rights reserved.
|
||||
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
.\"
|
||||
.\" Man page for OMPI's ompi-restart command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH OMPI-RESTART 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
ompi-restart, orte-restart \- Restart a previously checkpointed parallel job
|
||||
using the Open PAL Checkpoint/Restart Service (CRS)
|
||||
.
|
||||
.PP
|
||||
.
|
||||
\fBNOTE:\fP \fIompi-restart\fP and \fIorte-restart\fP are exact
|
||||
synonyms for each other. Using any of the names will result in exactly
|
||||
identical behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.B ompi-restart
|
||||
.B [ options ]
|
||||
.B <GLOBAL SNAPSHOT HANDLE>
|
||||
.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH Options
|
||||
.
|
||||
\fIompi-restart\fR will attempt to restart a previously checkpointed parallel
|
||||
job from the global snapshot handle reference returned by \fIompi-checkpoint\fP.
|
||||
.
|
||||
.TP 10
|
||||
.B <GLOBAL SNAPSHOT HANDLE>
|
||||
The global snapshot handle reference returned by \fIompi-checkpoint\fP, used to
|
||||
restart the job. This is required to be the last argument to this command.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -h | --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -p | --preload
|
||||
Preload the checkpoint files on the remote systems before restarting the
|
||||
application. Disabled by default.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --fork
|
||||
Fork off a new process, which is the restarted process. By default, the
|
||||
restarted process will replace \fIompi-restart\fR.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -s | --seq
|
||||
The sequence number of the checkpoint to restart from. By default, the most
|
||||
recent sequence number is used (specified by -1).
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -hostfile | --hostfile
|
||||
The hostfile from which to restart the application. Useful in unscheduled
|
||||
environments. (Same behavior as --machinefile option)
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -machinefile | --machinefile
|
||||
The machinefile from which to restart the application. Useful in unscheduled
|
||||
environments. (Same behavior as --hostfile option)
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -v | --verbose
|
||||
Enable verbose output for debugging.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca | --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca | --mca <key> <value>
|
||||
Send arguments to various MCA modules.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
.PP
|
||||
\fIompi-restart\fR can be invoked multiple, non-overlapping times. This
|
||||
allows the user to restart a previously running parallel job.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" See Also Section
|
||||
.\" **************************
|
||||
.
|
||||
.SH SEE ALSO
|
||||
orte-ps(1), orte-clean(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7)
|
||||
.
|
@ -1,897 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* ORTE Restart Tool for restarting a previously checkpointed multiprocess job
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/sstore.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
/******************
|
||||
* Local Functions
|
||||
******************/
|
||||
static int initialize(int argc, char *argv[]);
|
||||
static int finalize(void);
|
||||
static int parse_args(int argc, char *argv[]);
|
||||
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot);
|
||||
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid);
|
||||
static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot);
|
||||
static int snapshot_sort_compare_fn(opal_list_item_t **a,
|
||||
opal_list_item_t **b);
|
||||
|
||||
/*****************************************
|
||||
* Global Vars for Command line Arguments
|
||||
*****************************************/
|
||||
/*
 * Settings collected from the command line.
 * Field order matters: parse_args() resets this structure with a
 * positional initializer.
 */
typedef struct {
    bool  help;          /* -h | --help: print usage */
    char *snapshot_ref;  /* global snapshot reference to restart from */
    char *appfile;       /* name of the generated app file */
    bool  verbose;       /* -v | --verbose */
    bool  forked;        /* --fork: restart in a child process */
    int   seq_number;    /* -s | --seq: checkpoint sequence number (-1: most recent) */
    char *hostfile;      /* --hostfile / --machinefile */
    int   output;        /* opal_output handle used for verbose messages */
    bool  info_only;     /* -i | --info: only describe the checkpoint */
    bool  app_only;      /* -a | --apponly: only create the app file */
    bool  showme;        /* --showme: display the command line instead of running it */
    char *mpirun_opts;   /* --mpirun_opts: options passed directly to mpirun */
#if OPAL_ENABLE_CRDEBUG == 1
    bool  enable_crdebug; /* --crdebug: C/R Enhanced Debugging */
#endif
} orte_restart_globals_t;
|
||||
|
||||
orte_restart_globals_t orte_restart_globals;
|
||||
|
||||
opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
{ NULL,
|
||||
'h', NULL, "help",
|
||||
0,
|
||||
&orte_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
|
||||
{ NULL,
|
||||
'v', NULL, "verbose",
|
||||
0,
|
||||
&orte_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be Verbose" },
|
||||
|
||||
{ NULL,
|
||||
'\0', NULL, "fork",
|
||||
0,
|
||||
&orte_restart_globals.forked, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Fork off a new process which is the restarted process instead of "
|
||||
"replacing orte_restart" },
|
||||
|
||||
{ NULL,
|
||||
's', NULL, "seq",
|
||||
1,
|
||||
&orte_restart_globals.seq_number, OPAL_CMD_LINE_TYPE_INT,
|
||||
"The sequence number of the checkpoint to start from. "
|
||||
"(Default: -1, or most recent)" },
|
||||
|
||||
{ NULL,
|
||||
'\0', "hostfile", "hostfile",
|
||||
1,
|
||||
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile to use for launch" },
|
||||
|
||||
{ NULL,
|
||||
'\0', "machinefile", "machinefile",
|
||||
1,
|
||||
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile to use for launch" },
|
||||
|
||||
{ NULL,
|
||||
'i', NULL, "info",
|
||||
0,
|
||||
&orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display information about the checkpoint" },
|
||||
|
||||
{ NULL,
|
||||
'a', NULL, "apponly",
|
||||
0,
|
||||
&orte_restart_globals.app_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Only create the app context file, do not restart from it" },
|
||||
|
||||
{ NULL,
|
||||
'\0', NULL, "showme",
|
||||
0,
|
||||
&orte_restart_globals.showme, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the full command line that would have been exec'ed." },
|
||||
|
||||
{ NULL,
|
||||
'\0', "mpirun_opts", "mpirun_opts",
|
||||
1,
|
||||
&orte_restart_globals.mpirun_opts, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Command line options to pass directly to mpirun (be sure to quote long strings, and escape internal quotes)" },
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
{ NULL,
|
||||
'\0', "crdebug", "crdebug",
|
||||
0,
|
||||
&orte_restart_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable C/R Enhanced Debugging" },
|
||||
#endif
|
||||
|
||||
/* End of list */
|
||||
{ NULL,
|
||||
'\0', NULL, NULL,
|
||||
0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||
NULL }
|
||||
};
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
pid_t child_pid = 0;
|
||||
orte_sstore_base_global_snapshot_info_t *snapshot = NULL;
|
||||
char *basedir = NULL;
|
||||
char *tmp_str = NULL;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
***************/
|
||||
if (ORTE_SUCCESS != (ret = initialize(argc, argv))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);
|
||||
|
||||
if( opal_path_is_absolute(orte_restart_globals.snapshot_ref) ) {
|
||||
basedir = opal_dirname(orte_restart_globals.snapshot_ref);
|
||||
tmp_str = opal_basename(orte_restart_globals.snapshot_ref);
|
||||
free(orte_restart_globals.snapshot_ref);
|
||||
orte_restart_globals.snapshot_ref = strdup(tmp_str);
|
||||
free(tmp_str);
|
||||
tmp_str = NULL;
|
||||
} else if( NULL != strchr(orte_restart_globals.snapshot_ref, '/') ) {
|
||||
basedir = opal_dirname(orte_restart_globals.snapshot_ref);
|
||||
tmp_str = opal_basename(orte_restart_globals.snapshot_ref);
|
||||
free(orte_restart_globals.snapshot_ref);
|
||||
orte_restart_globals.snapshot_ref = strdup(tmp_str);
|
||||
free(tmp_str);
|
||||
tmp_str = NULL;
|
||||
} else {
|
||||
basedir = NULL; /* Use MCA parameter */
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: If the seq # passed is -1, then the largest seq # is selected,
|
||||
* ow the seq # requested is selected if available
|
||||
* 'basedir': Snapshot Base location to look in. If NULL then MCA parameter is used
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&(snapshot->ss_handle),
|
||||
basedir,
|
||||
orte_restart_globals.snapshot_ref,
|
||||
orte_restart_globals.seq_number,
|
||||
snapshot))) {
|
||||
opal_show_help("help-orte-restart.txt", "invalid_filename", true,
|
||||
orte_restart_globals.snapshot_ref);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
orte_restart_globals.seq_number = snapshot->seq_num;
|
||||
|
||||
if(orte_restart_globals.info_only ) {
|
||||
if (ORTE_SUCCESS != (ret = snapshot_info(snapshot))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* Create the app file to use with mpirun/orterun
|
||||
******************************/
|
||||
if( ORTE_SUCCESS != (ret = create_appfile(snapshot) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( orte_restart_globals.app_only ) {
|
||||
printf("Created Appfile:\n\t%s\n", orte_restart_globals.appfile);
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* Restart in this process [mpirun/orterun]
|
||||
******************************/
|
||||
if( orte_restart_globals.verbose ) {
|
||||
opal_output_verbose(10, orte_restart_globals.output,
|
||||
"Restarting from file (%s)",
|
||||
orte_restart_globals.snapshot_ref);
|
||||
|
||||
if( orte_restart_globals.forked ) {
|
||||
opal_output_verbose(10, orte_restart_globals.output,
|
||||
"\t Forking off a child");
|
||||
} else {
|
||||
opal_output_verbose(10, orte_restart_globals.output,
|
||||
"\t Exec in self");
|
||||
}
|
||||
}
|
||||
|
||||
if( ORTE_SUCCESS != (ret = spawn_children(snapshot, &child_pid)) ) {
|
||||
opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true,
|
||||
orte_restart_globals.snapshot_ref, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/***************
|
||||
* Cleanup
|
||||
***************/
|
||||
cleanup:
|
||||
if( NULL != basedir ) {
|
||||
free(basedir);
|
||||
basedir = NULL;
|
||||
}
|
||||
if( NULL != tmp_str ) {
|
||||
free(tmp_str);
|
||||
tmp_str = NULL;
|
||||
}
|
||||
if( NULL != snapshot ) {
|
||||
OBJ_RELEASE(snapshot);
|
||||
snapshot = NULL;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (ret = finalize())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int initialize(int argc, char *argv[]) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char * tmp_env_var = NULL;
|
||||
|
||||
/*
|
||||
* Make sure to init util before parse_args
|
||||
* to ensure installdirs is setup properly
|
||||
* before calling mca_base_open();
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse command line arguments
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup OPAL Output handle from the verbose argument
|
||||
*/
|
||||
if( orte_restart_globals.verbose ) {
|
||||
orte_restart_globals.output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(orte_restart_globals.output, 10);
|
||||
} else {
|
||||
orte_restart_globals.output = 0; /* Default=STDERR */
|
||||
}
|
||||
|
||||
/* Disable the checkpoint notification routine for this
|
||||
* tool. As we will never need to checkpoint this tool.
|
||||
* Note: This must happen before opal_init().
|
||||
*/
|
||||
opal_cr_set_enabled(false);
|
||||
|
||||
/* Select the none component, since we don't actually use a checkpointer */
|
||||
(void) mca_base_var_env_name("crs", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"none",
|
||||
true, &environ);
|
||||
/* Don't free the environment variable name. It is used again below */
|
||||
|
||||
/*
|
||||
/* we are never allowed to operate as a distributed tool,
|
||||
* so insist on the ess/tool component */
|
||||
opal_setenv("OMPI_MCA_ess", "tool", true, &environ);
|
||||
|
||||
/* Setup any ORTE stuff we might need */
|
||||
if (OPAL_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Unset these now that we no longer need them */
|
||||
opal_unsetenv(tmp_env_var, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
||||
opal_unsetenv(tmp_env_var, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (OPAL_SUCCESS != (ret = orte_finalize())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int parse_args(int argc, char *argv[])
|
||||
{
|
||||
int i, ret, len;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char **app_env = NULL, **global_env = NULL;
|
||||
char * tmp_env_var = NULL;
|
||||
char *argv0 = NULL;
|
||||
orte_restart_globals_t tmp = { false, /* help */
|
||||
NULL, /* filename */
|
||||
NULL, /* appfile */
|
||||
false, /* verbose */
|
||||
false, /* forked */
|
||||
-1, /* seq_number */
|
||||
NULL, /* hostfile */
|
||||
-1, /* output*/
|
||||
false, /* info only */
|
||||
false, /* app only */
|
||||
false, /* showme */
|
||||
NULL}; /* mpirun_opts */
|
||||
|
||||
orte_restart_globals = tmp;
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
orte_restart_globals.enable_crdebug = false;
|
||||
#endif
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 0
|
||||
/* Warn and exit if not configured with Checkpoint/Restart */
|
||||
{
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-restart.txt", "usage-no-cr",
|
||||
true, args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Parse the command line options */
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||
|
||||
mca_base_open();
|
||||
mca_base_cmd_line_setup(&cmd_line);
|
||||
ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv);
|
||||
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
if (OPAL_ERR_SILENT != ret) {
|
||||
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
||||
opal_strerror(ret));
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (orte_restart_globals.help) {
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orte-restart.txt", "usage", true,
|
||||
args);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
/* If we show the help message, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Put all of the MCA arguments in the environment
|
||||
*/
|
||||
mca_base_cmd_line_process_args(argv, &app_env, &global_env);
|
||||
|
||||
len = opal_argv_count(app_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(app_env[i]);
|
||||
}
|
||||
|
||||
len = opal_argv_count(global_env);
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
(void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
|
||||
opal_setenv(tmp_env_var,
|
||||
"1",
|
||||
true, &environ);
|
||||
free(tmp_env_var);
|
||||
tmp_env_var = NULL;
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
|
||||
/* get the remaining bits */
|
||||
argv0 = strdup(argv[0]);
|
||||
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
|
||||
if (0 == argc) {
|
||||
fprintf(stderr, "%s: Nothing to do\n", argv0);
|
||||
fprintf(stderr, "Type '%s --help' for usge.\n", argv0);
|
||||
free(argv0);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
free(argv0);
|
||||
|
||||
orte_restart_globals.snapshot_ref = strdup(argv[0]);
|
||||
if ( NULL == orte_restart_globals.snapshot_ref ||
|
||||
0 >= strlen(orte_restart_globals.snapshot_ref) ) {
|
||||
opal_show_help("help-orte-restart.txt", "invalid_filename", true,
|
||||
"<none provided>");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* If we have arguments after the command, then assume they
|
||||
* need to be grouped together.
|
||||
*/
|
||||
if(argc > 1) {
|
||||
orte_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Write the mpirun appfile used to restart every process of a snapshot.
 *
 * The file is created as "<global snapshot location>/restart-appfile" and its
 * path is stored in orte_restart_globals.appfile. One "-np 1 ..." line is
 * emitted per entry of snapshot->local_snapshots, in ascending vpid order.
 *
 * snapshot: global snapshot whose ss_handle is queried for metadata and
 *           whose local_snapshots list is sorted in place and iterated.
 *
 * Returns ORTE_SUCCESS on success, ORTE_ERROR if required metadata is
 * missing, the path cannot be built, or the file cannot be opened.
 */
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int exit_status = ORTE_SUCCESS;
    FILE *appfile = NULL;
    opal_list_item_t *item = NULL;
    char *tmp_str = NULL;
    char *amca_param = NULL;
    char *tune_param = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *ref_location_fmt_str = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Create the appfile
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS,
                         &tmp_str);
    if (NULL == tmp_str) {
        /* Without the snapshot location there is nowhere to put the file */
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    if (0 > asprintf(&orte_restart_globals.appfile, "%s/%s",
                     tmp_str,
                     "restart-appfile")) {
        /* The output pointer is undefined when asprintf() fails */
        orte_restart_globals.appfile = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    free(tmp_str);
    tmp_str = NULL;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_TUNE_PARAM,
                         &tune_param);

    if (NULL == (appfile = fopen(orte_restart_globals.appfile, "w"))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* This will give a format string that we can use */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    /* Fetched and released below; not used while building the file */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);

    /* Both values are dereferenced by fprintf() for every process */
    if (NULL == reference_fmt_str || NULL == location_str) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Sort the snapshots so that they are in order
     */
    opal_list_sort(&snapshot->local_snapshots, snapshot_sort_compare_fn);

    /*
     * Construct the appfile
     */
    for (item = opal_list_get_first(&snapshot->local_snapshots);
         item != opal_list_get_end(&snapshot->local_snapshots);
         item = opal_list_get_next(item)) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

        fprintf(appfile, "#\n");
        fprintf(appfile, "# Old Process Name: %u.%u\n",
                vpid_snapshot->process_name.jobid,
                vpid_snapshot->process_name.vpid);
        fprintf(appfile, "#\n");
        fprintf(appfile, "-np 1 ");

        fprintf(appfile, "--sstore-load ");
        /* loc:ref:postfix:seq */
        fprintf(appfile, "%s:%s:",
                location_str,
                orte_restart_globals.snapshot_ref);
        /* NOTE(review): the format string comes from the sstore metadata and
         * is presumably expected to consume exactly one vpid argument */
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);
        fprintf(appfile, ":%s:%s:%d ",
                (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                orte_restart_globals.seq_number);

        /* Fall back to a default (and tell the user) the first time through */
        if (NULL == amca_param) {
            amca_param = strdup("ft-enable-cr");
            opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                           amca_param);
        }
        fprintf(appfile, "-am %s ", amca_param);

        if (NULL == tune_param) {
            tune_param = strdup("ft-enable-cr");
            opal_show_help("help-orte-restart.txt", "tune_param_not_found", true,
                           tune_param);
        }
        fprintf(appfile, "-tune %s ", tune_param);

        fprintf(appfile, " opal-restart ");

        /*
         * By default, point to the central storage location of the checkpoint.
         * The active SStore module at restart time will determine if files
         * need to be preloaded.
         */
        fprintf(appfile, "-l %s", location_str);
        fprintf(appfile, " -m %s ", orte_sstore_base_local_metadata_filename);

        fprintf(appfile, "-r ");
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);

        fprintf(appfile, "\n");
    }

 cleanup:
    if (NULL != appfile) {
        fclose(appfile);
        appfile = NULL;
    }
    /* free(NULL) is a no-op, so no guards are needed */
    free(tmp_str);
    free(amca_param);
    free(tune_param);
    free(location_str);
    free(reference_fmt_str);
    free(ref_location_fmt_str);

    return exit_status;
}
|
||||
|
||||
/*
 * Append one argument to the local argv vector of spawn_children(); on
 * failure record the error code in exit_status and jump to cleanup.
 */
#define SPAWN_CHILDREN_APPEND(arg)                                          \
    do {                                                                    \
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, (arg)))) { \
            exit_status = ret;                                              \
            goto cleanup;                                                   \
        }                                                                   \
    } while (0)

/*
 * Build the "mpirun ... --app <appfile>" command line and run it.
 *
 * Depending on orte_restart_globals:
 *   - showme: only print the command line and return ORTE_SUCCESS;
 *   - forked: fork() and execvp() the command in the child, storing the
 *             child's pid in *child_pid;
 *   - otherwise: call orte_finalize() and execvp() the command in place of
 *             this process.
 *
 * snapshot:  global snapshot whose ss_handle supplies the AMCA and tune
 *            parameters ("ft-enable-cr" is substituted when one is missing).
 * child_pid: receives the result of fork() in the forked case.
 *
 * Returns ORTE_SUCCESS, the error from opal_argv_append(), the negative
 * result of fork(), or the result of a failed execvp().
 */
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *amca_param = NULL;
    char *tune_param = NULL;
    char **argv = NULL;
    int argc = 0, i;
    int status;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_TUNE_PARAM,
                         &tune_param);

    SPAWN_CHILDREN_APPEND("mpirun");

    SPAWN_CHILDREN_APPEND("-am");
    if (NULL == amca_param) {
        amca_param = strdup("ft-enable-cr");
        opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                       amca_param);
    }
    SPAWN_CHILDREN_APPEND(amca_param);

    SPAWN_CHILDREN_APPEND("-tune");
    if (NULL == tune_param) {
        tune_param = strdup("ft-enable-cr");
        opal_show_help("help-orte-restart.txt", "tune_param_not_found", true,
                       tune_param);
    }
    SPAWN_CHILDREN_APPEND(tune_param);

    if (NULL != orte_restart_globals.hostfile) {
        SPAWN_CHILDREN_APPEND("--default-hostfile");
        SPAWN_CHILDREN_APPEND(orte_restart_globals.hostfile);
    }
    if (orte_restart_globals.mpirun_opts) {
        /* NOTE(review): passed through as a single argv entry */
        SPAWN_CHILDREN_APPEND(orte_restart_globals.mpirun_opts);
    }
#if OPAL_ENABLE_CRDEBUG == 1
    if (orte_restart_globals.enable_crdebug) {
        SPAWN_CHILDREN_APPEND("--crdebug");
    }
#endif
    SPAWN_CHILDREN_APPEND("--app");
    SPAWN_CHILDREN_APPEND(orte_restart_globals.appfile);

    if (orte_restart_globals.showme) {
        for (i = 0; i < argc; ++i) {
            printf("%s ", argv[i]);
        }
        printf("\n");
        /* Leave through cleanup so the argument vector is released */
        goto cleanup;
    }

    /* To fork off a child */
    if (orte_restart_globals.forked) {
        *child_pid = fork();

        if (0 == *child_pid) {
            /* Child Process: execvp() only returns on failure */
            status = execvp(argv[0], argv);
            if (0 > status) {
                opal_output(orte_restart_globals.output,
                            "orte_restart: execv failed with status = %d\n",
                            status);
            }
            /* NOTE(review): on failure the child returns into the caller's
             * code path rather than exiting — confirm this is intended */
            exit_status = status;
            goto cleanup;
        }
        else if (0 < *child_pid) {
            /* Parent is done once it is started */
            ;
        }
        else {
            opal_output(orte_restart_globals.output,
                        "orte_restart: fork failed: This should never happen!");
            /* Fork failed :( */
            exit_status = *child_pid;
            goto cleanup;
        }
    }
    /* ... or not to fork off a child */
    else {
        /* Make sure to finalize so we don't leave our session directory */
        orte_finalize();

        /* Only returns if the exec failed */
        status = execvp(argv[0], argv);
        exit_status = status;
        goto cleanup;
    }

    opal_output_verbose(10, orte_restart_globals.output,
                        "orte_restart: Restarted Child with PID = %d\n", *child_pid);

 cleanup:
    /* opal_argv_append() stores its own copies, so these are ours to free */
    free(amca_param);
    free(tune_param);
    if (NULL != argv) {
        opal_argv_free(argv);
    }

    return exit_status;
}

#undef SPAWN_CHILDREN_APPEND
|
||||
|
||||
/*
 * Print information about the sequences stored in a global snapshot.
 *
 * Reads the number of sequences and the comma separated list of sequence
 * numbers from the snapshot's metadata. For each sequence (or only the one
 * matching orte_restart_globals.seq_number when that is >= 0) the global
 * metadata is extracted and the sequence number, timestamps and process
 * count are written to orte_restart_globals.output; per-process details are
 * written at verbosity level 10.
 *
 * snapshot: global snapshot to describe; snapshot->seq_num is overwritten
 *           with each sequence number visited.
 *
 * Returns ORTE_SUCCESS, or the error returned by
 * orte_sstore_base_extract_global_metadata().
 */
int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    int num_seqs = 0, avail_seqs, processes, i;
    char **snapshot_ref_seqs = NULL;
    opal_list_item_t *item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *tmp_str = NULL;

    /*
     * Find all sequence numbers
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        num_seqs = atoi(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    }

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        snapshot_ref_seqs = opal_argv_split(tmp_str, ',');
        free(tmp_str);
        tmp_str = NULL;
    }

    if (0 > orte_restart_globals.seq_number) {
        opal_output(orte_restart_globals.output,
                    "Sequences: %d\n",
                    num_seqs);
    }

    /* Never walk past the sequence numbers that were actually listed */
    avail_seqs = opal_argv_count(snapshot_ref_seqs);
    if (num_seqs > avail_seqs) {
        num_seqs = avail_seqs;
    }

    for (i = 0; i < num_seqs; ++i) {
        snapshot->seq_num = atoi(snapshot_ref_seqs[i]);

        /* When a specific sequence was requested, skip all the others */
        if (0 <= orte_restart_globals.seq_number &&
            snapshot->seq_num != orte_restart_globals.seq_number) {
            continue;
        }

        if (ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata(snapshot))) {
            exit_status = ret;
            goto cleanup;
        }

        opal_output(orte_restart_globals.output,
                    "Seq: %d\n",
                    snapshot->seq_num);

        if (NULL != snapshot->start_time) {
            opal_output(orte_restart_globals.output,
                        "\tBegin Timestamp: %s\n",
                        snapshot->start_time);
        }
        if (NULL != snapshot->end_time) {
            opal_output(orte_restart_globals.output,
                        "\tEnd Timestamp  : %s\n",
                        snapshot->end_time);
        }

        processes = (int) opal_list_get_size(&snapshot->local_snapshots);
        opal_output(orte_restart_globals.output,
                    "\tProcesses: %d\n",
                    processes);

        for (item = opal_list_get_first(&snapshot->local_snapshots);
             item != opal_list_get_end(&snapshot->local_snapshots);
             item = opal_list_get_next(item)) {
            vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

            /* Unset component names are printed as empty strings */
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t\tProcess: %u.%u \t CRS: %s \t Compress: %s (%s)",
                                vpid_snapshot->process_name.jobid,
                                vpid_snapshot->process_name.vpid,
                                (NULL == vpid_snapshot->crs_comp ? "" : vpid_snapshot->crs_comp),
                                (NULL == vpid_snapshot->compress_comp ? "" : vpid_snapshot->compress_comp),
                                (NULL == vpid_snapshot->compress_postfix ? "" : vpid_snapshot->compress_postfix));
        }
    }

 cleanup:
    if (NULL != snapshot_ref_seqs) {
        opal_argv_free(snapshot_ref_seqs);
        snapshot_ref_seqs = NULL;
    }

    return exit_status;
}
|
||||
|
||||
/*
 * Ordering callback for opal_list_sort(): orders local snapshot entries by
 * ascending process vpid.
 *
 * Returns -1, 0 or 1 when the vpid of *a is respectively less than, equal
 * to, or greater than the vpid of *b.
 */
static int snapshot_sort_compare_fn(opal_list_item_t **a,
                                    opal_list_item_t **b)
{
    orte_sstore_base_local_snapshot_info_t *lhs =
        (orte_sstore_base_local_snapshot_info_t*)(*a);
    orte_sstore_base_local_snapshot_info_t *rhs =
        (orte_sstore_base_local_snapshot_info_t*)(*b);

    if (lhs->process_name.vpid < rhs->process_name.vpid) {
        return -1;
    }
    if (lhs->process_name.vpid > rhs->process_name.vpid) {
        return 1;
    }
    return 0;
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user