From ed508010b4675fcfb63e7717860cf71c97691002 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 18 Sep 2017 07:30:47 -0700 Subject: [PATCH] Remove stale tools Signed-off-by: Ralph Castain --- config/orte_config_files.m4 | 3 - ompi/mca/rte/orte/Makefile.am | 18 - orte/tools/Makefile.am | 6 - orte/tools/orte-checkpoint/Makefile.am | 51 - .../orte-checkpoint/help-orte-checkpoint.txt | 113 -- .../tools/orte-checkpoint/orte-checkpoint.1in | 103 -- orte/tools/orte-checkpoint/orte-checkpoint.c | 985 ------------------ orte/tools/orte-migrate/Makefile.am | 44 - orte/tools/orte-migrate/help-orte-migrate.txt | 81 -- orte/tools/orte-migrate/orte-migrate.1in | 81 -- orte/tools/orte-migrate/orte-migrate.c | 791 -------------- orte/tools/orte-restart/Makefile.am | 51 - orte/tools/orte-restart/help-orte-restart.txt | 77 -- orte/tools/orte-restart/orte-restart.1in | 115 -- orte/tools/orte-restart/orte-restart.c | 897 ---------------- 15 files changed, 3416 deletions(-) delete mode 100644 orte/tools/orte-checkpoint/Makefile.am delete mode 100644 orte/tools/orte-checkpoint/help-orte-checkpoint.txt delete mode 100644 orte/tools/orte-checkpoint/orte-checkpoint.1in delete mode 100644 orte/tools/orte-checkpoint/orte-checkpoint.c delete mode 100644 orte/tools/orte-migrate/Makefile.am delete mode 100644 orte/tools/orte-migrate/help-orte-migrate.txt delete mode 100644 orte/tools/orte-migrate/orte-migrate.1in delete mode 100644 orte/tools/orte-migrate/orte-migrate.c delete mode 100644 orte/tools/orte-restart/Makefile.am delete mode 100644 orte/tools/orte-restart/help-orte-restart.txt delete mode 100644 orte/tools/orte-restart/orte-restart.1in delete mode 100644 orte/tools/orte-restart/orte-restart.c diff --git a/config/orte_config_files.m4 b/config/orte_config_files.m4 index d31bb8b947..90f69808c9 100644 --- a/config/orte_config_files.m4 +++ b/config/orte_config_files.m4 @@ -25,12 +25,9 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/wrappers/Makefile orte/tools/wrappers/ortecc-wrapper-data.txt orte/tools/wrappers/orte.pc - orte/tools/orte-checkpoint/Makefile - orte/tools/orte-restart/Makefile orte/tools/orte-ps/Makefile orte/tools/orte-clean/Makefile orte/tools/orte-top/Makefile - orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile orte/tools/orte-server/Makefile orte/tools/orte-dvm/Makefile diff --git a/ompi/mca/rte/orte/Makefile.am b/ompi/mca/rte/orte/Makefile.am index 80e07c30df..451436373b 100644 --- a/ompi/mca/rte/orte/Makefile.am +++ b/ompi/mca/rte/orte/Makefile.am @@ -84,24 +84,6 @@ $(top_builddir)/orte/tools/orte-clean/orte-clean.1: ompi-clean.1: $(top_builddir)/orte/tools/orte-clean/orte-clean.1 cp -f $(top_builddir)/orte/tools/orte-clean/orte-clean.1 ompi-clean.1 -$(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1: - (cd $(top_builddir)/orte/tools/orte-checkpoint && $(MAKE) $(AM_MAKEFLAGS) orte-checkpoint.1) - -ompi-checkpoint.1: $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 - cp -f $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 ompi-checkpoint.1 - -$(top_builddir)/orte/tools/orte-restart/orte-restart.1: - (cd $(top_builddir)/orte/tools/orte-restart && $(MAKE) $(AM_MAKEFLAGS) orte-restart.1) - -ompi-restart.1: $(top_builddir)/orte/tools/orte-restart/orte-restart.1 - cp -f $(top_builddir)/orte/tools/orte-restart/orte-restart.1 ompi-restart.1 - -$(top_builddir)/orte/tools/orte-migrate/orte-migrate.1: - (cd $(top_builddir)/orte/tools/orte-migrate && $(MAKE) $(AM_MAKEFLAGS) orte-migrate.1) - -ompi-migrate.1: $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1 - cp -f $(top_builddir)/orte/tools/orte-migrate/orte-migrate.1 ompi-migrate.1 - $(top_builddir)/orte/tools/orte-top/orte-top.1: (cd $(top_builddir)/orte/tools/orte-top && $(MAKE) $(AM_MAKEFLAGS) orte-top.1) diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index ce477ff9aa..4e08658e5f 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -25,29 +25,23 @@ # orte/Makefile.am SUBDIRS += \ - tools/orte-checkpoint \ tools/orte-clean \ tools/orte-ps \ - tools/orte-restart \ tools/orted \ tools/orterun \ tools/wrappers \ tools/orte-top \ tools/orte-info \ - tools/orte-migrate \ tools/orte-server DIST_SUBDIRS += \ - tools/orte-checkpoint \ tools/orte-clean \ tools/orte-ps \ - tools/orte-restart \ tools/orted \ tools/orterun \ tools/wrappers \ tools/orte-top \ tools/orte-info \ - tools/orte-migrate \ tools/orte-server \ tools/orte-dvm \ tools/prun diff --git a/orte/tools/orte-checkpoint/Makefile.am b/orte/tools/orte-checkpoint/Makefile.am deleted file mode 100644 index ca0235630e..0000000000 --- a/orte/tools/orte-checkpoint/Makefile.am +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. -# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -include $(top_srcdir)/Makefile.ompi-rules - -man_pages = orte-checkpoint.1 -EXTRA_DIST = orte-checkpoint.1in - -if WANT_FT_CR -if OPAL_INSTALL_BINARIES - -bin_PROGRAMS = orte-checkpoint - -nodist_man_MANS = $(man_pages) - -# Ensure that the man pages are rebuilt if the opal_config.h file -# changes; a "good enough" way to know if configure was run again (and -# therefore the release date or version may have changed) -$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h - -dist_ortedata_DATA = help-orte-checkpoint.txt - -endif # OPAL_INSTALL_BINARIES - -orte_checkpoint_SOURCES = orte-checkpoint.c -orte_checkpoint_LDADD = \ - $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la - -endif # WANT_FT_CR - -distclean-local: - rm -f $(man_pages) diff --git a/orte/tools/orte-checkpoint/help-orte-checkpoint.txt b/orte/tools/orte-checkpoint/help-orte-checkpoint.txt deleted file mode 100644 index 426bd6929e..0000000000 --- a/orte/tools/orte-checkpoint/help-orte-checkpoint.txt +++ /dev/null @@ -1,113 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI checkpoint tool -# -[usage] -ompi-checkpoint PID_OF_MPIRUN - Open MPI Checkpoint Tool - -%s -# -[usage-no-cr] -This build of Open MPI does *not* include Checkpoint/Restart functionality. -If you require this functionality re-configure Open MPI with the proper -Checkpoint/Restart options. - -ompi-checkpoint PID_OF_MPIRUN - Open MPI Checkpoint Tool - -%s -# -[invalid_pid] -Error: The PID (%d) is invalid because either you have not provided a PID - or provided an invalid PID. - Please see --help for usage. -# -[ckpt_failure] -Error: The application (PID = %d) failed to checkpoint properly. - Returned %d. -# -[pid_does_not_exist] -Error: The process with PID %d is not checkpointable. - This could be due to one of the following: - - An application with this PID doesn't currently exist - - The application with this PID isn't checkpointable - - The application with this PID isn't an Open MPI application. - We were looking for the named file: - %s -# -[no_hnps] -Error: Unable to find a list of active MPIRUN processes on this machine. - This could be due to one of the following: - - The PID specified (%d) is not that of an active MPIRUN. - - The session directory location could not be found/parsed. - - ompi-checkpoint attempted to find the session directory: - %s/%s - Check to make sure that this directory exists while the MPIRUN - process is running. - - Return Code: %d (%s) -# -[no_universe] -Error: Unable to find the requested, active MPIRUN process on this machine. - This could be due to one of the following: - - The jobid specified by the '--hnp-jobid' option is not - correct. - - The PID specified (%d) is not that of an active MPIRUN. - - The application with this PID is not checkpointable - - The application with this PID is not an Open MPI application. - - The session directory location could not be parsed. - - ompi-checkpoint attempted to use the session directory: - %s/%s -# -[unable_to_connect] -Error: Unable to connect to the Head Node Process to initiate the - checkpoint of the application. - This could be due to one of the following: - - The universe specified by the '--hnp-jobid' option is not - correct. - - The PID is not that of an active MPIRUN. - - The application with this PID isn't checkpointable - - The application with this PID isn't an Open MPI application. -# -[non-ckptable] -Error: The job with pid %d is not checkpointable. - This could be caused by one of the following: - - The application is using unsupported components. - - Your application did not select to be checkpointable -To enable checkpointing in an application use the following AMCA parameter -argument to mpirun: - -am ft-enable-cr -# -[not_impl] -The following feature was requested, but is not currently implemented. - %s -If you require this feature contact the Open MPI development group. - -[pid_not_found] -Error: The process with PID %d is not checkpointable. - This could be due to one of the following: - - An application with this PID doesn't currently exist - - The application with this PID isn't an Open MPI application. -# -[hnp_not_found] -Error: The jobid specified by the '--hnp-jobid' option does not exist. diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.1in b/orte/tools/orte-checkpoint/orte-checkpoint.1in deleted file mode 100644 index 806403bc8e..0000000000 --- a/orte/tools/orte-checkpoint/orte-checkpoint.1in +++ /dev/null @@ -1,103 +0,0 @@ -.\" -.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana -.\" University Research and Technology -.\" Corporation. All rights reserved. -.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. -.\" -.\" Man page for OMPI's ompi-checkpoint command -.\" -.\" .TH name section center-footer left-footer center-header -.TH OMPI-CHECKPOINT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" -.\" ************************** -.\" Name Section -.\" ************************** -.SH NAME -. -ompi-checkpoint, orte-checkpoint \- Checkpoint a running parallel process using the Open MPI -Checkpoint/Restart Service (CRS) -. -.PP -. -\fBNOTE:\fP \fIompi-checkpoint\fP, and \fIorte-checkpoint\fP are all exact -synonyms for each other. Using any of the names will result in exactly -identical behavior. -. -.\" ************************** -.\" Synopsis Section -.\" ************************** -.SH SYNOPSIS -. -.B ompi-checkpoint -.B [ options ] -.B -. -.\" ************************** -.\" Options Section -.\" ************************** -.SH Options -. -\fIorte-checkpoint\fR will attempt to notify a running parallel job (identified -by \fImpirun\fP) that it has been requested that the job checkpoint itself. A -global snapshot handle reference is presented to the user, which is used in -\fIompi_restart\fP to restart the job. -. -.TP 10 -.B -Process ID of the \fImpirun\fP process. -. -. -.TP -.B -h | --help -Display help for this command -. -. -.TP -.B -w | --nowait -Do not wait for the application to finish checkpointing before returning. -. -. -.TP -.B -s | --status -Display status messages regarding the progression of the checkpoint request. -. -. -.TP -.B --term -After checkpointing the running job, terminate it. -. -. -.TP -.B -v | --verbose -Enable verbose output for debugging. -. -. -.TP -.B -gmca | --gmca \fR \fP -Pass global MCA parameters that are applicable to all contexts. \fI\fP is -the parameter name; \fI\fP is the parameter value. -. -. -.TP -.B -mca | --mca -Send arguments to various MCA modules. -. -. -.\" ************************** -.\" Description Section -.\" ************************** -.SH DESCRIPTION -. -.PP -\fIorte-checkpoint\fR can be invoked multiple, non-overlapping times. -It is convenient to note that the user does not need to spectify -the checkpointer to be used here, as that is determined completely by each of -the running process in the job being checkpointed. -. -. -.\" ************************** -.\" See Also Section -.\" ************************** -. -.SH SEE ALSO - orte-ps(1), orte-clean(1), ompi-restart(1), opal-checkpoint(1), opal-restart(1), opal_crs(7) -. diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c deleted file mode 100644 index 4a70de4763..0000000000 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ /dev/null @@ -1,985 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * ORTE Checkpoint Tool for checkpointing a multiprocess job - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_FCNTL_H -#include -#endif /* HAVE_FCNTL_H */ -#ifdef HAVE_SYS_TYPES_H -#include -#endif /* HAVE_SYS_TYPES_H */ -#ifdef HAVE_SYS_STAT_H -#include /* for mkfifo */ -#endif /* HAVE_SYS_STAT_H */ -#ifdef HAVE_SYS_WAIT_H -#include -#endif /* HAVE_SYS_WAIT_H */ -#include - - -#include "opal/util/cmd_line.h" -#include "opal/util/output.h" -#include "opal/util/argv.h" -#include "opal/util/opal_environ.h" -#include "opal/mca/base/base.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" -#include "opal/runtime/opal.h" -#include "opal/runtime/opal_cr.h" - -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_cr.h" -#include "orte/util/hnp_contact.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "opal/util/show_help.h" -#include "orte/util/proc_info.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/errmgr/errmgr.h" -#include "opal/dss/dss.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" -#include "orte/mca/sstore/sstore.h" -#include "orte/mca/sstore/base/base.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -/****************** - * Local Functions - ******************/ -static int ckpt_init(int argc, char *argv[]); /* Initalization routine */ -static int ckpt_finalize(void); /* Finalization routine */ -static int parse_args(int argc, char *argv[]); -static int find_hnp(void); - -static int start_listener(void); -static int stop_listener(void); -static void hnp_receiver(int status, - orte_process_name_t* sender, - opal_buffer_t* buffer, - orte_rml_tag_t tag, - void* cbdata); - -static void process_ckpt_update_cmd(orte_process_name_t* sender, - opal_buffer_t* buffer); - -static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options); -static int pretty_print_status(void); -static int pretty_print_reference(void); - -static int list_all_snapshots(void); - -static orte_hnp_contact_t *orterun_hnp = NULL; -static char * global_snapshot_handle = NULL; -static int global_sequence_num = 0; - -/***************************************** - * Global Vars for Command line Arguments - *****************************************/ -static bool listener_started = false; -static bool is_checkpoint_finished = false; -static bool is_checkpoint_established = false; -static bool is_checkpoint_recovered = false; - -static double timer_start = 0; -static double timer_last = 0; -static double get_time(void); - -typedef struct { - bool help; - int pid; - opal_crs_base_ckpt_options_t *options; - bool term; - bool stop; - bool verbose; - int verbose_level; - orte_jobid_t req_hnp; /**< User Requested HNP */ - bool nowait; /* Do not wait for checkpoint to complete before returning */ - bool status; /* Display status messages while checkpoint is progressing */ - int output; - int ckpt_status; - bool list_only; /* List available checkpoints only */ -#if OPAL_ENABLE_CRDEBUG == 1 - bool enable_crdebug; /* Enable C/R Debugging */ - bool attach_debugger; - bool detach_debugger; -#endif -} orte_checkpoint_globals_t; - -orte_checkpoint_globals_t orte_checkpoint_globals; - -opal_cmd_line_init_t cmd_line_opts[] = { - { NULL, - 'h', NULL, "help", - 0, - &orte_checkpoint_globals.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - - { NULL, - 'v', NULL, "verbose", - 0, - &orte_checkpoint_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be Verbose" }, - - { NULL, - 'V', NULL, NULL, - 1, - &orte_checkpoint_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT, - "Set the verbosity level (For additional debugging information)" }, - - { NULL, - '\0', NULL, "term", - 0, - &(orte_checkpoint_globals.term), OPAL_CMD_LINE_TYPE_BOOL, - "Terminate the application after checkpoint (Cannot be used with --stop)" }, - - { NULL, - '\0', NULL, "stop", - 0, - &(orte_checkpoint_globals.stop), OPAL_CMD_LINE_TYPE_BOOL, - "Send SIGSTOP to application just after checkpoint (checkpoint will not finish until SIGCONT is sent) (Cannot be used with --term)" }, - - { NULL, - 'w', NULL, "nowait", - 0, - &orte_checkpoint_globals.nowait, OPAL_CMD_LINE_TYPE_BOOL, - "Do not wait for the application to finish checkpointing before returning" }, - - { NULL, - 's', NULL, "status", - 0, - &orte_checkpoint_globals.status, OPAL_CMD_LINE_TYPE_BOOL, - "Display status messages describing the progression of the checkpoint" }, - - { "hnp-jobid", - '\0', NULL, "hnp-jobid", - 1, - &orte_checkpoint_globals.req_hnp, OPAL_CMD_LINE_TYPE_INT, - "This should be the jobid of the HNP whose applications you wish " - "to checkpoint." }, - - { "hnp-pid", - '\0', NULL, "hnp-pid", - 1, - &orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT, - "This should be the pid of the mpirun whose applications you wish " - "to checkpoint." }, - - { NULL, - 'l', NULL, "list", - 0, - &orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL, - "Display a list of checkpoint files available on this machine" }, - -#if OPAL_ENABLE_CRDEBUG == 1 - { NULL, - '\0', "crdebug", "crdebug", - 0, - &orte_checkpoint_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL, - "Enable C/R Enhanced Debugging" }, - - { NULL, - '\0', "attach", "attach", - 0, - &(orte_checkpoint_globals.attach_debugger), OPAL_CMD_LINE_TYPE_BOOL, - "Wait for the debugger to attach directly after taking the checkpoint." }, - - { NULL, - '\0', "detach", "detach", - 0, - &(orte_checkpoint_globals.detach_debugger), OPAL_CMD_LINE_TYPE_BOOL, - "Do not wait for the debugger to reattach after taking the checkpoint." }, -#endif - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, - NULL } -}; - -int -main(int argc, char *argv[]) -{ - int ret, exit_status = ORTE_SUCCESS; - - /*************** - * Initialize - ***************/ - if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) { - exit_status = ret; - goto cleanup; - } - - /************************************* - * Listing only Checkpoint References - *************************************/ - if( orte_checkpoint_globals.list_only ) { - if (ORTE_SUCCESS != (ret = list_all_snapshots())) { - exit_status = ret; - goto cleanup; - } - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /*************************** - * Find the HNP that we want to connect to, if it exists - ***************************/ - if (ORTE_SUCCESS != (ret = find_hnp())) { - /* Error printed by called function */ - exit_status = ret; - goto cleanup; - } - - /******************************* - * Checkpoint the requested PID - *******************************/ - is_checkpoint_finished = false; - is_checkpoint_recovered = false; - is_checkpoint_established = false; - - if( orte_checkpoint_globals.verbose ) { - opal_output_verbose(10, orte_checkpoint_globals.output, - "orte_checkpoint: Checkpointing..."); - if (0 < orte_checkpoint_globals.pid) { - opal_output_verbose(10, orte_checkpoint_globals.output, - "\t PID %d", - orte_checkpoint_globals.pid); - } else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){ - opal_output_verbose(10, orte_checkpoint_globals.output, - "\t Mpirun (%s)", - ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp)); - } - - opal_output_verbose(10, orte_checkpoint_globals.output, - "\t Connected to Mpirun %s", - ORTE_NAME_PRINT(&orterun_hnp->name)); - - if(orte_checkpoint_globals.options->term) { - opal_output_verbose(10, orte_checkpoint_globals.output, - "\t Terminating after checkpoint\n"); - } - if(orte_checkpoint_globals.options->stop) { - opal_output_verbose(10, orte_checkpoint_globals.output, - "\t Stopping after checkpoint\n"); - } - } - - if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.options)) ) { - opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, - orte_checkpoint_globals.pid, ret); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Wait for the checkpoint to complete - */ - if(!orte_checkpoint_globals.nowait) { - while( !is_checkpoint_finished ) { - opal_progress(); - } - } - - if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status || - ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) { - exit_status = ORTE_ERROR; - goto cleanup; - } - - if(!orte_checkpoint_globals.nowait) { - pretty_print_reference(); - } - - cleanup: - /*************** - * Cleanup - ***************/ - if (ORTE_SUCCESS != (ret = ckpt_finalize())) { - return ret; - } - - return exit_status; -} - -static int parse_args(int argc, char *argv[]) { - int i, ret, len, exit_status = ORTE_SUCCESS ; - opal_cmd_line_t cmd_line; - char **app_env = NULL, **global_env = NULL; - char * tmp_env_var = NULL; - char *argv0 = NULL; - - /* Init structure */ - memset(&orte_checkpoint_globals, 0, sizeof(orte_checkpoint_globals_t)); - orte_checkpoint_globals.help = false; - orte_checkpoint_globals.pid = -1; - orte_checkpoint_globals.verbose = false; - orte_checkpoint_globals.verbose_level = 0; - orte_checkpoint_globals.req_hnp = ORTE_JOBID_INVALID; - orte_checkpoint_globals.nowait = false; - orte_checkpoint_globals.status = false; - orte_checkpoint_globals.output = -1; - orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE; - orte_checkpoint_globals.list_only = false; -#if OPAL_ENABLE_CRDEBUG == 1 - orte_checkpoint_globals.enable_crdebug = false; -#endif - - orte_checkpoint_globals.options = OBJ_NEW(opal_crs_base_ckpt_options_t); - orte_checkpoint_globals.term = false; - orte_checkpoint_globals.stop = false; -#if OPAL_ENABLE_CRDEBUG == 1 - orte_checkpoint_globals.attach_debugger = false; - orte_checkpoint_globals.detach_debugger = false; -#endif - -#if OPAL_ENABLE_FT_CR == 0 - /* Warn and exit if not configured with Checkpoint/Restart */ - { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-checkpoint.txt", "usage-no-cr", - true, args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - exit_status = ORTE_ERROR; - goto cleanup; - } -#endif - - /* Parse the command line options */ - opal_cmd_line_create(&cmd_line, cmd_line_opts); - mca_base_open(); - mca_base_cmd_line_setup(&cmd_line); - ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv); - - if (OPAL_SUCCESS != ret) { - if (OPAL_ERR_SILENT != ret) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(ret)); - } - exit_status = 1; - goto cleanup; - } - - if (orte_checkpoint_globals.help) { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-checkpoint.txt", "usage", true, - args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - /* If we show the help message, that should be all we do */ - exit(0); - } - - /** - * Put all of the MCA arguments in the environment - */ - mca_base_cmd_line_process_args(argc, &app_env, &global_env); - - len = opal_argv_count(app_env); - for(i = 0; i < len; ++i) { - putenv(app_env[i]); - } - - len = opal_argv_count(global_env); - for(i = 0; i < len; ++i) { - putenv(global_env[i]); - } - - (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); - opal_setenv(tmp_env_var, - "1", - true, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - /** - * Now start parsing our specific arguments - */ - /* get the remaining bits */ - argv0 = strdup(argv[0]); - opal_cmd_line_get_tail(&cmd_line, &argc, &argv); - - if(orte_checkpoint_globals.list_only ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - if (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp) { - fprintf(stderr, "%s: Nothing to do\n", argv0); - fprintf(stderr, "Type '%s --help' for usage.\n", argv0); - exit_status = 1; - goto cleanup; - } - - orte_checkpoint_globals.options->term = orte_checkpoint_globals.term; - orte_checkpoint_globals.options->stop = orte_checkpoint_globals.stop; -#if OPAL_ENABLE_CRDEBUG == 1 - orte_checkpoint_globals.options->attach_debugger = orte_checkpoint_globals.attach_debugger; - orte_checkpoint_globals.options->detach_debugger = orte_checkpoint_globals.detach_debugger; -#endif - - if(orte_checkpoint_globals.verbose_level < 0 ) { - orte_checkpoint_globals.verbose_level = 0; - } - - if(orte_checkpoint_globals.verbose_level > 0) { - orte_checkpoint_globals.verbose = true; - } - - /* - * If the user did not supply an hnp jobid, then they must - * supply the PID of MPIRUN - */ - if(0 >= argc && - ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - orte_checkpoint_globals.pid = atoi(argv[0]); - if ( 0 >= orte_checkpoint_globals.pid ) { - opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true, - orte_checkpoint_globals.pid); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * JJH: No wait is currently not implemented or tested - */ - if(orte_checkpoint_globals.nowait) { - orte_checkpoint_globals.nowait = false; - opal_show_help("help-orte-checkpoint.txt", "not_impl", - true, - "Disconnected checkpoint"); - } - - if(orte_checkpoint_globals.verbose) { - orte_checkpoint_globals.status = true; - } - - cleanup: - if (NULL != argv0) { - free(argv0); - } - - return exit_status; -} - -/* - * This function attempts to find an HNP to connect to. - */ -static int find_hnp(void) { - int ret, exit_status = ORTE_SUCCESS; - opal_list_t hnp_list; - opal_list_item_t *item; - orte_hnp_contact_t *hnpcandidate; - - /* get the list of local hnp's available to us and setup - * contact info for them into the RML - */ - OBJ_CONSTRUCT(&hnp_list, opal_list_t); - if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) { - opal_show_help("help-orte-checkpoint.txt", "no_hnps", true, - orte_checkpoint_globals.pid, - orte_process_info.tmpdir_base, - orte_process_info.top_session_dir, - ret, ORTE_ERROR_NAME(ret)); - exit_status = ret; - goto cleanup; - } - - /* search the list for the desired hnp */ - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - hnpcandidate = (orte_hnp_contact_t*)item; - if (hnpcandidate->name.jobid == orte_checkpoint_globals.req_hnp || - hnpcandidate->pid == orte_checkpoint_globals.pid) { - /* this is the one we want */ - orterun_hnp = hnpcandidate; - exit_status = ORTE_SUCCESS; - goto cleanup; - } - } - - /* If no match was found, error out */ - opal_show_help("help-orte-checkpoint.txt", "no_universe", true, - orte_checkpoint_globals.pid, - orte_process_info.tmpdir_base, - orte_process_info.top_session_dir); - -cleanup: - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&hnp_list); - - if( NULL == orterun_hnp ) { - return ORTE_ERROR; - } else { - return exit_status; - } -} - -static int ckpt_init(int argc, char *argv[]) { - int exit_status = ORTE_SUCCESS, ret; - char * tmp_env_var = NULL; - - listener_started = false; - - /* - * Make sure to init util before parse_args - * to ensure installdirs is setup properly - * before calling mca_base_open(); - */ - if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) { - return ret; - } - - /* - * Parse Command Line Arguments - */ - if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) { - return ret; - } - - /* Disable the checkpoint notification routine for this - * tool. As we will never need to checkpoint this tool. - * Note: This must happen before opal_init(). - */ - opal_cr_set_enabled(false); - - /* Select the none component, since we don't actually use a checkpointer */ - (void) mca_base_var_env_name("crs", &tmp_env_var); - opal_setenv(tmp_env_var, - "none", - true, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - /* we are never allowed to operate as a distributed tool, - * so insist on the ess/tool component */ - opal_setenv("OMPI_MCA_ess", "tool", true, &environ); - - /*************************** - * We need all of OPAL and the TOOLS portion of ORTE - this - * sets us up so we can talk to any HNP over the wire - ***************************/ - if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { - exit_status = ret; - goto cleanup; - } - - /* - * Setup ORTE Output handle from the verbose argument - */ - if( orte_checkpoint_globals.verbose ) { - orte_checkpoint_globals.output = opal_output_open(NULL); - opal_output_set_verbosity(orte_checkpoint_globals.output, orte_checkpoint_globals.verbose_level); - } else { - orte_checkpoint_globals.output = 0; /* Default=STDERR */ - } - - /* - * Start the listener - */ - if( ORTE_SUCCESS != (ret = start_listener() ) ) { - exit_status = ret; - } - - cleanup: - return exit_status; -} - -static int ckpt_finalize(void) { - int exit_status = ORTE_SUCCESS, ret; - - /* - * Stop the listener - */ - if( ORTE_SUCCESS != (ret = stop_listener() ) ) { - exit_status = ret; - } - - if (ORTE_SUCCESS != (ret = orte_finalize())) { - exit_status = ret; - } - - return exit_status; -} - -static int start_listener(void) -{ - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT, - ORTE_RML_PERSISTENT, hnp_receiver, NULL); - - listener_started = true; - return ORTE_SUCCESS; -} - -static int stop_listener(void) -{ - if( !listener_started ) { - return ORTE_ERROR; - } - - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT); - - listener_started = false; - return ORTE_SUCCESS; -} - -static void hnp_receiver(int status, - orte_process_name_t* sender, - opal_buffer_t* buffer, - orte_rml_tag_t tag, - void* cbdata) -{ - orte_snapc_cmd_flag_t command; - orte_std_cntr_t count; - int rc; - - opal_output_verbose(5, orte_checkpoint_globals.output, - "orte_checkpoint: hnp_receiver: Receive a command message."); - - /* - * Otherwise this is an inter-coordinator command (usually updating state info). - */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_CMD))) { - ORTE_ERROR_LOG(rc); - return; - } - - switch (command) { - case ORTE_SNAPC_GLOBAL_UPDATE_CMD: - opal_output_verbose(10, orte_checkpoint_globals.output, - "orte_checkpoint: hnp_receiver: Status Update."); - - process_ckpt_update_cmd(sender, buffer); - break; - - case ORTE_SNAPC_GLOBAL_INIT_CMD: - case ORTE_SNAPC_GLOBAL_TERM_CMD: - /* Do Nothing */ - break; - default: - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - } -} - -static void process_ckpt_update_cmd(orte_process_name_t* sender, - opal_buffer_t* buffer) -{ - int ret; - orte_std_cntr_t count = 1; - int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE; - - /* - * Receive the data: - * - ckpt_state - * - global snapshot handle (upon finish only) - * - sequence number (upon finish only) - */ - count = 1; - if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) { - return; - } - orte_checkpoint_globals.ckpt_status = ckpt_status; - - if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status || - ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status || - ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status || - ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) { - count = 1; - if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING)) ) { - return; - } - count = 1; - if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) { - return; - } - } - - /* - * If the job is not able to be checkpointed, then return - */ - if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) { - opal_show_help("help-orte-checkpoint.txt", "non-ckptable", - true, - orte_checkpoint_globals.pid); - is_checkpoint_finished = true; - return; - } - - if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) { - opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, - orte_checkpoint_globals.pid, ORTE_ERROR); - is_checkpoint_finished = true; - return; - } - - /* Status progression */ - if( orte_checkpoint_globals.status ) { - pretty_print_status(); - } - - if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) { - is_checkpoint_finished = true; - return; - } - - /* Normal termination check */ - if( (ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) || - (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){ - is_checkpoint_finished = true; - return; - } - else if( ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status ) { - is_checkpoint_recovered = true; - } - else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) { - is_checkpoint_established = true; - } -} - -static int notify_process_for_checkpoint(opal_crs_base_ckpt_options_t *options) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t *buffer = NULL; - orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_INIT_CMD; - orte_jobid_t jobid = ORTE_JOBID_INVALID; - - if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) { - exit_status = ORTE_ERROR; - goto cleanup; - } - - opal_output_verbose(10, orte_checkpoint_globals.output, - "orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n", - orte_checkpoint_globals.pid); - - timer_start = get_time(); - - /*********************************** - * Notify HNP of checkpoint request - * Send: - * - Command - * - options - * - jobid - ***********************************/ - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_CMD)) ) { - exit_status = ret; - goto cleanup; - } - - if( ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options)) ) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer, - ORTE_RML_TAG_CKPT, orte_rml_send_callback, - NULL))) { - exit_status = ret; - goto cleanup; - } - - opal_output_verbose(10, orte_checkpoint_globals.output, - "orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n", - ORTE_JOBID_PRINT(jobid)); - - cleanup: - if( ORTE_SUCCESS != exit_status ) { - opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true, - orte_checkpoint_globals.pid); - } - - return exit_status; -} - -/*************** - * Pretty Print - ***************/ -static double get_time(void) { - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static int pretty_print_status(void) { - char * state_str = NULL; - double cur_time; - - cur_time = get_time(); - - if( timer_last == 0 ) { - timer_last = cur_time; - } - - orte_snapc_ckpt_state_str(&state_str, orte_checkpoint_globals.ckpt_status); - - if( NULL != global_snapshot_handle ) { - opal_output(0, - "[%6.2f / %6.2f] %*s - %s\n", - (cur_time - timer_last), (cur_time - timer_start), - 25, state_str, global_snapshot_handle); - } else { - opal_output(0, - "[%6.2f / %6.2f] %*s - ...\n", - (cur_time - timer_last), (cur_time - timer_start), - 25, state_str); - } - - if( NULL != state_str) { - free(state_str); - } - - timer_last = cur_time; - - return ORTE_SUCCESS; -} - -static int pretty_print_reference(void) -{ -#if OPAL_ENABLE_CRDEBUG == 1 - if( orte_checkpoint_globals.enable_crdebug ) { - printf("Checkpoint handle: -s %3d %s\n", - global_sequence_num, - global_snapshot_handle); - return ORTE_SUCCESS; - } -#endif - - printf("Snapshot Ref.: %3d %s\n", - global_sequence_num, - global_snapshot_handle); - - return ORTE_SUCCESS; -} - -static int list_all_snapshots(void) { - int ret, exit_status = ORTE_SUCCESS; - opal_list_t *all_snapshots = NULL; - opal_list_item_t* item = NULL; - orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL; - - all_snapshots = OBJ_NEW(opal_list_t); - - if( ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL)) ) { - opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n", - orte_sstore_base_global_snapshot_dir); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * For each reference - */ - for(item = opal_list_get_first(all_snapshots); - item != opal_list_get_end(all_snapshots); - item = opal_list_get_next(item) ) { - global_snapshot = (orte_sstore_base_global_snapshot_info_t*)item; - - /* - * Get a list of valid sequence numbers - */ - if( ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot, - &(global_snapshot->num_seqs), - &(global_snapshot->all_seqs)))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - -#if OPAL_ENABLE_CRDEBUG == 1 - int s; - /* Pretty print the result - C/R Debug version */ - if( orte_checkpoint_globals.enable_crdebug ) { - for(s = 0; s < global_snapshot->num_seqs; ++s) { - printf("-s %s %s\n", global_snapshot->all_seqs[s], global_snapshot->reference); - } - } - else -#endif - { - /* Pretty print the result */ - printf("Snapshot Ref.: %s\t[", - global_snapshot->reference); - if( 0 >= global_snapshot->num_seqs ) { - printf("No Valid Checkpoints"); - } else { - printf("%s", - opal_argv_join(global_snapshot->all_seqs, ',')); - } - printf("]\n"); - } - } - - cleanup: - while (NULL != (item = opal_list_remove_first(all_snapshots))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(all_snapshots); - - return exit_status; -} diff --git a/orte/tools/orte-migrate/Makefile.am b/orte/tools/orte-migrate/Makefile.am deleted file mode 100644 index b38a7a8274..0000000000 --- a/orte/tools/orte-migrate/Makefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. -# -# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -include $(top_srcdir)/Makefile.ompi-rules - -man_pages = orte-migrate.1 -EXTRA_DIST = orte-migrate.1in - -if WANT_FT_CR -if OPAL_INSTALL_BINARIES - -bin_PROGRAMS = orte-migrate - -nodist_man_MANS = $(man_pages) - -# Ensure that the man pages are rebuilt if the opal_config.h file -# changes; a "good enough" way to know if configure was run again (and -# therefore the release date or version may have changed) -$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h - -dist_ortedata_DATA = help-orte-migrate.txt - -endif # OPAL_INSTALL_BINARIES - -orte_migrate_SOURCES = orte-migrate.c -orte_migrate_LDADD = \ - $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la - -endif # WANT_FT_CR - -distclean-local: - rm -f $(man_pages) diff --git a/orte/tools/orte-migrate/help-orte-migrate.txt b/orte/tools/orte-migrate/help-orte-migrate.txt deleted file mode 100644 index 7c4c9de6ff..0000000000 --- a/orte/tools/orte-migrate/help-orte-migrate.txt +++ /dev/null @@ -1,81 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2014 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI migrate tool -# -[usage] -ompi-migrate PID_OF_MPIRUN - Open MPI Process Migration Tool - -%s -# -[usage-no-cr] -This build of Open MPI does *not* include Checkpoint/Restart functionality. -If you require this functionality re-configure Open MPI with the proper -Checkpoint/Restart options. - -ompi-migrate PID_OF_MPIRUN - Open MPI Migrage Tool - -%s -# -[invalid_pid] -Error: The PID (%d) is invalid because either you have not provided a PID - or provided an invalid PID. - Please see --help for usage. -# -[no_universe] -Error: Unable to find the contact information for PID %d. - This could be due to one of the following: - - The PID is not that of an active MPIRUN. - - The application with this PID isn't migratable - - The application with this PID isn't an Open MPI application. - ompi-migrate attempted to find the session directory: - %s -# -[unable_to_connect] -Error: Unable to connect to the Head Node Process to initiate the - migration of the application. - This could be due to one of the following: - - The PID is not that of an active MPIRUN. - - The application with this PID isn't migratable - - The application with this PID isn't an Open MPI application. -# -[non-ckptable] -Error: The job with pid %d is not checkpointable. - This could be caused by one of the following: - - The application is using unsupported components. - - Your application did not select to be checkpointable -To enable checkpointing in an application use the following AMCA parameter -argument to mpirun: - -am ft-enable-cr -# -[not_impl] -The following feature was requested, but is not currently implemented. - %s -If you require this feature contact the Open MPI development group. -# -[err-inprogress] -Error: The Job identified by PID (%d) is currently migrating other processes. - Only one migration request can be processed at a time. Please try again - later. -# -[err-other] -Error: The Job identified by PID (%d) was not able to migrate processes in this - job. This could be caused by any of the following: - - Invalid node or rank specified - - No processes on the indicated node can by migrated - - Process migration was not enabled for this job. Make sure to indicate - the proper AMCA file: "-am ft-enable-cr-recovery". diff --git a/orte/tools/orte-migrate/orte-migrate.1in b/orte/tools/orte-migrate/orte-migrate.1in deleted file mode 100644 index 079c95e552..0000000000 --- a/orte/tools/orte-migrate/orte-migrate.1in +++ /dev/null @@ -1,81 +0,0 @@ -.\" -.\" Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana -.\" University Research and Technology -.\" Corporation. All rights reserved. -.\" -.\" Man page for OMPI's ompi-migrate command -.\" -.\" .TH name section center-footer left-footer center-header -.TH OMPI-MIGRATE 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" -.\" ************************** -.\" Name Section -.\" ************************** -.SH NAME -. -ompi-migrate, orte-migrate \- Migrate processes among resources in Open MPI. -. -.PP -. -\fBNOTE:\fP \fIompi-migrate\fP, and \fIorte-migrate\fP are all exact -synonyms for each other. Using any of the names will result in exactly -identical behavior. -. -.\" ************************** -.\" Synopsis Section -.\" ************************** -.SH SYNOPSIS -. -.B ompi-migrate -.R [ options ] -.B -. -.\" ************************** -.\" Options Section -.\" ************************** -.SH Options -. -\fIorte-migrate\fR will attempt to notify a running parallel job (identified -by \fImpirun\fP) that a migration has been requeted. -. -.TP 10 -.B -Process ID of the \fImpirun\fP process. -. -. -.TP -.B -h | --help -Display help for this command -. -. -.TP -.B -v | --verbose -Enable verbose output for debugging. -. -. -.TP -.B -gmca | --gmca \fR \fP -Pass global MCA parameters that are applicable to all contexts. \fI\fP is -the parameter name; \fI\fP is the parameter value. -. -. -.TP -.B -mca | --mca -Send arguments to various MCA modules. -. -. -.\" ************************** -.\" Description Section -.\" ************************** -.SH DESCRIPTION -. -.PP -\fIorte-migrate\fR can be invoked multiple, non-overlapping times. -. -. -.\" ************************** -.\" See Also Section -.\" ************************** -. -.SH SEE ALSO - orte-ps(1), orte-clean(1), ompi-restart(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7) -. diff --git a/orte/tools/orte-migrate/orte-migrate.c b/orte/tools/orte-migrate/orte-migrate.c deleted file mode 100644 index dddf2d4fa1..0000000000 --- a/orte/tools/orte-migrate/orte-migrate.c +++ /dev/null @@ -1,791 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * ORTE Process Migration Tool for migrating processes in a multiprocess job - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_FCNTL_H -#include -#endif /* HAVE_FCNTL_H */ -#ifdef HAVE_SYS_TYPES_H -#include -#endif /* HAVE_SYS_TYPES_H */ -#ifdef HAVE_SYS_STAT_H -#include /* for mkfifo */ -#endif /* HAVE_SYS_STAT_H */ -#ifdef HAVE_SYS_WAIT_H -#include -#endif /* HAVE_SYS_WAIT_H */ -#include - - -#include "opal/util/cmd_line.h" -#include "opal/util/output.h" -#include "opal/util/argv.h" -#include "opal/util/opal_environ.h" -#include "opal/mca/base/base.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" -#include "opal/runtime/opal.h" -#include "opal/runtime/opal_cr.h" - -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_cr.h" -#include "orte/util/hnp_contact.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "opal/util/show_help.h" -#include "orte/util/proc_info.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/errmgr/errmgr.h" -#include "opal/dss/dss.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -/****************** - * Local Functions - ******************/ -static int tool_init(int argc, char *argv[]); /* Initalization routine */ -static int tool_finalize(void); /* Finalization routine */ -static int parse_args(int argc, char *argv[]); -static int find_hnp(void); - -static int start_listener(void); -static int stop_listener(void); -static void hnp_receiver(int status, - orte_process_name_t* sender, - opal_buffer_t* buffer, - orte_rml_tag_t tag, - void* cbdata); - -static void process_ckpt_update_cmd(orte_process_name_t* sender, - opal_buffer_t* buffer); - -static int notify_hnp(void); -static int pretty_print_status(void); -static int pretty_print_migration(void); - -static orte_hnp_contact_t *orterun_hnp = NULL; -static int orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; - -/***************************************** - * Global Vars for Command line Arguments - *****************************************/ -static bool listener_started = false; - -static double timer_start = 0; -static double timer_last = 0; -static double get_time(void); - -typedef struct { - bool help; - int pid; - bool verbose; - int verbose_level; - bool status; - int output; - char *off_nodes; - char *off_procs; - char *onto_nodes; -} orte_migrate_globals_t; - -orte_migrate_globals_t orte_migrate_globals; - -opal_cmd_line_init_t cmd_line_opts[] = { - { NULL, - 'h', NULL, "help", - 0, - &orte_migrate_globals.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - - { NULL, - 'v', NULL, "verbose", - 0, - &orte_migrate_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be Verbose" }, - - { NULL, - 'V', NULL, NULL, - 1, - &orte_migrate_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT, - "Set the verbosity level (For additional debugging information)" }, - - { "hnp-pid", - '\0', NULL, "hnp-pid", - 1, - &orte_migrate_globals.pid, OPAL_CMD_LINE_TYPE_INT, - "This should be the pid of the mpirun whose applications you wish " - "to migrate." }, - - { NULL, - 'x', NULL, "off", - 1, - &orte_migrate_globals.off_nodes, OPAL_CMD_LINE_TYPE_STRING, - "List of nodes to migrate off of (comma separated)" }, - - { NULL, - 'r', NULL, "ranks", - 1, - &orte_migrate_globals.off_procs, OPAL_CMD_LINE_TYPE_STRING, - "List of MPI_COMM_WORLD ranks to migrate (comma separated)" }, - - { NULL, - 't', NULL, "onto", - 1, - &orte_migrate_globals.onto_nodes, OPAL_CMD_LINE_TYPE_STRING, - "List of nodes to migrate onto (comma separated)" }, - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, - NULL } -}; - -int -main(int argc, char *argv[]) -{ - int ret, exit_status = ORTE_SUCCESS; - - /*************** - * Initialize - ***************/ - if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) { - exit_status = ret; - goto cleanup; - } - - /*************************** - * Find the HNP that we want to connect to, if it exists - ***************************/ - if( orte_migrate_globals.verbose ) { - opal_output_verbose(10, orte_migrate_globals.output, - "orte_migrate: Finding HNP..."); - } - if (ORTE_SUCCESS != (ret = find_hnp())) { - opal_show_help("help-orte-migrate.txt", "invalid_pid", - true, orte_migrate_globals.pid); - exit_status = ret; - goto cleanup; - } - - /******************************* - * Send migration information to HNP - *******************************/ - if( orte_migrate_globals.verbose ) { - opal_output_verbose(10, orte_migrate_globals.output, - "orte_migrate: Sending info to HNP..."); - } - if (ORTE_SUCCESS != (ret = notify_hnp())) { - opal_output(0, - "HNP with PID %d Not found!", - orte_migrate_globals.pid); - exit_status = ret; - goto cleanup; - } - - /******************************* - * Wait for migration to complete - *******************************/ - while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status && - ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status && - ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) { - opal_progress(); - } - - if( orte_migrate_globals.status ) { - orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; - pretty_print_status(); - } - - cleanup: - /*************** - * Cleanup - ***************/ - if (ORTE_SUCCESS != (ret = tool_finalize())) { - return ret; - } - - return exit_status; -} - -static int parse_args(int argc, char *argv[]) { - int i, ret, len, exit_status = ORTE_SUCCESS ; - opal_cmd_line_t cmd_line; - char **app_env = NULL, **global_env = NULL; - char * tmp_env_var = NULL; - char *argv0 = NULL; - - /* Init structure */ - memset(&orte_migrate_globals, 0, sizeof(orte_migrate_globals_t)); - orte_migrate_globals.help = false; - orte_migrate_globals.pid = -1; - orte_migrate_globals.verbose = false; - orte_migrate_globals.verbose_level = 0; - orte_migrate_globals.status = false; - orte_migrate_globals.output = -1; - orte_migrate_globals.off_nodes = NULL; - orte_migrate_globals.off_procs = NULL; - orte_migrate_globals.onto_nodes = NULL; - -#if OPAL_ENABLE_FT_CR == 0 - /* Warn and exit if not configured with Migrate/Restart */ - { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-migrate.txt", "usage-no-cr", - true, args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - exit_status = ORTE_ERROR; - goto cleanup; - } -#endif - - /* Parse the command line options */ - opal_cmd_line_create(&cmd_line, cmd_line_opts); - mca_base_open(); - mca_base_cmd_line_setup(&cmd_line); - ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv); - - if (OPAL_SUCCESS != ret) { - if (OPAL_ERR_SILENT != ret) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(ret)); - } - exit_status = 1; - goto cleanup; - } - - if (orte_migrate_globals.help) { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-migrate.txt", "usage", true, - args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - /* If we show the help message, that should be all we do */ - exit(0); - } - - /** - * Put all of the MCA arguments in the environment - */ - mca_base_cmd_line_process_args(argv, &app_env, &global_env); - - len = opal_argv_count(app_env); - for(i = 0; i < len; ++i) { - putenv(app_env[i]); - } - - len = opal_argv_count(global_env); - for(i = 0; i < len; ++i) { - putenv(global_env[i]); - } - - (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); - opal_setenv(tmp_env_var, - "1", - true, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - /** - * Now start parsing our specific arguments - */ - /* get the remaining bits */ - argv0 = strdup(argv[0]); - opal_cmd_line_get_tail(&cmd_line, &argc, &argv); - - if (NULL == orte_migrate_globals.off_nodes && - NULL == orte_migrate_globals.off_procs) { - fprintf(stderr, "%s: Nothing to do\n", argv0); - fprintf(stderr, "Type '%s --help' for usage.\n", argv0); - exit_status = 1; - goto cleanup; - } - - if(orte_migrate_globals.verbose_level < 0 ) { - orte_migrate_globals.verbose_level = 0; - } - - if(orte_migrate_globals.verbose_level > 0) { - orte_migrate_globals.verbose = true; - } - - /* - * If the user did not supply an hnp jobid, then they must - * supply the PID of MPIRUN - */ - if(0 >= argc ) { - fprintf(stderr, "%s: Nothing to do\n", argv[0]); - fprintf(stderr, "Type '%s --help' for usage.\n", argv[0]); - - exit_status = ORTE_ERROR; - goto cleanup; - } - - orte_migrate_globals.pid = atoi(argv[0]); - if ( 0 >= orte_migrate_globals.pid ) { - opal_show_help("help-orte-migrate.txt", "invalid_pid", true, - orte_migrate_globals.pid); - exit_status = ORTE_ERROR; - goto cleanup; - } - - if(orte_migrate_globals.verbose) { - orte_migrate_globals.status = true; - } - - if(orte_migrate_globals.verbose) { - pretty_print_migration(); - } - - cleanup: - if (NULL != argv0) { - free(argv0); - } - - return exit_status; -} - -/* - * This function attempts to find an HNP to connect to. - */ -static int find_hnp(void) { - int ret, exit_status = ORTE_SUCCESS; - opal_list_t hnp_list; - opal_list_item_t *item; - orte_hnp_contact_t *hnpcandidate; - - /* get the list of local hnp's available to us and setup - * contact info for them into the RML - */ - OBJ_CONSTRUCT(&hnp_list, opal_list_t); - if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* search the list for the desired hnp */ - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - hnpcandidate = (orte_hnp_contact_t*)item; - if( hnpcandidate->pid == orte_migrate_globals.pid) { - /* this is the one we want */ - orterun_hnp = hnpcandidate; - exit_status = ORTE_SUCCESS; - goto cleanup; - } - } - -cleanup: - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&hnp_list); - - if( NULL == orterun_hnp ) { - return ORTE_ERROR; - } else { - return exit_status; - } -} - -static int tool_init(int argc, char *argv[]) { - int exit_status = ORTE_SUCCESS, ret; - char * tmp_env_var = NULL; - - listener_started = false; - - /* - * Make sure to init util before parse_args - * to ensure installdirs is setup properly - * before calling mca_base_open(); - */ - if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) { - return ret; - } - - /* - * Parse Command Line Arguments - */ - if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) { - return ret; - } - - /* Disable the migrate notification routine for this - * tool. As we will never need to migrate this tool. - * Note: This must happen before opal_init(). - */ - opal_cr_set_enabled(false); - - /* Select the none component, since we don't actually use a migrateer */ - (void) mca_base_var_env_name("crs", &tmp_env_var); - opal_setenv(tmp_env_var, - "none", - true, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - /* we are never allowed to operate as a distributed tool, - * so insist on the ess/tool component */ - opal_setenv("OMPI_MCA_ess", "tool", true, &environ); - - /*************************** - * We need all of OPAL and the TOOLS portion of ORTE - this - * sets us up so we can talk to any HNP over the wire - ***************************/ - if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { - exit_status = ret; - goto cleanup; - } - - /* - * Setup ORTE Output handle from the verbose argument - */ - if( orte_migrate_globals.verbose ) { - orte_migrate_globals.output = opal_output_open(NULL); - opal_output_set_verbosity(orte_migrate_globals.output, orte_migrate_globals.verbose_level); - } else { - orte_migrate_globals.output = 0; /* Default=STDERR */ - } - - /* - * Start the listener - */ - if( ORTE_SUCCESS != (ret = start_listener() ) ) { - exit_status = ret; - } - - cleanup: - return exit_status; -} - -static int tool_finalize(void) { - int exit_status = ORTE_SUCCESS, ret; - - /* - * Stop the listener - */ - if( ORTE_SUCCESS != (ret = stop_listener() ) ) { - exit_status = ret; - } - - if (ORTE_SUCCESS != (ret = orte_finalize())) { - exit_status = ret; - } - - return exit_status; -} - -static int start_listener(void) -{ - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE, - ORTE_RML_PERSISTENT, hnp_receiver, NULL); - - listener_started = true; - return ORTE_SUCCESS; -} - -static int stop_listener(void) -{ - if( !listener_started ) { - return ORTE_ERROR; - } - - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE); - - listener_started = false; - return ORTE_SUCCESS; -} - -static void hnp_receiver(int status, - orte_process_name_t* sender, - opal_buffer_t* buffer, - orte_rml_tag_t tag, - void* cbdata) -{ - orte_errmgr_tool_cmd_flag_t command; - orte_std_cntr_t count; - int rc; - - opal_output_verbose(5, orte_migrate_globals.output, - "orte_migrate: hnp_receiver: Receive a command message."); - - /* - * Otherwise this is an inter-coordinator command (usually updating state info). - */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) { - ORTE_ERROR_LOG(rc); - return; - } - - switch (command) { - case ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD: - opal_output_verbose(10, orte_migrate_globals.output, - "orte_migrate: hnp_receiver: Status Update."); - - process_ckpt_update_cmd(sender, buffer); - break; - - case ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD: - /* Do Nothing */ - break; - - default: - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - } -} - -static void process_ckpt_update_cmd(orte_process_name_t* sender, - opal_buffer_t* buffer) -{ - int ret; - orte_std_cntr_t count = 1; - int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; - - /* - * Receive the data: - * - ckpt_state - */ - count = 1; - if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) { - goto cleanup; - } - orte_migrate_ckpt_status = ckpt_status; - - /* - * If the job is not able to be migrateed, then return - */ - if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) { - opal_show_help("help-orte-migrate.txt", "non-ckptable", - true, - orte_migrate_globals.pid); - goto cleanup; - } - - /* - * If a migration is already in progress, then we must tell the user to - * try again later. - */ - if( ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) { - opal_show_help("help-orte-migrate.txt", "err-inprogress", - true, - orte_migrate_globals.pid); - goto cleanup; - } - - /* - * If there was an error, display a message and exit - */ - if( ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status ) { - opal_show_help("help-orte-migrate.txt", "err-other", - true, - orte_migrate_globals.pid); - goto cleanup; - } - - /* - * If we are to display the status progression - */ - if( orte_migrate_globals.status ) { - if(ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status) { - pretty_print_status(); - } - } - - cleanup: - return; -} - -static int notify_hnp(void) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t *buffer = NULL; - orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD; - - if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) { - exit_status = ORTE_ERROR; - goto cleanup; - } - - opal_output_verbose(10, orte_migrate_globals.output, - "orte_migrate: notify_hnp: Contact Head Node Process PID %d\n", - orte_migrate_globals.pid); - - timer_start = get_time(); - - /*********************************** - * Notify HNP of migrate request - * Send: - * - Command - * - Off Nodes - * - Off Procs - * - Onto Nodes - ***********************************/ - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_procs), 1, OPAL_STRING)) ) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_nodes), 1, OPAL_STRING)) ) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.onto_nodes), 1, OPAL_STRING)) ) { - exit_status = ret; - goto cleanup; - } - - if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&(orterun_hnp->name), buffer, - ORTE_RML_TAG_MIGRATE, orte_rml_send_callback, - NULL))) { - exit_status = ret; - goto cleanup; - } - - cleanup: - if( NULL != buffer) { - OBJ_RELEASE(buffer); - buffer = NULL; - } - - if( ORTE_SUCCESS != exit_status ) { - opal_show_help("help-orte-migrate.txt", "unable_to_connect", true, - orte_migrate_globals.pid); - } - - return exit_status; -} - -/*************** - * Pretty Print - ***************/ -static double get_time(void) { - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static int pretty_print_status(void) { - char * state_str = NULL; - double cur_time; - - cur_time = get_time(); - - if( timer_last == 0 ) { - timer_last = cur_time; - } - - orte_errmgr_base_migrate_state_str(&state_str, orte_migrate_ckpt_status); - - opal_output(0, - "[%6.2f / %6.2f] %*s - ...\n", - (cur_time - timer_last), (cur_time - timer_start), - 25, state_str); - - if( NULL != state_str) { - free(state_str); - } - - timer_last = cur_time; - - return ORTE_SUCCESS; -} - -static int pretty_print_migration(void) -{ - char **loc_off_nodes = NULL; - char **loc_off_procs = NULL; - char **loc_onto_nodes = NULL; - int loc_off_nodes_cnt = 0; - int loc_off_procs_cnt = 0; - int loc_onto_cnt = 0; - int i; - - if( NULL != orte_migrate_globals.off_nodes ) { - loc_off_nodes = opal_argv_split(orte_migrate_globals.off_nodes, ','); - loc_off_nodes_cnt = opal_argv_count(loc_off_nodes); - } - - if( NULL != orte_migrate_globals.off_procs ) { - loc_off_procs = opal_argv_split(orte_migrate_globals.off_procs, ','); - loc_off_procs_cnt = opal_argv_count(loc_off_procs); - } - - if( NULL != orte_migrate_globals.onto_nodes ) { - loc_onto_nodes = opal_argv_split(orte_migrate_globals.onto_nodes, ','); - loc_onto_cnt = opal_argv_count(loc_onto_nodes); - } - - printf("Migrate Nodes: (%d nodes)\n", loc_off_nodes_cnt); - for(i = 0; i < loc_off_nodes_cnt; ++i) { - printf("\t\"%s\"\n", loc_off_nodes[i]); - } - - printf("Migrate Ranks: (%d ranks)\n", loc_off_procs_cnt); - for(i = 0; i < loc_off_procs_cnt; ++i) { - printf("\t\"%s\"\n", loc_off_procs[i]); - } - - printf("Migrate Onto : (%d nodes)\n", loc_onto_cnt); - for(i = 0; i < loc_onto_cnt; ++i) { - printf("\t\"%s\"\n", loc_onto_nodes[i]); - } - - return ORTE_SUCCESS; -} - diff --git a/orte/tools/orte-restart/Makefile.am b/orte/tools/orte-restart/Makefile.am deleted file mode 100644 index abc808378b..0000000000 --- a/orte/tools/orte-restart/Makefile.am +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. -# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -include $(top_srcdir)/Makefile.ompi-rules - -man_pages = orte-restart.1 -EXTRA_DIST = orte-restart.1in - -if WANT_FT_CR -if OPAL_INSTALL_BINARIES - -bin_PROGRAMS = orte-restart - -nodist_man_MANS = $(man_pages) - -# Ensure that the man pages are rebuilt if the opal_config.h file -# changes; a "good enough" way to know if configure was run again (and -# therefore the release date or version may have changed) -$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h - -dist_ortedata_DATA = help-orte-restart.txt - -endif # OPAL_INSTALL_BINARIES - -orte_restart_SOURCES = orte-restart.c -orte_restart_LDADD = \ - $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la - -endif # WANT_FT_CR - -distclean-local: - rm -f $(man_pages) diff --git a/orte/tools/orte-restart/help-orte-restart.txt b/orte/tools/orte-restart/help-orte-restart.txt deleted file mode 100644 index 202e8677f4..0000000000 --- a/orte/tools/orte-restart/help-orte-restart.txt +++ /dev/null @@ -1,77 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI checkpoint tool -# -[usage] -ompi-restart GLOBAL_SNAPSHOT_REF - Open MPI Parallel Job Restart Tool - -%s -# -[usage-no-cr] -This build of Open MPI does *not* include Checkpoint/Restart functionality. -If you require this functionality re-configure Open MPI with the proper -Checkpoint/Restart options. - -ompi-restart GLOBAL_SNAPSHOT_REF - Open MPI Parallel Job Restart Tool - -%s -# -[invalid_filename] -Error: The filename provided (referenced below) could not be used for - restarting the job. This could be for a variety of reasons: - - The filename/handle is invalid, - - The snapshot directory no longer exisits, or - - There are no stable checkpoint sequences in this global snapshot. - Please see --help for usage. - -Filename: %s -# -[restart_cmd_failure] -Error: Unable to obtain the proper restart command to restart from the - checkpoint file (%s). Returned %d. -# -[comp_select_failure] -Error: Unable to select the %s component needed to restart this - application. (Returned %d) - This likely indicates that the checkpointer needed is not - available on this machine. You should move to a machine that - has this checkpointer enabled. -# -[restart_failure] -Error: The restart command: - shell$ %s - returned an error code %d, and was unable to restart properly. -# -[invalid_seq_num] -Error: The filename (%s) and sequence number (%d) could not be used. - This may be caused by an invalid sequence number. Try using the - '-i' option to determine a correct value. -# -[amca_param_not_found] -Warning: Unable to find the AMCA parameter in the checkpoint metadata. - This is the option supplied to mpirun as '-am '. Restart will - assume this value to be '%s'. -# -[tune_param_not_found] -Warning: Unable to find the TUNE parameter in the checkpoint metadata. - This is the option supplied to mpirun as '-tune '. Restart will - assume this value to be '%s'. diff --git a/orte/tools/orte-restart/orte-restart.1in b/orte/tools/orte-restart/orte-restart.1in deleted file mode 100644 index 0d39667519..0000000000 --- a/orte/tools/orte-restart/orte-restart.1in +++ /dev/null @@ -1,115 +0,0 @@ -.\" -.\" Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana -.\" University Research and Technology -.\" Corporation. All rights reserved. -.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. -.\" -.\" Man page for OMPI's ompi-restart command -.\" -.\" .TH name section center-footer left-footer center-header -.TH OMPI-RESTART 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" -.\" ************************** -.\" Name Section -.\" ************************** -.SH NAME -. -ompi-restart, orte-restart \- Restart a previously checkpointed parallel job -using the Open PAL Checkpoint/Restart Service (CRS) -. -.PP -. -\fBNOTE:\fP \fIompi-restart\fP, and \fIorte-restart\fP are all exact -synonyms for each other. Using any of the names will result in exactly -identical behavior. -. -.\" ************************** -.\" Synopsis Section -.\" ************************** -.SH SYNOPSIS -. -.B ompi-restart -.B [ options ] -.B -. -.\" ************************** -.\" Options Section -.\" ************************** -.SH Options -. -\fIompi-restart\fR will attempt to restart a previously checkpointed parallel -job from the global snapshot handle reference returned by \fIompi_checkpoint\fP. -. -.TP 10 -.B -The global snapshot handle reference returned by \fIompi_checkpoint\fP, used to -restart the job. This is required to be the last argument to this command. -. -. -.TP -.B -h | --help -Display help for this command -. -. -.TP -.B -p | --preload -Preload the checkpoint files on the remote systems before restarting the -application. Disabled by default. -. -. -.TP -.B --fork -Fork off a new process, which is the restarted process. By default, the -restarted process will replace \fIompi-restart\fR. -. -. -.TP -.B -s | --seq -The sequence number of the checkpoint to restart from. By default, the most -recent sequence number is used (specified by -1). -. -. -.TP -.B -hostfile | --hostfile -The hostfile from which to restart the application. Useful in unscheduled -environments. (Same behavior as --machinefile option) -. -. -.TP -.B -machinefile | --machinefile -The machinefile from which to restart the application. Useful in unscheduled -environments. (Same behavior as --hostfile option) -. -. -.TP -.B -v | --verbose -Enable verbose output for debugging. -. -. -.TP -.B -gmca | --gmca \fR \fP -Pass global MCA parameters that are applicable to all contexts. \fI\fP is -the parameter name; \fI\fP is the parameter value. -. -. -.TP -.B -mca | --mca -Send arguments to various MCA modules. -. -. -.\" ************************** -.\" Description Section -.\" ************************** -.SH DESCRIPTION -. -.PP -\fIompi-restart\fR can be invoked multiple, non-overlapping times. This -allows the user to restart a previously running parallel job. -. -. -.\" ************************** -.\" See Also Section -.\" ************************** -. -.SH SEE ALSO - orte-ps(1), orte-clean(1), ompi-checkpoint(1), opal-checkpoint(1), opal-restart(1), opal_crs(7) -. diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c deleted file mode 100644 index cd76c533fa..0000000000 --- a/orte/tools/orte-restart/orte-restart.c +++ /dev/null @@ -1,897 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * ORTE Restart Tool for restarting a previously checkpointed multiprocess job - * - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#include -#ifdef HAVE_SYS_STAT_H -#include -#endif /* HAVE_SYS_STAT_H */ -#ifdef HAVE_SYS_TYPES_H -#include -#endif /* HAVE_SYS_TYPES_H */ -#ifdef HAVE_SYS_WAIT_H -#include -#endif /* HAVE_SYS_WAIT_H */ -#include - -#include "opal/runtime/opal.h" -#include "opal/runtime/opal_cr.h" -#include "opal/util/cmd_line.h" -#include "opal/util/output.h" -#include "opal/util/argv.h" -#include "opal/util/opal_environ.h" -#include "opal/util/basename.h" -#include "opal/util/error.h" -#include "opal/util/path.h" -#include "opal/mca/base/base.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" - -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_cr.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" -#include "orte/mca/sstore/sstore.h" -#include "orte/mca/sstore/base/base.h" -#include "orte/mca/filem/base/base.h" -#include "opal/util/show_help.h" -#include "orte/util/proc_info.h" - -/****************** - * Local Functions - ******************/ -static int initialize(int argc, char *argv[]); -static int finalize(void); -static int parse_args(int argc, char *argv[]); -static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot); -static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid); -static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot); -static int snapshot_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b); - -/***************************************** - * Global Vars for Command line Arguments - *****************************************/ -typedef struct { - bool help; - char *snapshot_ref; - char *appfile; - bool verbose; - bool forked; - int seq_number; - char *hostfile; - int output; - bool info_only; - bool app_only; - bool showme; - char *mpirun_opts; -#if OPAL_ENABLE_CRDEBUG == 1 - bool enable_crdebug; -#endif -} orte_restart_globals_t; - -orte_restart_globals_t orte_restart_globals; - -opal_cmd_line_init_t cmd_line_opts[] = { - { NULL, - 'h', NULL, "help", - 0, - &orte_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - - { NULL, - 'v', NULL, "verbose", - 0, - &orte_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be Verbose" }, - - { NULL, - '\0', NULL, "fork", - 0, - &orte_restart_globals.forked, OPAL_CMD_LINE_TYPE_BOOL, - "Fork off a new process which is the restarted process instead of " - "replacing orte_restart" }, - - { NULL, - 's', NULL, "seq", - 1, - &orte_restart_globals.seq_number, OPAL_CMD_LINE_TYPE_INT, - "The sequence number of the checkpoint to start from. " - "(Default: -1, or most recent)" }, - - { NULL, - '\0', "hostfile", "hostfile", - 1, - &orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile to use for launch" }, - - { NULL, - '\0', "machinefile", "machinefile", - 1, - &orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile to use for launch" }, - - { NULL, - 'i', NULL, "info", - 0, - &orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL, - "Display information about the checkpoint" }, - - { NULL, - 'a', NULL, "apponly", - 0, - &orte_restart_globals.app_only, OPAL_CMD_LINE_TYPE_BOOL, - "Only create the app context file, do not restart from it" }, - - { NULL, - '\0', NULL, "showme", - 0, - &orte_restart_globals.showme, OPAL_CMD_LINE_TYPE_BOOL, - "Display the full command line that would have been exec'ed." }, - - { NULL, - '\0', "mpirun_opts", "mpirun_opts", - 1, - &orte_restart_globals.mpirun_opts, OPAL_CMD_LINE_TYPE_STRING, - "Command line options to pass directly to mpirun (be sure to quote long strings, and escape internal quotes)" }, - -#if OPAL_ENABLE_CRDEBUG == 1 - { NULL, - '\0', "crdebug", "crdebug", - 0, - &orte_restart_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL, - "Enable C/R Enhanced Debugging" }, -#endif - - /* End of list */ - { NULL, - '\0', NULL, NULL, - 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, - NULL } -}; - -int -main(int argc, char *argv[]) -{ - int ret, exit_status = ORTE_SUCCESS; - pid_t child_pid = 0; - orte_sstore_base_global_snapshot_info_t *snapshot = NULL; - char *basedir = NULL; - char *tmp_str = NULL; - - /*************** - * Initialize - ***************/ - if (ORTE_SUCCESS != (ret = initialize(argc, argv))) { - exit_status = ret; - goto cleanup; - } - - snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); - - if( opal_path_is_absolute(orte_restart_globals.snapshot_ref) ) { - basedir = opal_dirname(orte_restart_globals.snapshot_ref); - tmp_str = opal_basename(orte_restart_globals.snapshot_ref); - free(orte_restart_globals.snapshot_ref); - orte_restart_globals.snapshot_ref = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else if( NULL != strchr(orte_restart_globals.snapshot_ref, '/') ) { - basedir = opal_dirname(orte_restart_globals.snapshot_ref); - tmp_str = opal_basename(orte_restart_globals.snapshot_ref); - free(orte_restart_globals.snapshot_ref); - orte_restart_globals.snapshot_ref = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else { - basedir = NULL; /* Use MCA parameter */ - } - - /* - * Note: If the seq # passed is -1, then the largest seq # is selected, - * ow the seq # requested is selected if available - * 'basedir': Snapshot Base location to look in. If NULL then MCA parameter is used - */ - if( ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&(snapshot->ss_handle), - basedir, - orte_restart_globals.snapshot_ref, - orte_restart_globals.seq_number, - snapshot))) { - opal_show_help("help-orte-restart.txt", "invalid_filename", true, - orte_restart_globals.snapshot_ref); - exit_status = ret; - goto cleanup; - } - orte_restart_globals.seq_number = snapshot->seq_num; - - if(orte_restart_globals.info_only ) { - if (ORTE_SUCCESS != (ret = snapshot_info(snapshot))) { - exit_status = ret; - goto cleanup; - } - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /****************************** - * Create the app file to use with mpirun/orterun - ******************************/ - if( ORTE_SUCCESS != (ret = create_appfile(snapshot) ) ) { - exit_status = ret; - goto cleanup; - } - - if( orte_restart_globals.app_only ) { - printf("Created Appfile:\n\t%s\n", orte_restart_globals.appfile); - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /****************************** - * Restart in this process [mpirun/orterun] - ******************************/ - if( orte_restart_globals.verbose ) { - opal_output_verbose(10, orte_restart_globals.output, - "Restarting from file (%s)", - orte_restart_globals.snapshot_ref); - - if( orte_restart_globals.forked ) { - opal_output_verbose(10, orte_restart_globals.output, - "\t Forking off a child"); - } else { - opal_output_verbose(10, orte_restart_globals.output, - "\t Exec in self"); - } - } - - if( ORTE_SUCCESS != (ret = spawn_children(snapshot, &child_pid)) ) { - opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true, - orte_restart_globals.snapshot_ref, ret); - exit_status = ret; - goto cleanup; - } - - /*************** - * Cleanup - ***************/ - cleanup: - if( NULL != basedir ) { - free(basedir); - basedir = NULL; - } - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - if( NULL != snapshot ) { - OBJ_RELEASE(snapshot); - snapshot = NULL; - } - - if (OPAL_SUCCESS != (ret = finalize())) { - return ret; - } - - return exit_status; -} - -static int initialize(int argc, char *argv[]) { - int ret, exit_status = ORTE_SUCCESS; - char * tmp_env_var = NULL; - - /* - * Make sure to init util before parse_args - * to ensure installdirs is setup properly - * before calling mca_base_open(); - */ - if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) { - return ret; - } - - /* - * Parse command line arguments - */ - if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) { - exit_status = ret; - goto cleanup; - } - - /* - * Setup OPAL Output handle from the verbose argument - */ - if( orte_restart_globals.verbose ) { - orte_restart_globals.output = opal_output_open(NULL); - opal_output_set_verbosity(orte_restart_globals.output, 10); - } else { - orte_restart_globals.output = 0; /* Default=STDERR */ - } - - /* Disable the checkpoint notification routine for this - * tool. As we will never need to checkpoint this tool. - * Note: This must happen before opal_init(). - */ - opal_cr_set_enabled(false); - - /* Select the none component, since we don't actually use a checkpointer */ - (void) mca_base_var_env_name("crs", &tmp_env_var); - opal_setenv(tmp_env_var, - "none", - true, &environ); - /* Don't free the environment variable name. It is used again below */ - - /* - /* we are never allowed to operate as a distributed tool, - * so insist on the ess/tool component */ - opal_setenv("OMPI_MCA_ess", "tool", true, &environ); - - /* Setup any ORTE stuff we might need */ - if (OPAL_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { - exit_status = ret; - goto cleanup; - } - - /* Unset these now that we no longer need them */ - opal_unsetenv(tmp_env_var, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); - opal_unsetenv(tmp_env_var, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - cleanup: - return exit_status; -} - -static int finalize(void) -{ - int ret; - - if (OPAL_SUCCESS != (ret = orte_finalize())) { - return ret; - } - - return ORTE_SUCCESS; -} - -static int parse_args(int argc, char *argv[]) -{ - int i, ret, len; - opal_cmd_line_t cmd_line; - char **app_env = NULL, **global_env = NULL; - char * tmp_env_var = NULL; - char *argv0 = NULL; - orte_restart_globals_t tmp = { false, /* help */ - NULL, /* filename */ - NULL, /* appfile */ - false, /* verbose */ - false, /* forked */ - -1, /* seq_number */ - NULL, /* hostfile */ - -1, /* output*/ - false, /* info only */ - false, /* app only */ - false, /* showme */ - NULL}; /* mpirun_opts */ - - orte_restart_globals = tmp; -#if OPAL_ENABLE_CRDEBUG == 1 - orte_restart_globals.enable_crdebug = false; -#endif - -#if OPAL_ENABLE_FT_CR == 0 - /* Warn and exit if not configured with Checkpoint/Restart */ - { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-restart.txt", "usage-no-cr", - true, args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - return ORTE_ERROR; - } -#endif - - /* Parse the command line options */ - opal_cmd_line_create(&cmd_line, cmd_line_opts); - - mca_base_open(); - mca_base_cmd_line_setup(&cmd_line); - ret = opal_cmd_line_parse(&cmd_line, true, false, argc, argv); - - if (OPAL_SUCCESS != ret) { - if (OPAL_ERR_SILENT != ret) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(ret)); - } - return 1; - } - - if (orte_restart_globals.help) { - char *str, *args = NULL; - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orte-restart.txt", "usage", true, - args); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - /* If we show the help message, that should be all we do */ - exit(0); - } - - /** - * Put all of the MCA arguments in the environment - */ - mca_base_cmd_line_process_args(argv, &app_env, &global_env); - - len = opal_argv_count(app_env); - for(i = 0; i < len; ++i) { - putenv(app_env[i]); - } - - len = opal_argv_count(global_env); - for(i = 0; i < len; ++i) { - putenv(global_env[i]); - } - - (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); - opal_setenv(tmp_env_var, - "1", - true, &environ); - free(tmp_env_var); - tmp_env_var = NULL; - - /** - * Now start parsing our specific arguments - */ - - /* get the remaining bits */ - argv0 = strdup(argv[0]); - opal_cmd_line_get_tail(&cmd_line, &argc, &argv); - if (0 == argc) { - fprintf(stderr, "%s: Nothing to do\n", argv0); - fprintf(stderr, "Type '%s --help' for usge.\n", argv0); - free(argv0); - return ORTE_ERROR; - } - free(argv0); - - orte_restart_globals.snapshot_ref = strdup(argv[0]); - if ( NULL == orte_restart_globals.snapshot_ref || - 0 >= strlen(orte_restart_globals.snapshot_ref) ) { - opal_show_help("help-orte-restart.txt", "invalid_filename", true, - ""); - return ORTE_ERROR; - } - - /* If we have arguments after the command, then assume they - * need to be grouped together. - */ - if(argc > 1) { - orte_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' ')); - } - - return ORTE_SUCCESS; -} - -static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot) -{ - int exit_status = ORTE_SUCCESS; - FILE *appfile = NULL; - opal_list_item_t* item = NULL; - char *tmp_str = NULL; - char *amca_param = NULL; - char *tune_param = NULL; - char *reference_fmt_str = NULL; - char *location_str = NULL; - char *ref_location_fmt_str = NULL; - orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL; - - /* - * Create the appfile - */ - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS, - &tmp_str); - asprintf(&orte_restart_globals.appfile, "%s/%s", - tmp_str, - strdup("restart-appfile")); - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_AMCA_PARAM, - &amca_param); - - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_TUNE_PARAM, - &tune_param); - - if (NULL == (appfile = fopen(orte_restart_globals.appfile, "w")) ) { - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* This will give a format string that we can use */ - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_LOCAL_SNAP_REF_FMT, - &reference_fmt_str); - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_LOCAL_SNAP_LOC, - &location_str); - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT, - &ref_location_fmt_str); - - /* - * Sort the snapshots so that they are in order - */ - opal_list_sort(&snapshot->local_snapshots, snapshot_sort_compare_fn); - - /* - * Construct the appfile - */ - for(item = opal_list_get_first(&snapshot->local_snapshots); - item != opal_list_get_end(&snapshot->local_snapshots); - item = opal_list_get_next(item) ) { - vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item; - - fprintf(appfile, "#\n"); - fprintf(appfile, "# Old Process Name: %u.%u\n", - vpid_snapshot->process_name.jobid, - vpid_snapshot->process_name.vpid); - fprintf(appfile, "#\n"); - fprintf(appfile, "-np 1 "); - - fprintf(appfile, "--sstore-load "); - /* loc:ref:postfix:seq */ - fprintf(appfile, "%s:%s:", - location_str, - orte_restart_globals.snapshot_ref); - fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid); - fprintf(appfile, ":%s:%s:%d ", - (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp), - (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix), - orte_restart_globals.seq_number); - - if( NULL == amca_param ) { - amca_param = strdup("ft-enable-cr"); - opal_show_help("help-orte-restart.txt", "amca_param_not_found", true, - amca_param); - } - fprintf(appfile, "-am %s ", amca_param); - - if( NULL == tune_param ) { - tune_param = strdup("ft-enable-cr"); - opal_show_help("help-orte-restart.txt", "tune_param_not_found", true, - tune_param); - } - fprintf(appfile, "-tune %s ", tune_param); - - fprintf(appfile, " opal-restart "); - - /* - * By default, point to the central storage location of the checkpoint. - * The active SStore module at restart time will determine if files - * need to be preloaded. - */ - fprintf(appfile, "-l %s", location_str); - fprintf(appfile, " -m %s ", orte_sstore_base_local_metadata_filename); - - fprintf(appfile, "-r "); - fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid); - - fprintf(appfile, "\n"); - } - - cleanup: - if(NULL != appfile) { - fclose(appfile); - appfile = NULL; - } - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - if( NULL != location_str ) { - free(location_str); - location_str = NULL; - } - if( NULL != reference_fmt_str ) { - free(reference_fmt_str); - reference_fmt_str = NULL; - } - if( NULL != ref_location_fmt_str ) { - free(ref_location_fmt_str); - ref_location_fmt_str = NULL; - } - - return exit_status; -} - -static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid) -{ - int ret, exit_status = ORTE_SUCCESS; - char *amca_param = NULL; - char *tune_param = NULL; - char **argv = NULL; - int argc = 0, i; - int status; - - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_AMCA_PARAM, - &amca_param); - - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_TUNE_PARAM, - &tune_param); - - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "mpirun")) ) { - exit_status = ret; - goto cleanup; - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "-am")) ) { - exit_status = ret; - goto cleanup; - } - if( NULL == amca_param ) { - amca_param = strdup("ft-enable-cr"); - opal_show_help("help-orte-restart.txt", "amca_param_not_found", true, - amca_param); - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, amca_param)) ) { - exit_status = ret; - goto cleanup; - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "-tune")) ) { - exit_status = ret; - goto cleanup; - } - if( NULL == tune_param ) { - tune_param = strdup("ft-enable-cr"); - opal_show_help("help-orte-restart.txt", "tune_param_not_found", true, - tune_param); - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, tune_param)) ) { - exit_status = ret; - goto cleanup; - } - if( NULL != orte_restart_globals.hostfile ) { - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--default-hostfile")) ) { - exit_status = ret; - goto cleanup; - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.hostfile)) ) { - exit_status = ret; - goto cleanup; - } - } - if( orte_restart_globals.mpirun_opts ) { - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.mpirun_opts)) ) { - exit_status = ret; - goto cleanup; - } - } -#if OPAL_ENABLE_CRDEBUG == 1 - if( orte_restart_globals.enable_crdebug ) { - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--crdebug")) ) { - exit_status = ret; - goto cleanup; - } - } -#endif - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--app")) ) { - exit_status = ret; - goto cleanup; - } - if( ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.appfile)) ) { - exit_status = ret; - goto cleanup; - } - - if( orte_restart_globals.showme ) { - for(i = 0; i < argc; ++i ) { - /*printf("%2d: (%s)\n", i, argv[i]);*/ - printf("%s ", argv[i]); - } - printf("\n"); - return ORTE_SUCCESS; - } - - /* To fork off a child */ - if( orte_restart_globals.forked ) { - *child_pid = fork(); - - if( 0 == *child_pid) { - /* Child Process */ - status = execvp(strdup(argv[0]), argv); - if( 0 > status) { - opal_output(orte_restart_globals.output, - "orte_restart: execv failed with status = %d\n", - status); - } - exit_status = status; - goto cleanup; - } - else if(0 < *child_pid) { - /* Parent is done once it is started */ - ; - } - else { - opal_output(orte_restart_globals.output, - "orte_restart: fork failed: This should never happen!"); - /* Fork failed :( */ - exit_status = *child_pid; - goto cleanup; - } - } - /* ... or not to fork off a child */ - else { - /* Make sure to finalize so we don't leave our session directory */ - orte_finalize(); - - status = execvp(strdup(argv[0]), argv); - if( 0 > status) { - /* execv failed */ - } - exit_status = status; - goto cleanup; - } - - opal_output_verbose(10, orte_restart_globals.output, - "orte_restart: Restarted Child with PID = %d\n", *child_pid); - - cleanup: - if( NULL != argv) - opal_argv_free(argv); - - return exit_status; -} - -int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot) -{ - int ret, exit_status = ORTE_SUCCESS; - int num_seqs, processes, i; - char **snapshot_ref_seqs = NULL; - opal_list_item_t* item = NULL; - orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL; - char *tmp_str = NULL; - - /* - * Find all sequence numbers - */ - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ, - &tmp_str); - num_seqs = atoi(tmp_str); - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - orte_sstore.get_attr(snapshot->ss_handle, - SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ, - &tmp_str); - snapshot_ref_seqs = opal_argv_split(tmp_str, ','); - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - if( 0 > orte_restart_globals.seq_number ) { - opal_output(orte_restart_globals.output, - "Sequences: %d\n", - num_seqs); - } - - for(i=0; i < num_seqs; ++i) { - snapshot->seq_num = atoi(snapshot_ref_seqs[i]); - - if( 0 <= orte_restart_globals.seq_number && - snapshot->seq_num != orte_restart_globals.seq_number ) { - continue; - } - - if( ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata( snapshot ) ) ) { - exit_status = ret; - goto cleanup; - } - - opal_output(orte_restart_globals.output, - "Seq: %d\n", - snapshot->seq_num); - - if (NULL != snapshot->start_time ) { - opal_output(orte_restart_globals.output, - "\tBegin Timestamp: %s\n", - snapshot->start_time); - } - if (NULL != snapshot->end_time ) { - opal_output(orte_restart_globals.output, - "\tEnd Timestamp : %s\n", - snapshot->end_time); - } - - processes = opal_list_get_size(&snapshot->local_snapshots); - opal_output(orte_restart_globals.output, - "\tProcesses: %d\n", - processes); - - for(item = opal_list_get_first(&snapshot->local_snapshots); - item != opal_list_get_end(&snapshot->local_snapshots); - item = opal_list_get_next(item) ) { - vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item; - - opal_output_verbose(10, orte_restart_globals.output, - "\t\tProcess: %u.%u \t CRS: %s \t Compress: %s (%s)", - vpid_snapshot->process_name.jobid, - vpid_snapshot->process_name.vpid, - vpid_snapshot->crs_comp, - vpid_snapshot->compress_comp, - vpid_snapshot->compress_postfix); - } - } - - cleanup: - return exit_status; -} - -static int snapshot_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b) -{ - orte_sstore_base_local_snapshot_info_t *snap_a, *snap_b; - - snap_a = (orte_sstore_base_local_snapshot_info_t*)(*a); - snap_b = (orte_sstore_base_local_snapshot_info_t*)(*b); - - if( snap_a->process_name.vpid > snap_b->process_name.vpid ) { - return 1; - } - else if( snap_a->process_name.vpid == snap_b->process_name.vpid ) { - return 0; - } - else { - return -1; - } -}