1
1
openmpi/orte/tools/orte-migrate/orte-migrate.c

778 строки
21 KiB
C
Исходник Обычный вид История

A number of C/R enhancements per RFC below: http://www.open-mpi.org/community/lists/devel/2010/07/8240.php Documentation: http://osl.iu.edu/research/ft/ Major Changes: -------------- * Added C/R-enabled Debugging support. Enabled with the --enable-crdebug flag. See the following website for more information: http://osl.iu.edu/research/ft/crdebug/ * Added Stable Storage (SStore) framework for checkpoint storage * 'central' component does a direct to central storage save * 'stage' component stages checkpoints to central storage while the application continues execution. * 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress) * 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching) * Added Compression (compress) framework to support * Add two new ErrMgr recovery policies * {{{crmig}}} C/R Process Migration * {{{autor}}} C/R Automatic Recovery * Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component * Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option) * {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342) * {{{OMPI_CR_Restart}}} * {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules) * {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192) * {{{OMPI_CR_Quiesce_start}}} * {{{OMPI_CR_Quiesce_checkpoint}}} * {{{OMPI_CR_Quiesce_end}}} * {{{OMPI_CR_self_register_checkpoint_callback}}} * {{{OMPI_CR_self_register_restart_callback}}} * {{{OMPI_CR_self_register_continue_callback}}} * The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future. 
* Add a progress meter to: * FileM rsh (filem_rsh_process_meter) * SnapC full (snapc_full_progress_meter) * SStore stage (sstore_stage_progress_meter) * Added 2 new command line options to ompi-restart * --showme : Display the full command line that would have been exec'ed. * --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413) * Deprecated some MCA params: * crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir * snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir * snapc_base_global_shared deprecated, use sstore_stage_global_is_shared * snapc_base_store_in_place deprecated, replaced with different components of SStore * snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref * snapc_base_establish_global_snapshot_dir deprecated, never well supported * snapc_full_skip_filem deprecated, use sstore_stage_skip_filem Minor Changes: -------------- * Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing. * Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components * Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it. * Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}} * Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set. * opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality. * Cleanup the CRS framework and components to work with the SStore framework. * Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably). * Add 'quiesce' hook to CRCP for a future enhancement. 
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}. * Add optional application level INC callbacks (registered through the CR MPI Ext interface). * Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive. * {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked. * {{{opal-restart}}} also support local decompression before restarting * {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata * {{{orte-restart}}} now uses the SStore framework to work with the metadata * Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality. * Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}. * Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped. * Make sure to decrement the number of 'num_local_procs' in the orted when one goes away. * odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options. * Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities. * Improve the checks for 'already checkpointing' error path. * A a recovery output timer, to show how long it takes to restart a job * Do a better job of cleaning up the old session directory on restart. * Add a local module to the autor and crmig ErrMgr components. 
These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment) * Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize. This commit was SVN r23587. The following Trac tickets were found above: Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924 Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097 Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161 Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192 Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208 Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342 Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-10 20:51:11 +00:00
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Process Migration Tool for migrating processes in a multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
static int tool_init(int argc, char *argv[]); /* Initialization routine */
static int tool_finalize(void); /* Finalization routine */
/* Parses the command line into orte_migrate_globals */
static int parse_args(int argc, char *argv[]);
/* Looks up the local HNP whose pid matches orte_migrate_globals.pid */
static int find_hnp(void);
/* Post / cancel the persistent receive on ORTE_RML_TAG_MIGRATE */
static int start_listener(void);
static int stop_listener(void);
/* RML callback invoked for messages arriving on ORTE_RML_TAG_MIGRATE */
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
/* Handles an ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD payload */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
/* Sends the migration request to the HNP found by find_hnp() */
static int notify_hnp(void);
static int pretty_print_status(void);
static int pretty_print_migration(void);
/* Contact record of the HNP selected by find_hnp(); NULL until one is found */
static orte_hnp_contact_t *orterun_hnp = NULL;
/* Most recent migration state reported by the HNP; polled by main() */
static int orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
/*****************************************
* File-scope state (listener and timers),
* followed by the command line globals
*****************************************/
/* True while the receive posted by start_listener() is active */
static bool listener_started = false;
/* Wall-clock time (seconds) taken in notify_hnp() when the request is sent */
static double timer_start = 0;
/* Wall-clock time (seconds) of the previous pretty_print_status() call; 0 = none yet */
static double timer_last = 0;
/* Returns the current wall-clock time in seconds */
static double get_time(void);
/*
 * Parsed command line state for the tool. parse_args() zeroes the
 * structure and assigns defaults before the command line parser fills
 * in the option-bound fields.
 */
typedef struct {
/* -h / --help: show the usage message */
bool help;
/* PID of the mpirun (HNP) to contact. Bound to --hnp-pid, but
 * parse_args() also assigns it from the first trailing argument. */
int pid;
/* -v / --verbose; also forced to true when verbose_level > 0 */
bool verbose;
/* -V <level>; negative values are clamped to 0 by parse_args() */
int verbose_level;
/* Print the status progression; parse_args() enables it when verbose is set */
bool status;
/* opal_output handle: -1 after parsing, then set by tool_init() */
int output;
/* -x / --off: comma separated list of nodes to migrate off of */
char *off_nodes;
/* -r / --ranks: comma separated list of MPI_COMM_WORLD ranks to migrate */
char *off_procs;
/* -t / --onto: comma separated list of nodes to migrate onto */
char *onto_nodes;
} orte_migrate_globals_t;
/* Single instance shared by every function in this file */
orte_migrate_globals_t orte_migrate_globals;
/*
 * Command line option table handed to opal_cmd_line_create() in
 * parse_args(). Each entry binds an option to a field of
 * orte_migrate_globals. Reading the entries below, the columns after the
 * first three are: short (single character) name, a second name slot
 * that is always NULL here, long name, number of arguments consumed,
 * destination variable, destination type, and help text.
 * NOTE(review): the first three columns are presumably MCA parameter
 * naming fields -- confirm against opal/util/cmd_line.h.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
/* -h / --help: flag, no argument */
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&orte_migrate_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
/* -v / --verbose: flag, no argument */
{ NULL, NULL, NULL,
'v', NULL, "verbose",
0,
&orte_migrate_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
/* -V <level>: short name only, takes one integer argument */
{ NULL, NULL, NULL,
'V', NULL, NULL,
1,
&orte_migrate_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
/* --hnp-pid <pid>: long name only; the only entry with a non-NULL first column */
{ "hnp-pid", NULL, NULL,
'\0', NULL, "hnp-pid",
1,
&orte_migrate_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish "
"to migrate." },
/* -x / --off <nodes> */
{ NULL, NULL, NULL,
'x', NULL, "off",
1,
&orte_migrate_globals.off_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate off of (comma separated)" },
/* -r / --ranks <ranks> */
{ NULL, NULL, NULL,
'r', NULL, "ranks",
1,
&orte_migrate_globals.off_procs, OPAL_CMD_LINE_TYPE_STRING,
"List of MPI_COMM_WORLD ranks to migrate (comma separated)" },
/* -t / --onto <nodes> */
{ NULL, NULL, NULL,
't', NULL, "onto",
1,
&orte_migrate_globals.onto_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate onto (comma separated)" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
int
main(int argc, char *argv[])
{
int ret, exit_status = ORTE_SUCCESS;
/***************
* Initialize
***************/
if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) {
exit_status = ret;
goto cleanup;
}
/***************************
* Find the HNP that we want to connect to, if it exists
***************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Finding HNP...");
}
if (ORTE_SUCCESS != (ret = find_hnp())) {
opal_show_help("help-orte-migrate.txt", "invalid_pid",
true, orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Send migration information to HNP
*******************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Sending info to HNP...");
}
if (ORTE_SUCCESS != (ret = notify_hnp())) {
opal_output(0,
"HNP with PID %d Not found!",
orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Wait for migration to complete
*******************************/
while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) {
opal_progress();
}
if( orte_migrate_globals.status ) {
orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
pretty_print_status();
}
cleanup:
/***************
* Cleanup
***************/
if (ORTE_SUCCESS != (ret = tool_finalize())) {
return ret;
}
return exit_status;
}
/**
 * Parse the command line into orte_migrate_globals.
 *
 * Resets the globals to their defaults, runs the OPAL command line
 * parser (including the MCA options, which are exported into the
 * environment), marks this process as a C/R tool through the
 * "opal_cr_is_tool" MCA environment variable, and finally reads the
 * PID of the target mpirun from the first trailing argument.
 *
 * The usage text is shown, and ORTE_ERROR returned, when parsing fails,
 * help is requested, no trailing argument is given, or neither a node
 * list (-x) nor a rank list (-r) to migrate was supplied.
 *
 * NOTE(review): the trailing PID argument is mandatory and always
 * replaces any value given through --hnp-pid.
 *
 * @param argc  Argument count as passed to the tool.
 * @param argv  Argument vector as passed to the tool.
 * @return ORTE_SUCCESS on success, ORTE_ERROR otherwise.
 */
static int parse_args(int argc, char *argv[]) {
    int i, ret, len, exit_status = ORTE_SUCCESS;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    char *tmp_env_var = NULL;
    char *pid_end = NULL;
    long pid_val;

    /* Init structure */
    memset(&orte_migrate_globals, 0, sizeof(orte_migrate_globals_t));
    orte_migrate_globals.help          = false;
    orte_migrate_globals.pid           = -1;
    orte_migrate_globals.verbose       = false;
    orte_migrate_globals.verbose_level = 0;
    orte_migrate_globals.status        = false;
    orte_migrate_globals.output        = -1;
    orte_migrate_globals.off_nodes     = NULL;
    orte_migrate_globals.off_procs     = NULL;
    orte_migrate_globals.onto_nodes    = NULL;

    /* Parse the command line options */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /**
     * Now start parsing our specific arguments
     */
    /* get the remaining bits */
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

#if OPAL_ENABLE_FT_CR == 0
    /* Warn and exit if not configured with Migrate/Restart */
    {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        opal_show_help("help-orte-migrate.txt", "usage-no-cr",
                       true, args);
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
#endif

    if (OPAL_SUCCESS != ret ||
        orte_migrate_globals.help ||
        0 >= argc ||
        (NULL == orte_migrate_globals.off_nodes && NULL == orte_migrate_globals.off_procs) ) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        opal_show_help("help-orte-migrate.txt", "usage", true,
                       args);
        free(args);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if(orte_migrate_globals.verbose_level < 0 ) {
        orte_migrate_globals.verbose_level = 0;
    }

    if(orte_migrate_globals.verbose_level > 0) {
        orte_migrate_globals.verbose = true;
    }

    /*
     * The first trailing argument is the PID of MPIRUN. argc > 0 is
     * guaranteed by the usage check above. Parse with strtol so that
     * trailing garbage ("12abc") and out-of-range values are rejected
     * instead of being silently truncated.
     */
    errno = 0;
    pid_val = strtol(argv[0], &pid_end, 10);
    /* Only accept values that survive the round trip through 'int' */
    if (0 == errno && (long)(int)pid_val == pid_val) {
        orte_migrate_globals.pid = (int)pid_val;
    }

    if ( pid_end == argv[0] ||
         '\0' != *pid_end ||
         0 >= orte_migrate_globals.pid ) {
        opal_show_help("help-orte-migrate.txt", "invalid_pid", true,
                       orte_migrate_globals.pid);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* Verbose mode implies the status progression and a request summary */
    if(orte_migrate_globals.verbose) {
        orte_migrate_globals.status = true;
        pretty_print_migration();
    }

 cleanup:
    return exit_status;
}
/**
 * Find the local HNP whose pid equals orte_migrate_globals.pid.
 *
 * Asks orte_list_local_hnps() for the HNPs available on this node and
 * searches the result for the requested pid. The matching contact
 * record is stored in orterun_hnp and stays owned by this file; every
 * other record is released.
 *
 * @return ORTE_SUCCESS when orterun_hnp is set, ORTE_ERROR when no
 *         matching HNP is known.
 */
static int find_hnp(void) {
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t hnp_list;
    opal_list_item_t *item;
    orte_hnp_contact_t *hnpcandidate;

    /* get the list of local hnp's available to us and setup
     * contact info for them into the RML
     */
    OBJ_CONSTRUCT(&hnp_list, opal_list_t);

    if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /* search the list for the desired hnp */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        hnpcandidate = (orte_hnp_contact_t*)item;
        if( hnpcandidate->pid == orte_migrate_globals.pid) {
            /* this is the one we want */
            orterun_hnp = hnpcandidate;
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /* Not the one we want. It is already off the list, so the
         * cleanup loop below will never see it: release it here. */
        OBJ_RELEASE(item);
    }

 cleanup:
    /* Release whatever is still on the list (entries after a match) */
    while (NULL != (item = opal_list_remove_first(&hnp_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&hnp_list);

    if( NULL == orterun_hnp ) {
        return ORTE_ERROR;
    } else {
        return exit_status;
    }
}
/**
 * Bring the tool up: utilities, command line, ORTE, output handle and
 * the message listener, in that order.
 *
 * @param argc  Argument count from main().
 * @param argv  Argument vector from main().
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int tool_init(int argc, char *argv[]) {
    int rc;
    char *crs_env_name = NULL;

    listener_started = false;

    /*
     * The utility layer has to be up before parse_args() so that
     * installdirs is set up properly by the time mca_base_open() runs.
     */
    rc = opal_init_util(&argc, &argv);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    /* Command line */
    rc = parse_args(argc, argv);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    /*
     * This tool itself is never migrated, so switch off the
     * notification routine. Must be done before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Force the 'none' crs component: no checkpointer is used here */
    crs_env_name = mca_base_param_env_var("crs");
    opal_setenv(crs_env_name, "none", true, &environ);
    free(crs_env_name);
    crs_env_name = NULL;

    /*
     * All of OPAL plus the TOOLS portion of ORTE, which is what lets
     * us talk to any HNP over the wire.
     */
    rc = orte_init(&argc, &argv, ORTE_PROC_TOOL);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    /* Output handle: a dedicated stream when verbose, else the default */
    if( !orte_migrate_globals.verbose ) {
        orte_migrate_globals.output = 0; /* Default=STDERR */
    } else {
        orte_migrate_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_migrate_globals.output,
                                  orte_migrate_globals.verbose_level);
    }

    /* Last step: start listening for messages from the HNP */
    return start_listener();
}
/**
 * Tear the tool down: cancel the listener, then finalize ORTE.
 * Both steps always run, even if the first one fails.
 *
 * @return The orte_finalize() error if it failed, otherwise the
 *         stop_listener() result.
 */
static int tool_finalize(void) {
    int listener_rc;
    int finalize_rc;

    listener_rc = stop_listener();
    finalize_rc = orte_finalize();

    /* A finalize failure takes precedence over a listener failure */
    return (ORTE_SUCCESS != finalize_rc) ? finalize_rc : listener_rc;
}
/**
 * Post a persistent, non-blocking receive for ORTE_RML_TAG_MIGRATE
 * messages from any sender; hnp_receiver() is the callback.
 *
 * @return ORTE_SUCCESS, or the error code from the RML.
 */
static int start_listener(void)
{
    int rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                     ORTE_RML_TAG_MIGRATE,
                                     ORTE_RML_PERSISTENT,
                                     hnp_receiver,
                                     NULL);

    /* Only record the listener as active once the receive is posted */
    if( ORTE_SUCCESS == rc ) {
        listener_started = true;
    }

    return rc;
}
/**
 * Cancel the receive posted by start_listener().
 *
 * @return ORTE_SUCCESS on success, ORTE_ERROR if no listener is
 *         active, or the error code from the RML.
 */
static int stop_listener(void)
{
    int rc;

    /* Nothing to cancel */
    if( !listener_started ) {
        return ORTE_ERROR;
    }

    rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
    if( ORTE_SUCCESS != rc ) {
        /* The listener stays marked as active when the cancel fails */
        return rc;
    }

    listener_started = false;
    return ORTE_SUCCESS;
}
/**
 * RML callback for messages received on ORTE_RML_TAG_MIGRATE.
 *
 * Unpacks the command flag from the buffer and dispatches on it:
 * an UPDATE command is forwarded to process_ckpt_update_cmd(), an
 * INIT command is ignored, anything else is logged as out of bounds.
 *
 * @param status  Unused.
 * @param sender  Name of the sending process.
 * @param buffer  Message payload; the command flag is read first.
 * @param tag     Unused.
 * @param cbdata  Unused.
 */
static void hnp_receiver(int status,
                         orte_process_name_t* sender,
                         opal_buffer_t* buffer,
                         orte_rml_tag_t tag,
                         void* cbdata)
{
    orte_errmgr_tool_cmd_flag_t cmd;
    orte_std_cntr_t n_items = 1;
    int unpack_rc;

    opal_output_verbose(5, orte_migrate_globals.output,
                        "orte_migrate: hnp_receiver: Receive a command message.");

    /* The first item in every message is the command flag */
    unpack_rc = opal_dss.unpack(buffer, &cmd, &n_items, ORTE_ERRMGR_MIGRATE_TOOL_CMD);
    if( ORTE_SUCCESS != unpack_rc ) {
        ORTE_ERROR_LOG(unpack_rc);
        return;
    }

    if( ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD == cmd ) {
        opal_output_verbose(10, orte_migrate_globals.output,
                            "orte_migrate: hnp_receiver: Status Update.");
        process_ckpt_update_cmd(sender, buffer);
    }
    else if( ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD == cmd ) {
        /* Nothing to do for an INIT command */
    }
    else {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
    }
}
/**
 * Handle a status update sent by the HNP.
 *
 * Unpacks one integer state from the buffer and publishes it in
 * orte_migrate_ckpt_status, which main() polls to decide when to stop
 * waiting. For the error states the matching help text is shown. For
 * any other state except FINISH a status line is printed when status
 * output is enabled; the FINISH line is printed by main().
 *
 * @param sender  Name of the sending process (unused).
 * @param buffer  Message payload positioned after the command flag.
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;

    /*
     * Receive the data:
     * - ckpt_state
     */
    if ( ORTE_SUCCESS != opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT) ) {
        /* Leave the published state untouched on a malformed message */
        return;
    }
    orte_migrate_ckpt_status = ckpt_status;

    /*
     * If the job is not able to be migrated, then return
     *
     * NOTE(review): this compares a migration state against a SnapC
     * checkpoint state constant, and main()'s wait loop does not treat
     * this value as terminal -- confirm both are intended.
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "non-ckptable",
                       true,
                       orte_migrate_globals.pid);
        return;
    }

    /*
     * If a migration is already in progress, then we must tell the user to
     * try again later.
     */
    if( ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "err-inprogress",
                       true,
                       orte_migrate_globals.pid);
        return;
    }

    /*
     * If there was an error, display a message and exit
     */
    if( ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status ) {
        opal_show_help("help-orte-migrate.txt", "err-other",
                       true,
                       orte_migrate_globals.pid);
        return;
    }

    /*
     * If we are to display the status progression
     */
    if( orte_migrate_globals.status &&
        ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status ) {
        pretty_print_status();
    }
}
/**
 * Send the migration request to the HNP stored in orterun_hnp.
 *
 * The message carries, in this order:
 *   - the INIT command flag
 *   - the rank list   (off_procs)
 *   - the node list   (off_nodes)
 *   - the target list (onto_nodes)
 *
 * timer_start is taken just before the message is assembled. On any
 * failure the "unable_to_connect" help text is shown.
 *
 * @return ORTE_SUCCESS, ORTE_ERROR if no buffer could be created, or
 *         the error code of the failing pack/send call.
 */
static int notify_hnp(void)
{
    orte_errmgr_tool_cmd_flag_t cmd = ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD;
    opal_buffer_t *msg = NULL;
    char **fields[3];
    int rc = ORTE_SUCCESS;
    int idx;

    /* String payload, in wire order */
    fields[0] = &(orte_migrate_globals.off_procs);
    fields[1] = &(orte_migrate_globals.off_nodes);
    fields[2] = &(orte_migrate_globals.onto_nodes);

    msg = OBJ_NEW(opal_buffer_t);
    if( NULL == msg ) {
        rc = ORTE_ERROR;
        goto done;
    }

    opal_output_verbose(10, orte_migrate_globals.output,
                        "orte_migrate: notify_hnp: Contact Head Node Process PID %d\n",
                        orte_migrate_globals.pid);

    timer_start = get_time();

    /* Command flag first ... */
    rc = opal_dss.pack(msg, &cmd, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD);
    if( ORTE_SUCCESS != rc ) {
        goto done;
    }

    /* ... then the three strings */
    for(idx = 0; idx < 3; ++idx) {
        rc = opal_dss.pack(msg, fields[idx], 1, OPAL_STRING);
        if( ORTE_SUCCESS != rc ) {
            goto done;
        }
    }

    rc = orte_rml.send_buffer(&(orterun_hnp->name), msg, ORTE_RML_TAG_MIGRATE, 0);
    if( 0 > rc ) {
        goto done;
    }
    /* Only a negative send result is a failure */
    rc = ORTE_SUCCESS;

 done:
    if( NULL != msg ) {
        OBJ_RELEASE(msg);
        msg = NULL;
    }

    if( ORTE_SUCCESS != rc ) {
        opal_show_help("help-orte-migrate.txt", "unable_to_connect", true,
                       orte_migrate_globals.pid);
    }

    return rc;
}
/***************
* Pretty Print
***************/
/**
 * Current wall-clock time in seconds, as a double.
 *
 * Uses the OPAL microsecond timer when OPAL_TIMER_USEC_NATIVE is set
 * and gettimeofday() otherwise.
 */
static double get_time(void) {
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);

    /* Whole seconds plus the microsecond remainder */
    seconds  = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
#endif
}
/**
 * Print one status line for the current migration state.
 *
 * The line shows the seconds elapsed since the previous status line,
 * the seconds elapsed since timer_start, and the state name
 * right-aligned in a 25 character field. timer_last is updated so the
 * next call measures from this one.
 *
 * @return Always ORTE_SUCCESS.
 */
static int pretty_print_status(void) {
    char *state_name = NULL;
    const double now = get_time();

    /* First status line: measure the step from "now" */
    if( 0 == timer_last ) {
        timer_last = now;
    }

    orte_errmgr_base_migrate_state_str(&state_name, orte_migrate_ckpt_status);

    opal_output(0,
                "[%6.2f / %6.2f] %*s - ...\n",
                (now - timer_last), (now - timer_start),
                25, state_name);

    /* free(NULL) is a no-op, so no guard is needed */
    free(state_name);

    timer_last = now;

    return ORTE_SUCCESS;
}
/**
 * Print a summary of the migration request to stdout.
 *
 * Each of the three comma separated lists in orte_migrate_globals
 * (nodes to leave, ranks to move, nodes to move onto) is split and
 * printed one quoted entry per line under a header showing the entry
 * count. A list that was not supplied is reported with a count of 0.
 *
 * @return Always ORTE_SUCCESS.
 */
static int pretty_print_migration(void)
{
    char **loc_off_nodes = NULL;
    char **loc_off_procs = NULL;
    char **loc_onto_nodes = NULL;
    int loc_off_nodes_cnt = 0;
    int loc_off_procs_cnt = 0;
    int loc_onto_cnt = 0;
    int i;

    if( NULL != orte_migrate_globals.off_nodes ) {
        loc_off_nodes = opal_argv_split(orte_migrate_globals.off_nodes, ',');
        loc_off_nodes_cnt = opal_argv_count(loc_off_nodes);
    }

    if( NULL != orte_migrate_globals.off_procs ) {
        loc_off_procs = opal_argv_split(orte_migrate_globals.off_procs, ',');
        loc_off_procs_cnt = opal_argv_count(loc_off_procs);
    }

    if( NULL != orte_migrate_globals.onto_nodes ) {
        loc_onto_nodes = opal_argv_split(orte_migrate_globals.onto_nodes, ',');
        loc_onto_cnt = opal_argv_count(loc_onto_nodes);
    }

    printf("Migrate Nodes: (%d nodes)\n", loc_off_nodes_cnt);
    for(i = 0; i < loc_off_nodes_cnt; ++i) {
        printf("\t\"%s\"\n", loc_off_nodes[i]);
    }

    printf("Migrate Ranks: (%d ranks)\n", loc_off_procs_cnt);
    for(i = 0; i < loc_off_procs_cnt; ++i) {
        printf("\t\"%s\"\n", loc_off_procs[i]);
    }

    printf("Migrate Onto : (%d nodes)\n", loc_onto_cnt);
    for(i = 0; i < loc_onto_cnt; ++i) {
        printf("\t\"%s\"\n", loc_onto_nodes[i]);
    }

    /* The split arrays are private copies: release them before returning */
    if( NULL != loc_off_nodes ) {
        opal_argv_free(loc_off_nodes);
        loc_off_nodes = NULL;
    }
    if( NULL != loc_off_procs ) {
        opal_argv_free(loc_off_procs);
        loc_off_procs = NULL;
    }
    if( NULL != loc_onto_nodes ) {
        opal_argv_free(loc_onto_nodes);
        loc_onto_nodes = NULL;
    }

    return ORTE_SUCCESS;
}