
This commit looks larger than it really is since it includes a fair amount of code cleanup. The SIGSTOP/SIGCONT+checkpointing work uses some of the functionality in r20391. Basic use case below (note that the checkpoint generated is usable as usual if the stopped application is terminated). {{{ shell 1) mpirun -np 2 -am ft-enable-cr my-app ... running ... shell 2) ompi-checkpoint --stop -v MPIRUN_PID [localhost:001300] [ 0.00 / 0.20] Requested - ... [localhost:001300] [ 0.00 / 0.20] Pending - ... [localhost:001300] [ 0.01 / 0.21] Running - ... [localhost:001300] [ 1.01 / 1.22] Stopped - ompi_global_snapshot_1234.ckpt Snapshot Ref.: 0 ompi_global_snapshot_1234.ckpt shell 2) killall -CONT mpirun ... Application Continues execution in shell 1 ... }}} Other items in this commit are mostly cleanup that has been sitting off-trunk for too long: * Add a new {{{opal_crs_base_ckpt_options_t}}} type that encapsulates the various options that could be passed to the CRS. Currently only TERM and STOP, but this makes adding others ''much'' easier. * Eliminate ORTE_SNAPC_CKPT_STATE_PENDING_TERM, since it served a redundant purpose with the new options type. * Lay some basic groundwork for some future features. This commit was SVN r21995. The following SVN revision numbers were found above: r20391 --> open-mpi/ompi@0704b98668
165 строки
4.2 KiB
C
165 строки
4.2 KiB
C
/*
|
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
#include <sys/types.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/constants.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
#include "opal/runtime/opal_cr.h"
|
|
|
|
#include "crs_none.h"
|
|
|
|
int opal_crs_none_module_init(void)
|
|
{
|
|
/*
|
|
* If not a tool, and requesting C/R support print a warning.
|
|
*/
|
|
if( opal_crs_none_select_warning &&
|
|
!opal_cr_is_tool && opal_cr_is_enabled ) {
|
|
opal_show_help("help-opal-crs-none.txt", "none:select-warning",
|
|
true);
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_none_module_finalize(void)
|
|
{
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_none_checkpoint(pid_t pid,
|
|
opal_crs_base_snapshot_t *snapshot,
|
|
opal_crs_base_ckpt_options_t *options,
|
|
opal_crs_state_type_t *state)
|
|
{
|
|
int ret;
|
|
|
|
*state = OPAL_CRS_CONTINUE;
|
|
|
|
snapshot->component_name = strdup("none");
|
|
snapshot->reference_name = strdup("none");
|
|
snapshot->local_location = strdup("");
|
|
snapshot->remote_location = strdup("");
|
|
snapshot->cold_start = false;
|
|
|
|
/*
|
|
* Update the snapshot metadata
|
|
*/
|
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, "none") ) ) {
|
|
opal_output(0,
|
|
"crs:none: checkpoint(): Error: Unable to write component name to the directory for (%s).",
|
|
snapshot->reference_name);
|
|
return ret;
|
|
}
|
|
|
|
if( options->stop ) {
|
|
opal_output(0,
|
|
"crs:none: checkpoint(): Error: SIGSTOP Not currently supported!");
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
 * Restart entry point of the "none" CRS module.
 *
 * Reads the CRS_METADATA_CONTEXT token from the snapshot's local location,
 * splits its first entry on spaces into an argument vector, and replaces
 * the current process image with that command via execvp().
 *
 * base_snapshot  Snapshot whose local_location is used to read the metadata.
 * spawn_child    If true, the caller asks for a child process to be spawned;
 *                this is not implemented.
 * child_pid      Output: set to the PID of the calling process.
 *
 * Returns:
 *   OPAL_SUCCESS              if no command line was found in the metadata
 *   OPAL_ERROR                if the command line could not be split
 *   OPAL_ERR_NOT_IMPLEMENTED  if spawn_child is true
 *   the execvp() return value if execvp() fails (it does not return on
 *                             success)
 *
 * All temporary argument vectors and strings are released before returning.
 */
int opal_crs_none_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    char **tmp_argv = NULL;
    char **cr_argv  = NULL;
    char  *cmd_line = NULL;
    int status;

    *child_pid = getpid();

    /*
     * The return code is intentionally not checked: if nothing could be read,
     * the "no command line" path below is taken.
     */
    opal_crs_base_metadata_read_token(base_snapshot->local_location, CRS_METADATA_CONTEXT, &tmp_argv);

    if( opal_argv_count(tmp_argv) <= 0 ) {
        opal_output_verbose(10, opal_crs_base_output,
                            "crs:none: none_restart: No command line to exec, so just returning");
        opal_argv_free(tmp_argv);
        return OPAL_SUCCESS;
    }

    /* Only the first stored entry is used; it holds the full command line. */
    cr_argv = opal_argv_split(tmp_argv[0], ' ');
    opal_argv_free(tmp_argv);
    tmp_argv = NULL;

    if( NULL == cr_argv ) {
        return OPAL_ERROR;
    }

    if( spawn_child ) {
        opal_output(opal_crs_base_output,
                    "crs:none: none_restart: Spawn not implemented");
        opal_argv_free(cr_argv);
        return OPAL_ERR_NOT_IMPLEMENTED;
    }

    /* Build the joined command line only for logging, and release it after. */
    cmd_line = opal_argv_join(cr_argv, ' ');
    opal_output_verbose(10, opal_crs_base_output,
                        "crs:none: none_restart: exec :(%s, %s):",
                        cr_argv[0],
                        (NULL != cmd_line) ? cmd_line : "");
    free(cmd_line);
    cmd_line = NULL;

    status = execvp(cr_argv[0], cr_argv);

    /* execvp() only returns when it failed to replace the process image. */
    if( status < 0 ) {
        opal_output(opal_crs_base_output,
                    "crs:none: none_restart: Child failed to execute :(%d):", status);
    }
    opal_output(opal_crs_base_output,
                "crs:none: none_restart: execvp returned %d", status);

    opal_argv_free(cr_argv);
    return status;
}
|
|
|
|
int opal_crs_none_disable_checkpoint(void)
|
|
{
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_none_enable_checkpoint(void)
|
|
{
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_none_prelaunch(int32_t rank,
|
|
char *base_snapshot_dir,
|
|
char **app,
|
|
char **cwd,
|
|
char ***argv,
|
|
char ***env)
|
|
{
|
|
char * tmp_env_var = NULL;
|
|
|
|
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
|
opal_setenv(tmp_env_var,
|
|
"0", true, env);
|
|
free(tmp_env_var);
|
|
tmp_env_var = NULL;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int opal_crs_none_reg_thread(void)
|
|
{
|
|
return OPAL_SUCCESS;
|
|
}
|