b5ac320826
* Work around duplicate node names in the map. It should not happen normally, but if the rmaps component gets this wrong provide a work around. Ralph is working on a rmaps fix for this, so we will likely remove/comment out the fix later. This commit was SVN r25572.
860 строки
27 KiB
C
860 строки
27 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
#ifdef HAVE_DIRENT_H
|
|
#include <dirent.h>
|
|
#endif /* HAVE_DIRENT_H */
|
|
#include <time.h>
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/util/os_dirpath.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/basename.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/mca/crs/crs.h"
|
|
#include "opal/mca/crs/base/base.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/rml/rml_types.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/mca/notifier/notifier.h"
|
|
|
|
#include "orte/mca/sstore/sstore.h"
|
|
#include "orte/mca/sstore/base/base.h"
|
|
|
|
#include "orte/mca/snapc/snapc.h"
|
|
#include "orte/mca/snapc/base/base.h"
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
size_t orte_snapc_base_snapshot_seq_number = 0;
|
|
|
|
/******************
|
|
* Object stuff
|
|
******************/
|
|
OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
|
|
opal_list_item_t,
|
|
orte_snapc_base_local_snapshot_construct,
|
|
orte_snapc_base_local_snapshot_destruct);
|
|
|
|
void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
|
|
{
|
|
snapshot->process_name.jobid = 0;
|
|
snapshot->process_name.vpid = 0;
|
|
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
|
|
|
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
|
|
snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
}
|
|
|
|
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
|
|
{
|
|
snapshot->process_name.jobid = 0;
|
|
snapshot->process_name.vpid = 0;
|
|
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
|
|
|
|
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
|
|
|
snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
}
|
|
|
|
/****/
|
|
OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
|
|
opal_list_item_t,
|
|
orte_snapc_base_global_snapshot_construct,
|
|
orte_snapc_base_global_snapshot_destruct);
|
|
|
|
void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
|
|
{
|
|
OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
|
|
|
|
snapshot->options = OBJ_NEW(opal_crs_base_ckpt_options_t);
|
|
|
|
snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
}
|
|
|
|
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
|
|
{
|
|
opal_list_item_t* item = NULL;
|
|
|
|
while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&(snapshot->local_snapshots));
|
|
|
|
if( NULL != snapshot->options ) {
|
|
OBJ_RELEASE(snapshot->options);
|
|
snapshot->options = NULL;
|
|
}
|
|
|
|
snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
}
|
|
|
|
OBJ_CLASS_INSTANCE(orte_snapc_base_quiesce_t,
|
|
opal_object_t,
|
|
orte_snapc_base_quiesce_construct,
|
|
orte_snapc_base_quiesce_destruct);
|
|
|
|
void orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t *quiesce)
|
|
{
|
|
quiesce->epoch = -1;
|
|
quiesce->snapshot = NULL;
|
|
quiesce->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
quiesce->ss_snapshot = NULL;
|
|
quiesce->handle = NULL;
|
|
quiesce->target_dir = NULL;
|
|
quiesce->crs_name = NULL;
|
|
quiesce->cmdline = NULL;
|
|
quiesce->cr_state = OPAL_CRS_NONE;
|
|
quiesce->checkpointing = false;
|
|
quiesce->restarting = false;
|
|
|
|
quiesce->migrating = false;
|
|
quiesce->num_migrating = 0;
|
|
OBJ_CONSTRUCT(&(quiesce->migrating_procs), opal_pointer_array_t);
|
|
opal_pointer_array_init(&(quiesce->migrating_procs), 8, INT32_MAX, 8);
|
|
}
|
|
|
|
void orte_snapc_base_quiesce_destruct( orte_snapc_base_quiesce_t *quiesce)
|
|
{
|
|
int i;
|
|
void *item = NULL;
|
|
|
|
quiesce->epoch = -1;
|
|
|
|
if( NULL != quiesce->snapshot ) {
|
|
OBJ_RELEASE(quiesce->snapshot);
|
|
quiesce->snapshot = NULL;
|
|
}
|
|
|
|
quiesce->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
if( NULL != quiesce->ss_snapshot ) {
|
|
OBJ_RELEASE(quiesce->ss_snapshot);
|
|
quiesce->ss_snapshot = NULL;
|
|
}
|
|
|
|
if( NULL != quiesce->handle ) {
|
|
free(quiesce->handle);
|
|
quiesce->handle = NULL;
|
|
}
|
|
if( NULL != quiesce->target_dir ) {
|
|
free(quiesce->target_dir);
|
|
quiesce->target_dir = NULL;
|
|
}
|
|
if( NULL != quiesce->crs_name ) {
|
|
free(quiesce->crs_name);
|
|
quiesce->crs_name = NULL;
|
|
}
|
|
if( NULL != quiesce->cmdline ) {
|
|
free(quiesce->cmdline);
|
|
quiesce->cmdline = NULL;
|
|
}
|
|
|
|
quiesce->cr_state = OPAL_CRS_NONE;
|
|
quiesce->checkpointing = false;
|
|
quiesce->restarting = false;
|
|
|
|
quiesce->migrating = false;
|
|
quiesce->num_migrating = 0;
|
|
for( i = 0; i < quiesce->migrating_procs.size; ++i) {
|
|
item = opal_pointer_array_get_item(&(quiesce->migrating_procs), i);
|
|
if( NULL != item ) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
}
|
|
OBJ_DESTRUCT(&(quiesce->migrating_procs));
|
|
}
|
|
|
|
OBJ_CLASS_INSTANCE(orte_snapc_base_request_op_t,
|
|
opal_object_t,
|
|
orte_snapc_base_request_op_construct,
|
|
orte_snapc_base_request_op_destruct);
|
|
|
|
void orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t *op)
|
|
{
|
|
op->event = ORTE_SNAPC_OP_NONE;
|
|
op->is_active = false;
|
|
op->leader = -1;
|
|
|
|
op->seq_num = -1;
|
|
op->global_handle = NULL;
|
|
op->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
|
|
op->mig_num = -1;
|
|
op->mig_vpids = NULL;
|
|
/*op->mig_host_pref = NULL;*/
|
|
op->mig_vpid_pref = NULL;
|
|
op->mig_off_node = NULL;
|
|
}
|
|
|
|
void orte_snapc_base_request_op_destruct( orte_snapc_base_request_op_t *op)
|
|
{
|
|
op->event = ORTE_SNAPC_OP_NONE;
|
|
op->is_active = false;
|
|
op->leader = -1;
|
|
|
|
op->seq_num = -1;
|
|
if(NULL != op->global_handle ) {
|
|
free(op->global_handle);
|
|
op->global_handle = NULL;
|
|
}
|
|
|
|
op->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
|
|
|
op->mig_num = -1;
|
|
/*
|
|
if( NULL != op->mig_vpids ) {
|
|
free( op->mig_vpids );
|
|
op->mig_vpids = NULL;
|
|
}
|
|
|
|
if( NULL != op->mig_host_pref ) {
|
|
free( op->mig_host_pref );
|
|
op->mig_host_pref = NULL;
|
|
}
|
|
|
|
if( NULL != op->mig_vpid_pref ) {
|
|
free( op->mig_vpid_pref );
|
|
op->mig_vpid_pref = NULL;
|
|
}
|
|
|
|
if( NULL != op->mig_off_node ) {
|
|
free( op->mig_off_node );
|
|
op->mig_off_node = NULL;
|
|
}
|
|
*/
|
|
}
|
|
|
|
|
|
/***********************
|
|
* None component stuff
|
|
************************/
|
|
int orte_snapc_base_none_open(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_none_close(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_none_query(mca_base_module_t **module, int *priority)
|
|
{
|
|
*module = NULL;
|
|
*priority = 0;
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_module_init(bool seed, bool app)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_module_finalize(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/* None RML command line response callback */
|
|
static void snapc_none_global_cmdline_request(int status,
|
|
orte_process_name_t* sender,
|
|
opal_buffer_t *buffer,
|
|
orte_rml_tag_t tag,
|
|
void* cbdata);
|
|
int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
|
|
{
|
|
int exit_status = ORTE_SUCCESS;
|
|
int rc;
|
|
|
|
/*
|
|
* Coordinator command listener
|
|
*/
|
|
orte_snapc_base_snapshot_seq_number = -1;
|
|
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
|
ORTE_RML_TAG_CKPT,
|
|
ORTE_RML_PERSISTENT,
|
|
snapc_none_global_cmdline_request,
|
|
NULL))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
exit_status = rc;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
int orte_snapc_base_none_release_job(orte_jobid_t jobid)
|
|
{
|
|
/*
|
|
* Remove the checkpoint request callback
|
|
*/
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_none_ft_event(int state)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_none_start_ckpt(orte_snapc_base_quiesce_t *datum)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
int orte_snapc_base_none_end_ckpt(orte_snapc_base_quiesce_t *datum)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/********************
|
|
* Local Functions
|
|
********************/
|
|
/* None RML response callback */
|
|
static void snapc_none_global_cmdline_request(int status,
|
|
orte_process_name_t* sender,
|
|
opal_buffer_t *buffer,
|
|
orte_rml_tag_t tag,
|
|
void* cbdata)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_snapc_cmd_flag_t command;
|
|
orte_std_cntr_t n = 1;
|
|
opal_crs_base_ckpt_options_t *options = NULL;
|
|
orte_jobid_t jobid;
|
|
|
|
options = OBJ_NEW(opal_crs_base_ckpt_options_t);
|
|
|
|
n = 1;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* orte_checkpoint has requested that a checkpoint be taken
|
|
* Respond that a checkpoint cannot be taken at this time
|
|
*/
|
|
if (ORTE_SNAPC_GLOBAL_INIT_CMD == command) {
|
|
/*
|
|
* Do the basic handshake with the orte_checkpoint command
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, options, &jobid)) ) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Respond with an invalid response
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, 0, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
/*
|
|
* Unknown command
|
|
*/
|
|
else {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
if( NULL != options ) {
|
|
OBJ_RELEASE(options);
|
|
options = NULL;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/********************
|
|
* Utility functions
|
|
********************/
|
|
|
|
/* Report the checkpoint status over the notifier interface */
|
|
void orte_snapc_ckpt_state_notify(int state)
|
|
{
|
|
switch(state) {
|
|
case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
|
|
orte_notifier.log(ORTE_NOTIFIER_INFO, ORTE_SNAPC_CKPT_NOTIFY(state),
|
|
"%d: Checkpoint established for process %s.",
|
|
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
|
|
orte_notifier.log(ORTE_NOTIFIER_WARN, ORTE_SNAPC_CKPT_NOTIFY(state),
|
|
"%d: Process %s is not checkpointable.",
|
|
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
|
orte_notifier.log(ORTE_NOTIFIER_WARN, ORTE_SNAPC_CKPT_NOTIFY(state),
|
|
"%d: Failed to checkpoint process %s.",
|
|
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_RECOVERED:
|
|
orte_notifier.log(ORTE_NOTIFIER_INFO, ORTE_SNAPC_CKPT_NOTIFY(state),
|
|
"%d: Successfully restarted process %s.",
|
|
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_NO_RESTART:
|
|
orte_notifier.log(ORTE_NOTIFIER_WARN, ORTE_SNAPC_CKPT_NOTIFY(state),
|
|
"%d: Failed to restart process %s.",
|
|
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
break;
|
|
/* ADK: We currently do not notify for these states, but good to
|
|
* have them around anyways. */
|
|
case ORTE_SNAPC_CKPT_STATE_NONE:
|
|
case ORTE_SNAPC_CKPT_STATE_REQUEST:
|
|
case ORTE_SNAPC_CKPT_STATE_PENDING:
|
|
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
|
case ORTE_SNAPC_CKPT_STATE_STOPPED:
|
|
case ORTE_SNAPC_CKPT_STATE_MIGRATING:
|
|
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
|
opal_buffer_t* buffer,
|
|
opal_crs_base_ckpt_options_t *options,
|
|
orte_jobid_t *jobid)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_std_cntr_t count = 1;
|
|
orte_ns_cmp_bitmask_t mask;
|
|
|
|
mask = ORTE_NS_CMP_ALL;
|
|
|
|
/*
|
|
* Do not send to self, as that is silly.
|
|
*/
|
|
if (OPAL_EQUAL ==
|
|
orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_init_cmd: Receiving commands\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
|
|
|
|
/********************
|
|
* Receive command line checkpoint request:
|
|
* - Command (already received)
|
|
* - options
|
|
* - jobid
|
|
********************/
|
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_unpack_options(buffer, options)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_init_cmd: Error: Unpack (options) Failure (ret = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, jobid, &count, ORTE_JOBID)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ret, __LINE__);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_init_cmd: Received [%d, %d, %s]\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
(int)(options->term),
|
|
(int)(options->stop),
|
|
ORTE_JOBID_PRINT(*jobid)));
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
int orte_snapc_base_unpack_options(opal_buffer_t* buffer,
|
|
opal_crs_base_ckpt_options_t *options)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
orte_std_cntr_t count = 1;
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->term), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (term) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->stop), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (stop) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->inc_prep_only), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (inc_prep_only) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->inc_recover_only), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (inc_recover_only) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
#if OPAL_ENABLE_CRDEBUG == 1
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->attach_debugger), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (attach_debugger) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
count = 1;
|
|
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(options->detach_debugger), &count, OPAL_BOOL)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"snapc:base:unpack_options: Error: Unpack (detach_debugger) Failure (ret = %d)\n",
|
|
ret);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
#endif
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
int orte_snapc_base_pack_options(opal_buffer_t* buffer,
|
|
opal_crs_base_ckpt_options_t *options)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->term), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->stop), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->inc_prep_only), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->inc_recover_only), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
#if OPAL_ENABLE_CRDEBUG == 1
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->attach_debugger), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(options->detach_debugger), 1, OPAL_BOOL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
#endif
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
|
orte_sstore_base_handle_t ss_handle,
|
|
int ckpt_status)
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
opal_buffer_t *loc_buffer = NULL;
|
|
orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_UPDATE_CMD;
|
|
char *global_snapshot_handle = NULL;
|
|
char *tmp_str = NULL;
|
|
int seq_num;
|
|
orte_ns_cmp_bitmask_t mask;
|
|
|
|
/*
|
|
* Noop if invalid peer, or peer not specified (JJH Double check this)
|
|
*/
|
|
if( NULL == peer ||
|
|
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
|
|
/*return ORTE_ERR_BAD_PARAM;*/
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
mask = ORTE_NS_CMP_ALL;
|
|
|
|
/*
|
|
* Do not send to self, as that is silly.
|
|
*/
|
|
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Pass on the checkpoint state over the notifier interface.
|
|
*/
|
|
orte_snapc_ckpt_state_notify(ckpt_status);
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Sending update command <status %d>\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ckpt_status));
|
|
|
|
/********************
|
|
* Send over the status of the checkpoint
|
|
* - ckpt_state
|
|
* - global snapshot handle (upon finish only)
|
|
* - sequence number (upon finish only)
|
|
********************/
|
|
if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
|
|
exit_status = ORTE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &ckpt_status, 1, OPAL_INT))) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ret, __LINE__);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SNAPC_CKPT_STATE_RECOVERED == ckpt_status ||
|
|
ORTE_SNAPC_CKPT_STATE_ESTABLISHED == ckpt_status ||
|
|
ORTE_SNAPC_CKPT_STATE_STOPPED == ckpt_status ||
|
|
ORTE_SNAPC_CKPT_STATE_ERROR == ckpt_status ) {
|
|
|
|
if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
|
|
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
|
|
SSTORE_METADATA_GLOBAL_SNAP_REF,
|
|
&global_snapshot_handle)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
|
|
ORTE_ERROR_LOG(ret);
|
|
/* Do not exit here, continue so that we can inform the tool
|
|
* that the checkpoint has failed
|
|
*/
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
|
|
SSTORE_METADATA_GLOBAL_SNAP_SEQ,
|
|
&tmp_str)) ) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
|
|
ORTE_ERROR_LOG(ret);
|
|
/* Do not exit here, continue so that we can inform the tool
|
|
* that the checkpoint has failed
|
|
*/
|
|
}
|
|
|
|
if( NULL != tmp_str ) {
|
|
seq_num = atoi(tmp_str);
|
|
} else {
|
|
seq_num = -1;
|
|
}
|
|
} else {
|
|
/* Checkpoint Error Case */
|
|
global_snapshot_handle = NULL;
|
|
seq_num = -1;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ckpt_status, global_snapshot_handle, seq_num));
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &global_snapshot_handle, 1, OPAL_STRING))) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ret, __LINE__);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &seq_num, 1, OPAL_INT))) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ret, __LINE__);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
|
|
opal_output(orte_snapc_base_output,
|
|
"%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
|
|
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
|
|
ret, __LINE__);
|
|
ORTE_ERROR_LOG(ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
if(NULL != loc_buffer) {
|
|
OBJ_RELEASE(loc_buffer);
|
|
loc_buffer = NULL;
|
|
}
|
|
if( NULL != global_snapshot_handle ){
|
|
free(global_snapshot_handle);
|
|
global_snapshot_handle = NULL;
|
|
}
|
|
if( NULL != tmp_str ) {
|
|
free(tmp_str);
|
|
tmp_str = NULL;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
/****************************
|
|
* Command line tool request functions
|
|
****************************/
|
|
/* JJH TODO - Move the command line functions here ? */
|
|
|
|
/*****************************
|
|
* Snapshot metadata functions
|
|
*****************************/
|
|
int orte_snapc_ckpt_state_str(char ** state_str, int state)
|
|
{
|
|
switch(state) {
|
|
case ORTE_SNAPC_CKPT_STATE_NONE:
|
|
*state_str = strdup(" -- ");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_REQUEST:
|
|
*state_str = strdup("Requested");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_PENDING:
|
|
*state_str = strdup("Pending");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
|
*state_str = strdup("Running");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_STOPPED:
|
|
*state_str = strdup("Stopped");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_MIGRATING:
|
|
*state_str = strdup("Migrating");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
|
|
*state_str = strdup("Checkpoint Established");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_RECOVERED:
|
|
*state_str = strdup("Continuing/Recovered");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
|
|
*state_str = strdup("Locally Finished");
|
|
break;
|
|
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
|
*state_str = strdup("Error");
|
|
break;
|
|
default:
|
|
asprintf(state_str, "Unknown %d", state);
|
|
break;
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|