Move in some accumulated small features and minor bug fixes for C/R support.
{{{ svn merge -r 16447:16475 https://svn.open-mpi.org/svn/ompi/tmp/jjh-fgs . }}} This commit was SVN r16478.
Этот коммит содержится в:
родитель
c143858998
Коммит
0bf61a1b84
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -20,6 +22,7 @@
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall CRS
|
||||
@ -88,6 +91,13 @@ extern "C" {
|
||||
int opal_crs_base_none_disable_checkpoint(void);
|
||||
int opal_crs_base_none_enable_checkpoint(void);
|
||||
|
||||
int opal_crs_base_none_prelaunch(int32_t rank,
|
||||
char *base_snapshot_dir,
|
||||
char **app,
|
||||
char **cwd,
|
||||
char ***argv,
|
||||
char ***env);
|
||||
|
||||
/**
|
||||
* Some utility functions
|
||||
*/
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -153,12 +155,24 @@ int opal_crs_base_none_enable_checkpoint(void)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int opal_crs_base_none_prelaunch(int32_t rank,
|
||||
char *base_snapshot_dir,
|
||||
char **app,
|
||||
char **cwd,
|
||||
char ***argv,
|
||||
char ***env)
|
||||
{
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
"0", true, env);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Utility functions
|
||||
*/
|
||||
char * opal_crs_base_unique_snapshot_name(pid_t pid)
|
||||
{
|
||||
char * loc_str;
|
||||
char * loc_str = NULL;
|
||||
|
||||
asprintf(&loc_str, "opal_snapshot_%d.ckpt", pid);
|
||||
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -77,7 +79,10 @@ static opal_crs_base_module_t none_module = {
|
||||
/** Disable checkpoints */
|
||||
opal_crs_base_none_disable_checkpoint,
|
||||
/** Enable checkpoints */
|
||||
opal_crs_base_none_enable_checkpoint
|
||||
opal_crs_base_none_enable_checkpoint,
|
||||
|
||||
/** Prelaunch */
|
||||
opal_crs_base_none_prelaunch
|
||||
};
|
||||
|
||||
int opal_crs_base_select(void)
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -59,7 +61,10 @@ static opal_crs_base_module_t blcr_module = {
|
||||
/** Disable checkpoints */
|
||||
opal_crs_blcr_disable_checkpoint,
|
||||
/** Enable checkpoints */
|
||||
opal_crs_blcr_enable_checkpoint
|
||||
opal_crs_blcr_enable_checkpoint,
|
||||
|
||||
/** Prelaunch */
|
||||
opal_crs_base_none_prelaunch
|
||||
};
|
||||
|
||||
/***************************
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -183,7 +185,27 @@ typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)
|
||||
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
|
||||
(void);
|
||||
|
||||
|
||||
/**
|
||||
* Prepare the CRS component for process launch.
|
||||
* Some CRS components need to take action before the
|
||||
* process is ever launched to do such things as:
|
||||
* - seed the process environment
|
||||
* - LD_PRELOAD
|
||||
* - Analyze the binary before launch
|
||||
*
|
||||
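* @param rank Rank of the process to be started
|
||||
* @param base_snapshot_dir Base snapshot directory (the ODLS call site passes orte_snapc_base_global_snapshot_loc)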
|
||||
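* @param app Absolute pathname of argv[0]
|
||||
* @param cwd Pointer to the working directory string of the process (the ODLS call site passes &app->cwd); presumably may be updated by the component - TODO confirm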
|
||||
* @param argv Standard argv-style array, including a final NULL pointer
|
||||
* @param env Standard environ-style array, including a final NULL pointer
|
||||
*/
|
||||
typedef int (*opal_crs_base_module_prelaunch_fn_t)
|
||||
(int32_t rank,
|
||||
char *base_snapshot_dir,
|
||||
char **app,
|
||||
char **cwd,
|
||||
char ***argv,
|
||||
char ***env);
|
||||
|
||||
/**
|
||||
* Structure for CRS v1.0.0 components.
|
||||
*/
|
||||
@ -225,6 +247,9 @@ struct opal_crs_base_module_1_0_0_t {
|
||||
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
|
||||
/** Enable checkpoints */
|
||||
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;
|
||||
|
||||
/** Pre Launch */
|
||||
opal_crs_base_module_prelaunch_fn_t crs_prelaunch;
|
||||
};
|
||||
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t;
|
||||
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t;
|
||||
|
@ -8,7 +8,9 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -58,7 +60,10 @@ static opal_crs_base_module_t loc_module = {
|
||||
/** Disable checkpoints */
|
||||
opal_crs_self_disable_checkpoint,
|
||||
/** Enable checkpoints */
|
||||
opal_crs_self_enable_checkpoint
|
||||
opal_crs_self_enable_checkpoint,
|
||||
|
||||
/** Prelaunch */
|
||||
opal_crs_base_none_prelaunch
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -181,15 +181,18 @@ int opal_cr_init(void )
|
||||
opal_output_verbose(10, opal_cr_output,
|
||||
"opal_cr: init: OPAL CR Allow OPAL Only: %d",
|
||||
val);
|
||||
|
||||
mca_base_param_reg_int_name("opal_cr", "is_tool",
|
||||
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
|
||||
false, false,
|
||||
false,
|
||||
0,
|
||||
&val);
|
||||
if(!val)
|
||||
opal_cr_is_tool = false;
|
||||
else
|
||||
if(0 != val) {
|
||||
opal_cr_is_tool = true;
|
||||
}
|
||||
else {
|
||||
opal_cr_is_tool = false;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, opal_cr_output,
|
||||
"opal_cr: init: Is a tool program: %d",
|
||||
|
@ -10,6 +10,8 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -33,6 +35,9 @@ Error: The filename (%s) is invalid because either you have not provided a filen
|
||||
Error: Unable to obtain the proper restart command to restart from the
|
||||
checkpoint file (%s). Returned %d.
|
||||
|
||||
[comp_open_failure]
|
||||
Error: Unable to open the %s framework.
|
||||
|
||||
[comp_select_failure]
|
||||
Error: Unable to select the %s component needed to restart this
|
||||
application. (Returned %d)
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -197,6 +199,13 @@ main(int argc, char *argv[])
|
||||
* restart on this node because it doesn't have the proper checkpointer
|
||||
* available.
|
||||
*/
|
||||
if( OPAL_SUCCESS != (ret = opal_crs_base_open()) ) {
|
||||
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
||||
"crs", ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
|
||||
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
||||
expected_crs_comp, ret);
|
||||
@ -233,9 +242,8 @@ main(int argc, char *argv[])
|
||||
"\t Exec in self");
|
||||
}
|
||||
|
||||
/* We are launching a program that is not going to be a tool :) */
|
||||
opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
&environ);
|
||||
/* JJH: Do not unsetenv(opal_cr_is_tool) here, as it will impact the
|
||||
* JJH: application improperly. */
|
||||
|
||||
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
||||
snapshot->cold_start = true;
|
||||
@ -381,10 +389,10 @@ static int parse_args(int argc, char *argv[])
|
||||
for(i = 0; i < len; ++i) {
|
||||
putenv(global_env[i]);
|
||||
}
|
||||
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
"1",
|
||||
true, &environ);
|
||||
|
||||
/* JJH: Do not setenv(opal_cr_is_tool, 1) here, as it will impact the
|
||||
* JJH: application improperly. */
|
||||
|
||||
/**
|
||||
* Now start parsing our specific arguments
|
||||
*/
|
||||
|
@ -57,6 +57,9 @@
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#endif
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
@ -1130,7 +1133,31 @@ DOFORK:
|
||||
*/
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/*
|
||||
* OPAL CRS components need the opportunity to take action before a process
|
||||
* is forked.
|
||||
* Needs access to:
|
||||
* - Environment
|
||||
* - Rank/ORTE Name
|
||||
* - Binary to exec
|
||||
*/
|
||||
if( NULL != opal_crs.crs_prelaunch ) {
|
||||
if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name->vpid,
|
||||
orte_snapc_base_global_snapshot_loc,
|
||||
&(app->app),
|
||||
&(app->cwd),
|
||||
&(app->argv),
|
||||
&(app_item->environ_copy) ) ) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app_item->environ_copy))) {
|
||||
/* do NOT ERROR_LOG this error - it generates
|
||||
* a message/node as most errors will be common
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -103,7 +105,11 @@ extern "C" {
|
||||
#define orte_snapc_base_metadata_filename (strdup("global_snapshot_meta.data"))
|
||||
|
||||
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_dir;
|
||||
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_ref;
|
||||
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_loc;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
|
||||
ORTE_DECLSPEC extern bool orte_snapc_base_establish_gloabl_snapshot_dir;
|
||||
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
||||
|
||||
|
||||
@ -115,7 +121,8 @@ extern "C" {
|
||||
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
|
||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
|
||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
|
||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name);
|
||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
||||
bool empty_metadata);
|
||||
ORTE_DECLSPEC int orte_snapc_base_get_job_ckpt_info( orte_jobid_t jobid,
|
||||
size_t *ckpt_state,
|
||||
char **ckpt_snapshot_ref,
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -687,8 +689,13 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
|
||||
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
|
||||
{
|
||||
char * uniq_name;
|
||||
|
||||
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
||||
|
||||
if( NULL == orte_snapc_base_global_snapshot_ref ) {
|
||||
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
||||
}
|
||||
else {
|
||||
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
|
||||
}
|
||||
|
||||
return uniq_name;
|
||||
}
|
||||
@ -717,7 +724,7 @@ char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
|
||||
return dir_name;
|
||||
}
|
||||
|
||||
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name)
|
||||
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
|
||||
{
|
||||
char * dir_name = NULL, *meta_data_fname = NULL;
|
||||
mode_t my_mode = S_IRWXU;
|
||||
@ -750,14 +757,21 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
||||
/*
|
||||
* Put in the checkpoint sequence number
|
||||
*/
|
||||
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number);
|
||||
if( empty_metadata ) {
|
||||
fprintf(meta_data, "#\n");
|
||||
}
|
||||
else {
|
||||
/*
|
||||
* Put in the checkpoint sequence number
|
||||
*/
|
||||
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number);
|
||||
|
||||
fclose(meta_data);
|
||||
meta_data = NULL;
|
||||
|
||||
/* Add timestamp */
|
||||
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
|
||||
fclose(meta_data);
|
||||
meta_data = NULL;
|
||||
|
||||
/* Add timestamp */
|
||||
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if(NULL != meta_data)
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -44,7 +46,11 @@ opal_list_t orte_snapc_base_components_available;
|
||||
orte_snapc_base_component_t orte_snapc_base_selected_component;
|
||||
|
||||
char * orte_snapc_base_global_snapshot_dir = NULL;
|
||||
char * orte_snapc_base_global_snapshot_loc = NULL;
|
||||
char * orte_snapc_base_global_snapshot_ref = NULL;
|
||||
bool orte_snapc_base_store_in_place = true;
|
||||
bool orte_snapc_base_store_only_one_seq = false;
|
||||
bool orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components,
|
||||
@ -104,9 +110,65 @@ int orte_snapc_base_open(void)
|
||||
orte_snapc_base_store_in_place = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reuse sequence numbers
|
||||
* This will create a directory and always use seq 0 for all checkpoints
|
||||
* This *should* also enforce a 2-phase commit protocol
|
||||
*/
|
||||
mca_base_param_reg_int_name("snapc_base",
|
||||
"only_one_seq",
|
||||
"Only store the most recent checkpoint sequence. [Default = disabled]",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_base_store_only_one_seq = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_base_store_only_one_seq = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Pre-establish the global snapshot directory upon job registration
|
||||
*/
|
||||
mca_base_param_reg_int_name("snapc_base",
|
||||
"establish_global_snapshot_dir",
|
||||
"Establish the global snapshot directory on job startup. [Default = disabled]",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_base_establish_gloabl_snapshot_dir = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* User defined global snapshot directory name for this job
|
||||
*/
|
||||
mca_base_param_reg_string_name("snapc_base",
|
||||
"global_snapshot_ref",
|
||||
"The global snapshot reference to be used for this job. "
|
||||
" [Default = ompi_global_snapshot_MPIRUNPID.ckpt]",
|
||||
false, false,
|
||||
NULL,
|
||||
&orte_snapc_base_global_snapshot_ref);
|
||||
|
||||
|
||||
/* Init the sequence (interval) number */
|
||||
orte_snapc_base_snapshot_seq_number = 0;
|
||||
|
||||
if( NULL == orte_snapc_base_global_snapshot_loc ) {
|
||||
char *t1 = NULL;
|
||||
char *t2 = NULL;
|
||||
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
||||
t2 = orte_snapc_base_get_global_snapshot_directory( t1 );
|
||||
orte_snapc_base_global_snapshot_loc = strdup(t2);
|
||||
free(t1);
|
||||
free(t2);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Which SnapC component to open
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -51,6 +53,16 @@
|
||||
/************************************
|
||||
* Locally Global vars & functions :)
|
||||
************************************/
|
||||
#define INC_SEQ_NUM() \
|
||||
{ \
|
||||
if(orte_snapc_base_store_only_one_seq) { \
|
||||
orte_snapc_base_snapshot_seq_number = 0; \
|
||||
} else { \
|
||||
orte_snapc_base_snapshot_seq_number++; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
/* RML Callback */
|
||||
static void snapc_full_global_recv(int status,
|
||||
orte_process_name_t* sender,
|
||||
@ -207,6 +219,51 @@ int global_coord_setup_job(orte_jobid_t jobid) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* If requested pre-establish the global snapshot directory
|
||||
*/
|
||||
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
|
||||
char *global_snapshot_handle = NULL;
|
||||
char *global_dir = NULL;
|
||||
|
||||
INC_SEQ_NUM();
|
||||
global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot_handle);
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
|
||||
global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
|
||||
global_snapshot.reference_name = strdup(global_snapshot_handle);
|
||||
global_snapshot.local_location = opal_dirname(orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name));
|
||||
|
||||
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||
"global) Pre-establish the global snapshot directory\n");
|
||||
|
||||
/* Creates the directory (with metadata files):
|
||||
* /tmp/ompi_global_snapshot_PID.ckpt/seq_num
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(global_snapshot_handle, true))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Push this value to the GPR so the orted can pick it up
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||
ORTE_SNAPC_CKPT_STATE_NONE,
|
||||
global_snapshot_handle,
|
||||
global_dir) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
free(global_snapshot_handle);
|
||||
global_snapshot_handle = NULL;
|
||||
|
||||
free(global_dir);
|
||||
global_dir = NULL;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||
"global [%d]) Setup job (%d) with vpid [%d, %d]\n", getpid(), jobid, vpid_start, vpid_range);
|
||||
|
||||
@ -360,7 +417,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
|
||||
/*********************
|
||||
* Generate the global snapshot directory, and unique global snapshot handle
|
||||
*********************/
|
||||
++orte_snapc_base_snapshot_seq_number;
|
||||
INC_SEQ_NUM();
|
||||
*global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
||||
|
||||
global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
|
||||
@ -370,7 +427,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
|
||||
/* Creates the directory (with metadata files):
|
||||
* /tmp/ompi_global_snapshot_PID.ckpt/seq_num
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle))) {
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle, false))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -538,6 +595,8 @@ static void vpid_ckpt_state_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
||||
char * global_dir = NULL;
|
||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(proc->jobid,
|
||||
ORTE_SNAPC_CKPT_STATE_RUNNING,
|
||||
global_snapshot.reference_name,
|
||||
@ -779,6 +838,7 @@ static int snapc_full_global_notify_checkpoint( char * global_snapshot_handle,
|
||||
/*
|
||||
* Update the job global segment
|
||||
*/
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||
ckpt_state,
|
||||
global_snapshot_handle,
|
||||
@ -821,6 +881,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
* Update the job checkpoint state
|
||||
**********************************/
|
||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||
cur_job_ckpt_state,
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -138,7 +140,37 @@ int local_coord_setup_job(orte_jobid_t jobid)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
ret = exit_status;
|
||||
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
|
||||
size_t ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
char * ckpt_snapshot_ref = NULL;
|
||||
char * ckpt_snapshot_loc = NULL;
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_get_job_ckpt_info(jobid,
|
||||
&ckpt_state,
|
||||
&ckpt_snapshot_ref,
|
||||
&ckpt_snapshot_loc) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( NULL != ckpt_snapshot_loc &&
|
||||
(0 != strncmp(ckpt_snapshot_loc, "", strlen(""))) ) {
|
||||
orte_snapc_base_global_snapshot_loc = strdup(ckpt_snapshot_loc);
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||
"local) The global snapshot directory has been established at [%s]\n",
|
||||
orte_snapc_base_global_snapshot_loc);
|
||||
|
||||
if( NULL != ckpt_snapshot_ref ) {
|
||||
free(ckpt_snapshot_ref);
|
||||
ckpt_snapshot_ref = NULL;
|
||||
}
|
||||
if( NULL != ckpt_snapshot_loc ) {
|
||||
free(ckpt_snapshot_loc);
|
||||
ckpt_snapshot_loc = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
|
@ -360,7 +360,14 @@ int orte_daemon(int argc, char *argv[])
|
||||
opal_daemon_init(NULL);
|
||||
}
|
||||
|
||||
/* Intialize OPAL */
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Mark as a tool program */
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
"1",
|
||||
true, &environ);
|
||||
#endif
|
||||
|
||||
/* Initialize OPAL */
|
||||
if (ORTE_SUCCESS != (ret = opal_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
|
@ -393,6 +393,9 @@ int orterun(int argc, char *argv[])
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Disable OPAL CR notifications for this tool */
|
||||
opal_cr_set_enabled(false);
|
||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||
"1",
|
||||
true, &environ);
|
||||
#endif
|
||||
|
||||
/* Initialize our Open RTE environment */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user