Move in some accumulated small features and minor bug fixes for C/R support.
{{{ svn merge -r 16447:16475 https://svn.open-mpi.org/svn/ompi/tmp/jjh-fgs . }}} This commit was SVN r16478.
Этот коммит содержится в:
родитель
c143858998
Коммит
0bf61a1b84
@ -9,6 +9,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -20,6 +22,7 @@
|
|||||||
|
|
||||||
#include "opal_config.h"
|
#include "opal_config.h"
|
||||||
#include "opal/mca/crs/crs.h"
|
#include "opal/mca/crs/crs.h"
|
||||||
|
#include "opal/util/opal_environ.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Global functions for MCA overall CRS
|
* Global functions for MCA overall CRS
|
||||||
@ -88,6 +91,13 @@ extern "C" {
|
|||||||
int opal_crs_base_none_disable_checkpoint(void);
|
int opal_crs_base_none_disable_checkpoint(void);
|
||||||
int opal_crs_base_none_enable_checkpoint(void);
|
int opal_crs_base_none_enable_checkpoint(void);
|
||||||
|
|
||||||
|
int opal_crs_base_none_prelaunch(int32_t rank,
|
||||||
|
char *base_snapshot_dir,
|
||||||
|
char **app,
|
||||||
|
char **cwd,
|
||||||
|
char ***argv,
|
||||||
|
char ***env);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Some utility functions
|
* Some utility functions
|
||||||
*/
|
*/
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -153,12 +155,24 @@ int opal_crs_base_none_enable_checkpoint(void)
|
|||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int opal_crs_base_none_prelaunch(int32_t rank,
|
||||||
|
char *base_snapshot_dir,
|
||||||
|
char **app,
|
||||||
|
char **cwd,
|
||||||
|
char ***argv,
|
||||||
|
char ***env)
|
||||||
|
{
|
||||||
|
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||||
|
"0", true, env);
|
||||||
|
return OPAL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Utility functions
|
* Utility functions
|
||||||
*/
|
*/
|
||||||
char * opal_crs_base_unique_snapshot_name(pid_t pid)
|
char * opal_crs_base_unique_snapshot_name(pid_t pid)
|
||||||
{
|
{
|
||||||
char * loc_str;
|
char * loc_str = NULL;
|
||||||
|
|
||||||
asprintf(&loc_str, "opal_snapshot_%d.ckpt", pid);
|
asprintf(&loc_str, "opal_snapshot_%d.ckpt", pid);
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -77,7 +79,10 @@ static opal_crs_base_module_t none_module = {
|
|||||||
/** Disable checkpoints */
|
/** Disable checkpoints */
|
||||||
opal_crs_base_none_disable_checkpoint,
|
opal_crs_base_none_disable_checkpoint,
|
||||||
/** Enable checkpoints */
|
/** Enable checkpoints */
|
||||||
opal_crs_base_none_enable_checkpoint
|
opal_crs_base_none_enable_checkpoint,
|
||||||
|
|
||||||
|
/** Prelaunch */
|
||||||
|
opal_crs_base_none_prelaunch
|
||||||
};
|
};
|
||||||
|
|
||||||
int opal_crs_base_select(void)
|
int opal_crs_base_select(void)
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -59,7 +61,10 @@ static opal_crs_base_module_t blcr_module = {
|
|||||||
/** Disable checkpoints */
|
/** Disable checkpoints */
|
||||||
opal_crs_blcr_disable_checkpoint,
|
opal_crs_blcr_disable_checkpoint,
|
||||||
/** Enable checkpoints */
|
/** Enable checkpoints */
|
||||||
opal_crs_blcr_enable_checkpoint
|
opal_crs_blcr_enable_checkpoint,
|
||||||
|
|
||||||
|
/** Prelaunch */
|
||||||
|
opal_crs_base_none_prelaunch
|
||||||
};
|
};
|
||||||
|
|
||||||
/***************************
|
/***************************
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -183,7 +185,27 @@ typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)
|
|||||||
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
|
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
|
||||||
(void);
|
(void);
|
||||||
|
|
||||||
|
/**
 * Prepare the CRS component for process launch.
 * Some CRS components need to take action before the
 * process is ever launched to do such things as:
 *  - seed the process environment
 *  - LD_PRELOAD
 *  - Analyze the binary before launch
 *
 * Everything except the rank and the snapshot directory is passed by
 * address (presumably so that a component can substitute its own values
 * before the launch -- confirm against the components).
 *
 * @param rank              Rank of the process to be started
 * @param base_snapshot_dir Snapshot directory associated with the job
 *                          (NOTE(review): exact meaning is defined by the
 *                          caller -- confirm)
 * @param app               Address of the absolute pathname of argv[0]
 * @param cwd               Address of the working directory string for the
 *                          process
 * @param argv              Address of a standard argv-style array,
 *                          including a final NULL pointer
 * @param env               Address of a standard environ-style array,
 *                          including a final NULL pointer
 *
 * @return OPAL_SUCCESS on success; any other value is an error code.
 */
typedef int (*opal_crs_base_module_prelaunch_fn_t)
    (int32_t rank,
     char *base_snapshot_dir,
     char **app,
     char **cwd,
     char ***argv,
     char ***env);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure for CRS v1.0.0 components.
|
* Structure for CRS v1.0.0 components.
|
||||||
*/
|
*/
|
||||||
@ -225,6 +247,9 @@ struct opal_crs_base_module_1_0_0_t {
|
|||||||
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
|
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
|
||||||
/** Enable checkpoints */
|
/** Enable checkpoints */
|
||||||
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;
|
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;
|
||||||
|
|
||||||
|
/** Pre Launch */
|
||||||
|
opal_crs_base_module_prelaunch_fn_t crs_prelaunch;
|
||||||
};
|
};
|
||||||
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t;
|
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t;
|
||||||
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t;
|
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t;
|
||||||
|
@ -8,7 +8,9 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -58,7 +60,10 @@ static opal_crs_base_module_t loc_module = {
|
|||||||
/** Disable checkpoints */
|
/** Disable checkpoints */
|
||||||
opal_crs_self_disable_checkpoint,
|
opal_crs_self_disable_checkpoint,
|
||||||
/** Enable checkpoints */
|
/** Enable checkpoints */
|
||||||
opal_crs_self_enable_checkpoint
|
opal_crs_self_enable_checkpoint,
|
||||||
|
|
||||||
|
/** Prelaunch */
|
||||||
|
opal_crs_base_none_prelaunch
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -181,15 +181,18 @@ int opal_cr_init(void )
|
|||||||
opal_output_verbose(10, opal_cr_output,
|
opal_output_verbose(10, opal_cr_output,
|
||||||
"opal_cr: init: OPAL CR Allow OPAL Only: %d",
|
"opal_cr: init: OPAL CR Allow OPAL Only: %d",
|
||||||
val);
|
val);
|
||||||
|
|
||||||
mca_base_param_reg_int_name("opal_cr", "is_tool",
|
mca_base_param_reg_int_name("opal_cr", "is_tool",
|
||||||
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
|
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
|
||||||
false, false,
|
false, false,
|
||||||
false,
|
0,
|
||||||
&val);
|
&val);
|
||||||
if(!val)
|
if(0 != val) {
|
||||||
opal_cr_is_tool = false;
|
|
||||||
else
|
|
||||||
opal_cr_is_tool = true;
|
opal_cr_is_tool = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
opal_cr_is_tool = false;
|
||||||
|
}
|
||||||
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
opal_output_verbose(10, opal_cr_output,
|
||||||
"opal_cr: init: Is a tool program: %d",
|
"opal_cr: init: Is a tool program: %d",
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
#
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -33,6 +35,9 @@ Error: The filename (%s) is invalid because either you have not provided a filen
|
|||||||
Error: Unable to obtain the proper restart command to restart from the
|
Error: Unable to obtain the proper restart command to restart from the
|
||||||
checkpoint file (%s). Returned %d.
|
checkpoint file (%s). Returned %d.
|
||||||
|
|
||||||
|
[comp_open_failure]
|
||||||
|
Error: Unable to open the %s framework.
|
||||||
|
|
||||||
[comp_select_failure]
|
[comp_select_failure]
|
||||||
Error: Unable to select the %s component needed to restart this
|
Error: Unable to select the %s component needed to restart this
|
||||||
application. (Returned %d)
|
application. (Returned %d)
|
||||||
|
@ -11,6 +11,8 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -197,6 +199,13 @@ main(int argc, char *argv[])
|
|||||||
* restart on this node because it doesn't have the proper checkpointer
|
* restart on this node because it doesn't have the proper checkpointer
|
||||||
* available.
|
* available.
|
||||||
*/
|
*/
|
||||||
|
/* The CRS framework has to be opened before a component can be selected.
 * A failure here is reported with the dedicated "comp_open_failure" help
 * topic (which takes only the framework name) rather than the selection
 * failure message. */
if( OPAL_SUCCESS != (ret = opal_crs_base_open()) ) {
    opal_show_help("help-opal-restart.txt", "comp_open_failure", true,
                   "crs");
    exit_status = ret;
    goto cleanup;
}
|
||||||
|
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
|
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
|
||||||
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
|
||||||
expected_crs_comp, ret);
|
expected_crs_comp, ret);
|
||||||
@ -233,9 +242,8 @@ main(int argc, char *argv[])
|
|||||||
"\t Exec in self");
|
"\t Exec in self");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We are launching a program that is not going to be a tool :) */
|
/* JJH: Do not unsetenv(opal_cr_is_tool) here, as it will impact the
|
||||||
opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"),
|
* JJH: application improperly. */
|
||||||
&environ);
|
|
||||||
|
|
||||||
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
||||||
snapshot->cold_start = true;
|
snapshot->cold_start = true;
|
||||||
@ -381,10 +389,10 @@ static int parse_args(int argc, char *argv[])
|
|||||||
for(i = 0; i < len; ++i) {
|
for(i = 0; i < len; ++i) {
|
||||||
putenv(global_env[i]);
|
putenv(global_env[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
/* JJH: Do not setenv(opal_cr_is_tool, 1) here, as it will impact the
|
||||||
"1",
|
* JJH: application improperly. */
|
||||||
true, &environ);
|
|
||||||
/**
|
/**
|
||||||
* Now start parsing our specific arguments
|
* Now start parsing our specific arguments
|
||||||
*/
|
*/
|
||||||
|
@ -57,6 +57,9 @@
|
|||||||
|
|
||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
#include "orte/mca/snapc/snapc.h"
|
#include "orte/mca/snapc/snapc.h"
|
||||||
|
#include "orte/mca/snapc/base/base.h"
|
||||||
|
#include "opal/mca/crs/crs.h"
|
||||||
|
#include "opal/mca/crs/base/base.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "orte/mca/odls/base/odls_private.h"
|
#include "orte/mca/odls/base/odls_private.h"
|
||||||
@ -1130,7 +1133,31 @@ DOFORK:
|
|||||||
*/
|
*/
|
||||||
opal_condition_signal(&orte_odls_globals.cond);
|
opal_condition_signal(&orte_odls_globals.cond);
|
||||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||||
|
|
||||||
|
#if OPAL_ENABLE_FT == 1
|
||||||
|
#if OPAL_ENABLE_FT_CR == 1
|
||||||
|
/*
|
||||||
|
* OPAL CRS components need the opportunity to take action before a process
|
||||||
|
* is forked.
|
||||||
|
* Needs access to:
|
||||||
|
* - Environment
|
||||||
|
* - Rank/ORTE Name
|
||||||
|
* - Binary to exec
|
||||||
|
*/
|
||||||
|
if( NULL != opal_crs.crs_prelaunch ) {
|
||||||
|
if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name->vpid,
|
||||||
|
orte_snapc_base_global_snapshot_loc,
|
||||||
|
&(app->app),
|
||||||
|
&(app->cwd),
|
||||||
|
&(app->argv),
|
||||||
|
&(app_item->environ_copy) ) ) ) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app_item->environ_copy))) {
|
if (ORTE_SUCCESS != (rc = fork_local(app, child, app_item->environ_copy))) {
|
||||||
/* do NOT ERROR_LOG this error - it generates
|
/* do NOT ERROR_LOG this error - it generates
|
||||||
* a message/node as most errors will be common
|
* a message/node as most errors will be common
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -103,7 +105,11 @@ extern "C" {
|
|||||||
#define orte_snapc_base_metadata_filename (strdup("global_snapshot_meta.data"))
|
#define orte_snapc_base_metadata_filename (strdup("global_snapshot_meta.data"))
|
||||||
|
|
||||||
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_dir;
|
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_dir;
|
||||||
|
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_ref;
|
||||||
|
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_loc;
|
||||||
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
|
||||||
|
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
|
||||||
|
ORTE_DECLSPEC extern bool orte_snapc_base_establish_gloabl_snapshot_dir;
|
||||||
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
|
||||||
|
|
||||||
|
|
||||||
@ -115,7 +121,8 @@ extern "C" {
|
|||||||
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
|
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
|
||||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
|
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
|
||||||
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
|
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name);
|
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
|
||||||
|
bool empty_metadata);
|
||||||
ORTE_DECLSPEC int orte_snapc_base_get_job_ckpt_info( orte_jobid_t jobid,
|
ORTE_DECLSPEC int orte_snapc_base_get_job_ckpt_info( orte_jobid_t jobid,
|
||||||
size_t *ckpt_state,
|
size_t *ckpt_state,
|
||||||
char **ckpt_snapshot_ref,
|
char **ckpt_snapshot_ref,
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -687,8 +689,13 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
|
|||||||
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
|
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
|
||||||
{
|
{
|
||||||
char * uniq_name;
|
char * uniq_name;
|
||||||
|
|
||||||
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
if( NULL == orte_snapc_base_global_snapshot_ref ) {
|
||||||
|
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
|
||||||
|
}
|
||||||
|
|
||||||
return uniq_name;
|
return uniq_name;
|
||||||
}
|
}
|
||||||
@ -717,7 +724,7 @@ char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
|
|||||||
return dir_name;
|
return dir_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name)
|
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
|
||||||
{
|
{
|
||||||
char * dir_name = NULL, *meta_data_fname = NULL;
|
char * dir_name = NULL, *meta_data_fname = NULL;
|
||||||
mode_t my_mode = S_IRWXU;
|
mode_t my_mode = S_IRWXU;
|
||||||
@ -750,14 +757,21 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
|
|||||||
/*
|
/*
|
||||||
* Put in the checkpoint sequence number
|
* Put in the checkpoint sequence number
|
||||||
*/
|
*/
|
||||||
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number);
|
if( empty_metadata ) {
|
||||||
|
fprintf(meta_data, "#\n");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/*
|
||||||
|
* Put in the checkpoint sequence number
|
||||||
|
*/
|
||||||
|
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number);
|
||||||
|
|
||||||
fclose(meta_data);
|
fclose(meta_data);
|
||||||
meta_data = NULL;
|
meta_data = NULL;
|
||||||
|
|
||||||
/* Add timestamp */
|
|
||||||
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
|
|
||||||
|
|
||||||
|
/* Add timestamp */
|
||||||
|
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
|
||||||
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data)
|
if(NULL != meta_data)
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -44,7 +46,11 @@ opal_list_t orte_snapc_base_components_available;
|
|||||||
orte_snapc_base_component_t orte_snapc_base_selected_component;
|
orte_snapc_base_component_t orte_snapc_base_selected_component;
|
||||||
|
|
||||||
char * orte_snapc_base_global_snapshot_dir = NULL;
|
char * orte_snapc_base_global_snapshot_dir = NULL;
|
||||||
|
char * orte_snapc_base_global_snapshot_loc = NULL;
|
||||||
|
char * orte_snapc_base_global_snapshot_ref = NULL;
|
||||||
bool orte_snapc_base_store_in_place = true;
|
bool orte_snapc_base_store_in_place = true;
|
||||||
|
bool orte_snapc_base_store_only_one_seq = false;
|
||||||
|
bool orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function for finding and opening either all MCA components,
|
* Function for finding and opening either all MCA components,
|
||||||
@ -104,9 +110,65 @@ int orte_snapc_base_open(void)
|
|||||||
orte_snapc_base_store_in_place = false;
|
orte_snapc_base_store_in_place = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reuse sequence numbers
|
||||||
|
* This will create a directory and always use seq 0 for all checkpoints
|
||||||
|
* This *should* also enforce a 2-phase commit protocol
|
||||||
|
*/
|
||||||
|
mca_base_param_reg_int_name("snapc_base",
|
||||||
|
"only_one_seq",
|
||||||
|
"Only store the most recent checkpoint sequence. [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
0,
|
||||||
|
&value);
|
||||||
|
if( 0 != value ) { /* Enabled */
|
||||||
|
orte_snapc_base_store_only_one_seq = true;
|
||||||
|
}
|
||||||
|
else { /* Disabled */
|
||||||
|
orte_snapc_base_store_only_one_seq = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pre-establish the global snapshot directory upon job registration
|
||||||
|
*/
|
||||||
|
mca_base_param_reg_int_name("snapc_base",
|
||||||
|
"establish_global_snapshot_dir",
|
||||||
|
"Establish the global snapshot directory on job startup. [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
0,
|
||||||
|
&value);
|
||||||
|
if( 0 != value ) { /* Enabled */
|
||||||
|
orte_snapc_base_establish_gloabl_snapshot_dir = true;
|
||||||
|
}
|
||||||
|
else { /* Disabled */
|
||||||
|
orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* User defined global snapshot directory name for this job
|
||||||
|
*/
|
||||||
|
mca_base_param_reg_string_name("snapc_base",
|
||||||
|
"global_snapshot_ref",
|
||||||
|
"The global snapshot reference to be used for this job. "
|
||||||
|
" [Default = ompi_global_snapshot_MPIRUNPID.ckpt]",
|
||||||
|
false, false,
|
||||||
|
NULL,
|
||||||
|
&orte_snapc_base_global_snapshot_ref);
|
||||||
|
|
||||||
|
|
||||||
/* Init the sequence (interval) number */
|
/* Init the sequence (interval) number */
|
||||||
orte_snapc_base_snapshot_seq_number = 0;
|
orte_snapc_base_snapshot_seq_number = 0;
|
||||||
|
|
||||||
|
/* Seed the global snapshot location with the default directory for this
 * process if nothing has set it yet. */
if( NULL == orte_snapc_base_global_snapshot_loc ) {
    /* Both helpers return heap strings owned by the caller: the name is
     * released once the directory has been derived from it, and the
     * directory string itself becomes the global value (no extra copy). */
    char *snapshot_name = orte_snapc_base_unique_global_snapshot_name( getpid() );

    orte_snapc_base_global_snapshot_loc =
        orte_snapc_base_get_global_snapshot_directory( snapshot_name );

    free(snapshot_name);
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Which SnapC component to open
|
* Which SnapC component to open
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -51,6 +53,16 @@
|
|||||||
/************************************
|
/************************************
|
||||||
* Locally Global vars & functions :)
|
* Locally Global vars & functions :)
|
||||||
************************************/
|
************************************/
|
||||||
|
/*
 * Advance the global checkpoint sequence number.
 * When only one sequence is kept (orte_snapc_base_store_only_one_seq) the
 * number is pinned to 0, otherwise it is incremented.
 * Wrapped in do { } while (0) so that "INC_SEQ_NUM();" is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define INC_SEQ_NUM()                                   \
    do {                                                \
        if(orte_snapc_base_store_only_one_seq) {        \
            orte_snapc_base_snapshot_seq_number = 0;    \
        } else {                                        \
            orte_snapc_base_snapshot_seq_number++;      \
        }                                               \
    } while (0)
|
||||||
|
|
||||||
|
|
||||||
/* RML Callback */
|
/* RML Callback */
|
||||||
static void snapc_full_global_recv(int status,
|
static void snapc_full_global_recv(int status,
|
||||||
orte_process_name_t* sender,
|
orte_process_name_t* sender,
|
||||||
@ -207,6 +219,51 @@ int global_coord_setup_job(orte_jobid_t jobid) {
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If requested pre-establish the global snapshot directory
|
||||||
|
*/
|
||||||
|
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
    char *global_snapshot_handle = NULL;
    char *global_dir = NULL;

    INC_SEQ_NUM();

    /* The name helper returns a heap string that we own; use it directly
     * instead of duplicating it (the duplicate left the original leaked). */
    global_snapshot_handle = orte_snapc_base_unique_global_snapshot_name( getpid() );
    global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot_handle);
    orte_snapc_base_global_snapshot_loc = strdup(global_dir);

    global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
    global_snapshot.reference_name = strdup(global_snapshot_handle);
    /* reference_name is a copy of the handle, so its directory is
     * global_dir; reuse it rather than building a second string that is
     * never freed.
     * NOTE(review): assumes opal_dirname() neither modifies nor retains
     * its argument -- confirm. */
    global_snapshot.local_location = opal_dirname(global_dir);

    opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
                        "global) Pre-establish the global snapshot directory\n");

    /* Creates the directory (with metadata files):
     *   /tmp/ompi_global_snapshot_PID.ckpt/seq_num
     */
    ret = orte_snapc_base_init_global_snapshot_directory(global_snapshot_handle, true);

    /*
     * Push this value to the GPR so the orted can pick it up
     */
    if( ORTE_SUCCESS == ret ) {
        ret = orte_snapc_base_set_job_ckpt_info(jobid,
                                                ORTE_SNAPC_CKPT_STATE_NONE,
                                                global_snapshot_handle,
                                                global_dir);
    }

    /* Release the temporaries on every path, including the error exits */
    free(global_snapshot_handle);
    global_snapshot_handle = NULL;

    free(global_dir);
    global_dir = NULL;

    if( ORTE_SUCCESS != ret ) {
        exit_status = ret;
        goto cleanup;
    }
}
|
||||||
|
|
||||||
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||||
"global [%d]) Setup job (%d) with vpid [%d, %d]\n", getpid(), jobid, vpid_start, vpid_range);
|
"global [%d]) Setup job (%d) with vpid [%d, %d]\n", getpid(), jobid, vpid_start, vpid_range);
|
||||||
|
|
||||||
@ -360,7 +417,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
|
|||||||
/*********************
|
/*********************
|
||||||
* Generate the global snapshot directory, and unique global snapshot handle
|
* Generate the global snapshot directory, and unique global snapshot handle
|
||||||
*********************/
|
*********************/
|
||||||
++orte_snapc_base_snapshot_seq_number;
|
INC_SEQ_NUM();
|
||||||
*global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
*global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
||||||
|
|
||||||
global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
|
global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
|
||||||
@ -370,7 +427,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
|
|||||||
/* Creates the directory (with metadata files):
|
/* Creates the directory (with metadata files):
|
||||||
* /tmp/ompi_global_snapshot_PID.ckpt/seq_num
|
* /tmp/ompi_global_snapshot_PID.ckpt/seq_num
|
||||||
*/
|
*/
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle))) {
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle, false))) {
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -538,6 +595,8 @@ static void vpid_ckpt_state_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
|||||||
char * global_dir = NULL;
|
char * global_dir = NULL;
|
||||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||||
|
|
||||||
|
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||||
|
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(proc->jobid,
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(proc->jobid,
|
||||||
ORTE_SNAPC_CKPT_STATE_RUNNING,
|
ORTE_SNAPC_CKPT_STATE_RUNNING,
|
||||||
global_snapshot.reference_name,
|
global_snapshot.reference_name,
|
||||||
@ -779,6 +838,7 @@ static int snapc_full_global_notify_checkpoint( char * global_snapshot_handle,
|
|||||||
/*
|
/*
|
||||||
* Update the job global segment
|
* Update the job global segment
|
||||||
*/
|
*/
|
||||||
|
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||||
ckpt_state,
|
ckpt_state,
|
||||||
global_snapshot_handle,
|
global_snapshot_handle,
|
||||||
@ -821,6 +881,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
|||||||
* Update the job checkpoint state
|
* Update the job checkpoint state
|
||||||
**********************************/
|
**********************************/
|
||||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||||
|
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||||
|
|
||||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||||
cur_job_ckpt_state,
|
cur_job_ckpt_state,
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||||
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -138,7 +140,37 @@ int local_coord_setup_job(orte_jobid_t jobid)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = exit_status;
|
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
|
||||||
|
size_t ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||||
|
char * ckpt_snapshot_ref = NULL;
|
||||||
|
char * ckpt_snapshot_loc = NULL;
|
||||||
|
|
||||||
|
if( ORTE_SUCCESS != (ret = orte_snapc_base_get_job_ckpt_info(jobid,
|
||||||
|
&ckpt_state,
|
||||||
|
&ckpt_snapshot_ref,
|
||||||
|
&ckpt_snapshot_loc) ) ) {
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != ckpt_snapshot_loc &&
|
||||||
|
(0 != strncmp(ckpt_snapshot_loc, "", strlen(""))) ) {
|
||||||
|
orte_snapc_base_global_snapshot_loc = strdup(ckpt_snapshot_loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||||
|
"local) The global snapshot directory has been established at [%s]\n",
|
||||||
|
orte_snapc_base_global_snapshot_loc);
|
||||||
|
|
||||||
|
if( NULL != ckpt_snapshot_ref ) {
|
||||||
|
free(ckpt_snapshot_ref);
|
||||||
|
ckpt_snapshot_ref = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != ckpt_snapshot_loc ) {
|
||||||
|
free(ckpt_snapshot_loc);
|
||||||
|
ckpt_snapshot_loc = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
return exit_status;
|
return exit_status;
|
||||||
|
@ -360,7 +360,14 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
opal_daemon_init(NULL);
|
opal_daemon_init(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Intialize OPAL */
|
#if OPAL_ENABLE_FT == 1
|
||||||
|
/* Mark as a tool program */
|
||||||
|
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||||
|
"1",
|
||||||
|
true, &environ);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Initialize OPAL */
|
||||||
if (ORTE_SUCCESS != (ret = opal_init())) {
|
if (ORTE_SUCCESS != (ret = opal_init())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -393,6 +393,9 @@ int orterun(int argc, char *argv[])
|
|||||||
#if OPAL_ENABLE_FT == 1
|
#if OPAL_ENABLE_FT == 1
|
||||||
/* Disable OPAL CR notifications for this tool */
|
/* Disable OPAL CR notifications for this tool */
|
||||||
opal_cr_set_enabled(false);
|
opal_cr_set_enabled(false);
|
||||||
|
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
||||||
|
"1",
|
||||||
|
true, &environ);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Intialize our Open RTE environment */
|
/* Intialize our Open RTE environment */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user