1
1

Move in some accumulated small features and minor bug fixes for C/R support.

{{{
svn merge -r 16447:16475 https://svn.open-mpi.org/svn/ompi/tmp/jjh-fgs .
}}}

This commit was SVN r16478.
Этот коммит содержится в:
Josh Hursey 2007-10-17 13:47:36 +00:00
родитель c143858998
Коммит 0bf61a1b84
19 изменённых файлов: 329 добавлений и 32 удалений

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -20,6 +22,7 @@
#include "opal_config.h" #include "opal_config.h"
#include "opal/mca/crs/crs.h" #include "opal/mca/crs/crs.h"
#include "opal/util/opal_environ.h"
/* /*
* Global functions for MCA overall CRS * Global functions for MCA overall CRS
@ -88,6 +91,13 @@ extern "C" {
int opal_crs_base_none_disable_checkpoint(void); int opal_crs_base_none_disable_checkpoint(void);
int opal_crs_base_none_enable_checkpoint(void); int opal_crs_base_none_enable_checkpoint(void);
/*
 * Prelaunch hook for the "none" CRS module; the blcr and self module
 * tables shown in this change also install it as their prelaunch entry.
 * Its definition sets the "opal_cr_is_tool" MCA parameter to "0" in the
 * environment array passed via env and returns OPAL_SUCCESS; the other
 * arguments are accepted only to match opal_crs_base_module_prelaunch_fn_t.
 */
int opal_crs_base_none_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
/** /**
* Some utility functions * Some utility functions
*/ */

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -153,12 +155,24 @@ int opal_crs_base_none_enable_checkpoint(void)
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
/*
 * Default prelaunch hook.
 *
 * Marks the process about to be launched as a regular application (not a
 * tool) by setting the "opal_cr_is_tool" MCA parameter to "0" in the
 * supplied environment array.
 *
 * @param rank              Unused here.
 * @param base_snapshot_dir Unused here.
 * @param app               Unused here.
 * @param cwd               Unused here.
 * @param argv              Unused here.
 * @param env               Environ-style array that receives the variable.
 *
 * @return OPAL_SUCCESS always; the result of opal_setenv() is deliberately
 *         not propagated, as in the original implementation.
 */
int opal_crs_base_none_prelaunch(int32_t rank,
                                 char *base_snapshot_dir,
                                 char **app,
                                 char **cwd,
                                 char ***argv,
                                 char ***env)
{
    char *is_tool_var = NULL;

    /* mca_base_param_env_var() returns a heap string owned by the caller,
     * so keep the pointer in order to release it after use. */
    is_tool_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(is_tool_var, "0", true, env);
    free(is_tool_var);
    is_tool_var = NULL;

    return OPAL_SUCCESS;
}
/* /*
* Utility functions * Utility functions
*/ */
char * opal_crs_base_unique_snapshot_name(pid_t pid) char * opal_crs_base_unique_snapshot_name(pid_t pid)
{ {
char * loc_str; char * loc_str = NULL;
asprintf(&loc_str, "opal_snapshot_%d.ckpt", pid); asprintf(&loc_str, "opal_snapshot_%d.ckpt", pid);

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -77,7 +79,10 @@ static opal_crs_base_module_t none_module = {
/** Disable checkpoints */ /** Disable checkpoints */
opal_crs_base_none_disable_checkpoint, opal_crs_base_none_disable_checkpoint,
/** Enable checkpoints */ /** Enable checkpoints */
opal_crs_base_none_enable_checkpoint opal_crs_base_none_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch
}; };
int opal_crs_base_select(void) int opal_crs_base_select(void)

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -59,7 +61,10 @@ static opal_crs_base_module_t blcr_module = {
/** Disable checkpoints */ /** Disable checkpoints */
opal_crs_blcr_disable_checkpoint, opal_crs_blcr_disable_checkpoint,
/** Enable checkpoints */ /** Enable checkpoints */
opal_crs_blcr_enable_checkpoint opal_crs_blcr_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch
}; };
/*************************** /***************************

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -183,7 +185,27 @@ typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t) typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)
(void); (void);
/**
* Prepare the CRS component for process launch.
* Some CRS components need to take action before the
* process is ever launched to do such things as:
* - seed the process environment
* - LD_PRELOAD
* - Analyze the binary before launch
*
* All arguments except rank and base_snapshot_dir are passed by address,
* presumably so that a component may replace them before the launch
* (TODO confirm against the components that implement this hook).
*
* @param rank Rank of the process to be started
* @param base_snapshot_dir Snapshot location for the job; the ODLS caller
*        passes the global snapshot location here (NOTE(review): confirm
*        the exact meaning expected by each component)
* @param app Absolute pathname of argv[0]
* @param cwd Presumably the working directory the process is started in
*        (the ODLS caller passes the application context's cwd)
* @param argv Standard argv-style array, including a final NULL pointer
* @param env Standard environ-style array, including a final NULL pointer
*
* @return OPAL_SUCCESS on success; the caller treats any other value as
*         an error and aborts the launch.
*/
typedef int (*opal_crs_base_module_prelaunch_fn_t)
(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
/** /**
* Structure for CRS v1.0.0 components. * Structure for CRS v1.0.0 components.
*/ */
@ -225,6 +247,9 @@ struct opal_crs_base_module_1_0_0_t {
opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint; opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;
/** Enable checkpoints */ /** Enable checkpoints */
opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint; opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;
/** Pre Launch */
opal_crs_base_module_prelaunch_fn_t crs_prelaunch;
}; };
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t; typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_1_0_0_t;
typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t; typedef struct opal_crs_base_module_1_0_0_t opal_crs_base_module_t;

Просмотреть файл

@ -8,7 +8,9 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -58,7 +60,10 @@ static opal_crs_base_module_t loc_module = {
/** Disable checkpoints */ /** Disable checkpoints */
opal_crs_self_disable_checkpoint, opal_crs_self_disable_checkpoint,
/** Enable checkpoints */ /** Enable checkpoints */
opal_crs_self_enable_checkpoint opal_crs_self_enable_checkpoint,
/** Prelaunch */
opal_crs_base_none_prelaunch
}; };
/* /*

Просмотреть файл

@ -181,15 +181,18 @@ int opal_cr_init(void )
opal_output_verbose(10, opal_cr_output, opal_output_verbose(10, opal_cr_output,
"opal_cr: init: OPAL CR Allow OPAL Only: %d", "opal_cr: init: OPAL CR Allow OPAL Only: %d",
val); val);
mca_base_param_reg_int_name("opal_cr", "is_tool", mca_base_param_reg_int_name("opal_cr", "is_tool",
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
false, false, false, false,
false, 0,
&val); &val);
if(!val) if(0 != val) {
opal_cr_is_tool = false;
else
opal_cr_is_tool = true; opal_cr_is_tool = true;
}
else {
opal_cr_is_tool = false;
}
opal_output_verbose(10, opal_cr_output, opal_output_verbose(10, opal_cr_output,
"opal_cr: init: Is a tool program: %d", "opal_cr: init: Is a tool program: %d",

Просмотреть файл

@ -10,6 +10,8 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2007 Evergrid, Inc. All rights reserved.
#
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -33,6 +35,9 @@ Error: The filename (%s) is invalid because either you have not provided a filen
Error: Unable to obtain the proper restart command to restart from the Error: Unable to obtain the proper restart command to restart from the
checkpoint file (%s). Returned %d. checkpoint file (%s). Returned %d.
[comp_open_failure]
Error: Unable to open the %s framework.
[comp_select_failure] [comp_select_failure]
Error: Unable to select the %s component needed to restart this Error: Unable to select the %s component needed to restart this
application. (Returned %d) application. (Returned %d)

Просмотреть файл

@ -11,6 +11,8 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -197,6 +199,13 @@ main(int argc, char *argv[])
* restart on this node because it doesn't have the proper checkpointer * restart on this node because it doesn't have the proper checkpointer
* available. * available.
*/ */
/*
 * The CRS framework must be opened before a component can be selected.
 * Report a failure here with the framework-open help topic, whose message
 * takes only the framework name, rather than the component-selection one.
 */
if( OPAL_SUCCESS != (ret = opal_crs_base_open()) ) {
    opal_show_help("help-opal-restart.txt", "comp_open_failure", true,
                   "crs");
    exit_status = ret;
    goto cleanup;
}
if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) { if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
opal_show_help("help-opal-restart.txt", "comp_select_failure", true, opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
expected_crs_comp, ret); expected_crs_comp, ret);
@ -233,9 +242,8 @@ main(int argc, char *argv[])
"\t Exec in self"); "\t Exec in self");
} }
/* We are launching a program that is not going to be a tool :) */ /* JJH: Do not unsetenv(opal_cr_is_tool) here, as it will impact the
opal_unsetenv(mca_base_param_env_var("opal_cr_is_tool"), * JJH: application improperly. */
&environ);
snapshot = OBJ_NEW(opal_crs_base_snapshot_t); snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
snapshot->cold_start = true; snapshot->cold_start = true;
@ -381,10 +389,10 @@ static int parse_args(int argc, char *argv[])
for(i = 0; i < len; ++i) { for(i = 0; i < len; ++i) {
putenv(global_env[i]); putenv(global_env[i]);
} }
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"), /* JJH: Do not setenv(opal_cr_is_tool, 1) here, as it will impact the
"1", * JJH: application improperly. */
true, &environ);
/** /**
* Now start parsing our specific arguments * Now start parsing our specific arguments
*/ */

Просмотреть файл

@ -57,6 +57,9 @@
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#endif #endif
#include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/base/odls_private.h"
@ -1130,7 +1133,31 @@ DOFORK:
*/ */
opal_condition_signal(&orte_odls_globals.cond); opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
#if OPAL_ENABLE_FT == 1
#if OPAL_ENABLE_FT_CR == 1
/*
* OPAL CRS components need the opportunity to take action before a process
* is forked.
* Needs access to:
* - Environment
* - Rank/ORTE Name
* - Binary to exec
*/
if( NULL != opal_crs.crs_prelaunch ) {
if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name->vpid,
orte_snapc_base_global_snapshot_loc,
&(app->app),
&(app->cwd),
&(app->argv),
&(app_item->environ_copy) ) ) ) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
}
#endif
#endif
if (ORTE_SUCCESS != (rc = fork_local(app, child, app_item->environ_copy))) { if (ORTE_SUCCESS != (rc = fork_local(app, child, app_item->environ_copy))) {
/* do NOT ERROR_LOG this error - it generates /* do NOT ERROR_LOG this error - it generates
* a message/node as most errors will be common * a message/node as most errors will be common

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -103,7 +105,11 @@ extern "C" {
#define orte_snapc_base_metadata_filename (strdup("global_snapshot_meta.data")) #define orte_snapc_base_metadata_filename (strdup("global_snapshot_meta.data"))
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_dir; ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_dir;
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_ref;
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_loc;
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place; ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
ORTE_DECLSPEC extern bool orte_snapc_base_establish_gloabl_snapshot_dir;
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number; ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;
@ -115,7 +121,8 @@ extern "C" {
ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid); ORTE_DECLSPEC char * orte_snapc_base_unique_global_snapshot_name(pid_t pid);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name); ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_metadata_file(char *uniq_snapshot_name);
ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name); ORTE_DECLSPEC char * orte_snapc_base_get_global_snapshot_directory(char *uniq_global_snapshot_name);
ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name); ORTE_DECLSPEC int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name,
bool empty_metadata);
ORTE_DECLSPEC int orte_snapc_base_get_job_ckpt_info( orte_jobid_t jobid, ORTE_DECLSPEC int orte_snapc_base_get_job_ckpt_info( orte_jobid_t jobid,
size_t *ckpt_state, size_t *ckpt_state,
char **ckpt_snapshot_ref, char **ckpt_snapshot_ref,

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -687,8 +689,13 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
char * orte_snapc_base_unique_global_snapshot_name(pid_t pid) char * orte_snapc_base_unique_global_snapshot_name(pid_t pid)
{ {
char * uniq_name; char * uniq_name;
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid); if( NULL == orte_snapc_base_global_snapshot_ref ) {
asprintf(&uniq_name, "ompi_global_snapshot_%d.ckpt", pid);
}
else {
uniq_name = strdup(orte_snapc_base_global_snapshot_ref);
}
return uniq_name; return uniq_name;
} }
@ -717,7 +724,7 @@ char * orte_snapc_base_get_global_snapshot_directory(char *uniq_snapshot_name)
return dir_name; return dir_name;
} }
int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name) int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_name, bool empty_metadata)
{ {
char * dir_name = NULL, *meta_data_fname = NULL; char * dir_name = NULL, *meta_data_fname = NULL;
mode_t my_mode = S_IRWXU; mode_t my_mode = S_IRWXU;
@ -750,14 +757,21 @@ int orte_snapc_base_init_global_snapshot_directory(char *uniq_global_snapshot_na
/* /*
* Put in the checkpoint sequence number * Put in the checkpoint sequence number
*/ */
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number); if( empty_metadata ) {
fprintf(meta_data, "#\n");
}
else {
/*
* Put in the checkpoint sequence number
*/
fprintf(meta_data, "#\n%s%d\n", SNAPC_METADATA_SEQ, (int)orte_snapc_base_snapshot_seq_number);
fclose(meta_data); fclose(meta_data);
meta_data = NULL; meta_data = NULL;
/* Add timestamp */
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
/* Add timestamp */
orte_snapc_base_add_timestamp(uniq_global_snapshot_name);
}
cleanup: cleanup:
if(NULL != meta_data) if(NULL != meta_data)

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -44,7 +46,11 @@ opal_list_t orte_snapc_base_components_available;
orte_snapc_base_component_t orte_snapc_base_selected_component; orte_snapc_base_component_t orte_snapc_base_selected_component;
char * orte_snapc_base_global_snapshot_dir = NULL; char * orte_snapc_base_global_snapshot_dir = NULL;
char * orte_snapc_base_global_snapshot_loc = NULL;
char * orte_snapc_base_global_snapshot_ref = NULL;
bool orte_snapc_base_store_in_place = true; bool orte_snapc_base_store_in_place = true;
bool orte_snapc_base_store_only_one_seq = false;
bool orte_snapc_base_establish_gloabl_snapshot_dir = false;
/** /**
* Function for finding and opening either all MCA components, * Function for finding and opening either all MCA components,
@ -104,9 +110,65 @@ int orte_snapc_base_open(void)
orte_snapc_base_store_in_place = false; orte_snapc_base_store_in_place = false;
} }
/*
* Reuse sequence numbers
* This will create a directory and always use seq 0 for all checkpoints
* This *should* also enforce a 2-phase commit protocol
*/
mca_base_param_reg_int_name("snapc_base",
"only_one_seq",
"Only store the most recent checkpoint sequence. [Default = disabled]",
false, false,
0,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_base_store_only_one_seq = true;
}
else { /* Disabled */
orte_snapc_base_store_only_one_seq = false;
}
/*
* Pre-establish the global snapshot directory upon job registration
*/
mca_base_param_reg_int_name("snapc_base",
"establish_global_snapshot_dir",
"Establish the global snapshot directory on job startup. [Default = disabled]",
false, false,
0,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_base_establish_gloabl_snapshot_dir = true;
}
else { /* Disabled */
orte_snapc_base_establish_gloabl_snapshot_dir = false;
}
/*
* User defined global snapshot directory name for this job
*/
mca_base_param_reg_string_name("snapc_base",
"global_snapshot_ref",
"The global snapshot reference to be used for this job. "
" [Default = ompi_global_snapshot_MPIRUNPID.ckpt]",
false, false,
NULL,
&orte_snapc_base_global_snapshot_ref);
/* Init the sequence (interval) number */ /* Init the sequence (interval) number */
orte_snapc_base_snapshot_seq_number = 0; orte_snapc_base_snapshot_seq_number = 0;
/*
 * Default the global snapshot location to the directory derived from this
 * process's unique snapshot name, unless a location was already set.
 */
if( NULL == orte_snapc_base_global_snapshot_loc ) {
    /* The name helper already returns a heap string (asprintf/strdup), so
     * take ownership of it directly instead of duplicating and leaking it. */
    char *snapshot_name = orte_snapc_base_unique_global_snapshot_name( getpid() );

    /* The directory helper's result is caller-owned as well; keep it as
     * the global value rather than copying it and freeing the original. */
    orte_snapc_base_global_snapshot_loc =
        orte_snapc_base_get_global_snapshot_directory( snapshot_name );

    free(snapshot_name);
}
/* /*
* Which SnapC component to open * Which SnapC component to open

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -51,6 +53,16 @@
/************************************ /************************************
* Locally Global vars & functions :) * Locally Global vars & functions :)
************************************/ ************************************/
/*
 * Advance the global checkpoint sequence number.
 *
 * When orte_snapc_base_store_only_one_seq is set, the sequence number is
 * pinned to 0 so every checkpoint reuses the same slot; otherwise it is
 * incremented by one.
 *
 * Wrapped in do { } while (0) so that "INC_SEQ_NUM();" is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define INC_SEQ_NUM()                                    \
    do {                                                 \
        if(orte_snapc_base_store_only_one_seq) {         \
            orte_snapc_base_snapshot_seq_number = 0;     \
        } else {                                         \
            orte_snapc_base_snapshot_seq_number++;       \
        }                                                \
    } while (0)
/* RML Callback */ /* RML Callback */
static void snapc_full_global_recv(int status, static void snapc_full_global_recv(int status,
orte_process_name_t* sender, orte_process_name_t* sender,
@ -207,6 +219,51 @@ int global_coord_setup_job(orte_jobid_t jobid) {
goto cleanup; goto cleanup;
} }
/*
* If requested pre-establish the global snapshot directory
*/
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
    char *global_snapshot_handle = NULL;
    char *global_dir = NULL;

    INC_SEQ_NUM();

    /* The name helper returns a heap string (asprintf/strdup); own it
     * directly instead of duplicating it and leaking the original. */
    global_snapshot_handle = orte_snapc_base_unique_global_snapshot_name( getpid() );
    global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot_handle);

    orte_snapc_base_global_snapshot_loc = strdup(global_dir);

    global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
    global_snapshot.reference_name = strdup(global_snapshot_handle);
    /* reference_name is a copy of the handle, so global_dir is reused
     * here rather than asking for (and leaking) a second directory string.
     * NOTE(review): assumes the directory helper depends only on the name
     * it is given -- confirm. */
    global_snapshot.local_location = opal_dirname(global_dir);

    opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
                        "global) Pre-establish the global snapshot directory\n");

    /* Creates the directory (with metadata files):
     *   /tmp/ompi_global_snapshot_PID.ckpt/seq_num
     */
    ret = orte_snapc_base_init_global_snapshot_directory(global_snapshot_handle, true);

    /*
     * Push this value to the GPR so the orted can pick it up
     */
    if( ORTE_SUCCESS == ret ) {
        ret = orte_snapc_base_set_job_ckpt_info(jobid,
                                                ORTE_SNAPC_CKPT_STATE_NONE,
                                                global_snapshot_handle,
                                                global_dir);
    }

    /* Release the temporaries on success and failure alike, before any
     * jump to the cleanup label. */
    free(global_snapshot_handle);
    global_snapshot_handle = NULL;
    free(global_dir);
    global_dir = NULL;

    if( ORTE_SUCCESS != ret ) {
        exit_status = ret;
        goto cleanup;
    }
}
opal_output_verbose(10, mca_snapc_full_component.super.output_handle, opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
"global [%d]) Setup job (%d) with vpid [%d, %d]\n", getpid(), jobid, vpid_start, vpid_range); "global [%d]) Setup job (%d) with vpid [%d, %d]\n", getpid(), jobid, vpid_start, vpid_range);
@ -360,7 +417,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
/********************* /*********************
* Generate the global snapshot directory, and unique global snapshot handle * Generate the global snapshot directory, and unique global snapshot handle
*********************/ *********************/
++orte_snapc_base_snapshot_seq_number; INC_SEQ_NUM();
*global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) ); *global_snapshot_handle = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number; global_snapshot.seq_num = orte_snapc_base_snapshot_seq_number;
@ -370,7 +427,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
/* Creates the directory (with metadata files): /* Creates the directory (with metadata files):
* /tmp/ompi_global_snapshot_PID.ckpt/seq_num * /tmp/ompi_global_snapshot_PID.ckpt/seq_num
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle))) { if( ORTE_SUCCESS != (ret = orte_snapc_base_init_global_snapshot_directory(*global_snapshot_handle, false))) {
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -538,6 +595,8 @@ static void vpid_ckpt_state_callback(orte_gpr_notify_data_t *data, void *cbdata)
char * global_dir = NULL; char * global_dir = NULL;
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name); global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(proc->jobid, if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(proc->jobid,
ORTE_SNAPC_CKPT_STATE_RUNNING, ORTE_SNAPC_CKPT_STATE_RUNNING,
global_snapshot.reference_name, global_snapshot.reference_name,
@ -779,6 +838,7 @@ static int snapc_full_global_notify_checkpoint( char * global_snapshot_handle,
/* /*
* Update the job global segment * Update the job global segment
*/ */
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid, if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
ckpt_state, ckpt_state,
global_snapshot_handle, global_snapshot_handle,
@ -821,6 +881,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
* Update the job checkpoint state * Update the job checkpoint state
**********************************/ **********************************/
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name); global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid, if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
cur_job_ckpt_state, cur_job_ckpt_state,

Просмотреть файл

@ -7,6 +7,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
*
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -138,7 +140,37 @@ int local_coord_setup_job(orte_jobid_t jobid)
goto cleanup; goto cleanup;
} }
ret = exit_status; if(orte_snapc_base_establish_gloabl_snapshot_dir) {
size_t ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
char * ckpt_snapshot_ref = NULL;
char * ckpt_snapshot_loc = NULL;
if( ORTE_SUCCESS != (ret = orte_snapc_base_get_job_ckpt_info(jobid,
&ckpt_state,
&ckpt_snapshot_ref,
&ckpt_snapshot_loc) ) ) {
exit_status = ret;
goto cleanup;
}
/*
 * Adopt the published snapshot location only when it is a non-empty
 * string. Comparing against "" with a length of strlen("") == 0 always
 * yields 0, so test the first character instead.
 */
if( NULL != ckpt_snapshot_loc &&
    '\0' != ckpt_snapshot_loc[0] ) {
    orte_snapc_base_global_snapshot_loc = strdup(ckpt_snapshot_loc);
}
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
"local) The global snapshot directory has been established at [%s]\n",
orte_snapc_base_global_snapshot_loc);
if( NULL != ckpt_snapshot_ref ) {
free(ckpt_snapshot_ref);
ckpt_snapshot_ref = NULL;
}
if( NULL != ckpt_snapshot_loc ) {
free(ckpt_snapshot_loc);
ckpt_snapshot_loc = NULL;
}
}
cleanup: cleanup:
return exit_status; return exit_status;

Просмотреть файл

@ -360,7 +360,14 @@ int orte_daemon(int argc, char *argv[])
opal_daemon_init(NULL); opal_daemon_init(NULL);
} }
/* Intialize OPAL */ #if OPAL_ENABLE_FT == 1
/* Mark as a tool program */
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
"1",
true, &environ);
#endif
/* Initialize OPAL */
if (ORTE_SUCCESS != (ret = opal_init())) { if (ORTE_SUCCESS != (ret = opal_init())) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
return ret; return ret;

Просмотреть файл

@ -393,6 +393,9 @@ int orterun(int argc, char *argv[])
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
/* Disable OPAL CR notifications for this tool */ /* Disable OPAL CR notifications for this tool */
opal_cr_set_enabled(false); opal_cr_set_enabled(false);
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
"1",
true, &environ);
#endif #endif
/* Initialize our Open RTE environment */