1
1

This commit fixes the checkpoint/restart functionality on the trunk. Included in this commit are:

* Extension to the ESS framework to support C/R
 * Fixed support for {{{snapc_base_establish_global_snapshot_dir}}}
 * Fixed FileM support
 * Misc. minor code modifications

There are some outstanding visability issues that I want to fix next.

This commit was SVN r17725.
This commit is contained in:
Josh Hursey 2008-03-05 04:57:23 +00:00
parent edb8e32a7a
commit 3b4073e32c
24 changed files with 429 additions and 272 deletions

View File

@ -86,7 +86,7 @@ enum ompi_crcp_base_pml_states_t {
};
typedef enum ompi_crcp_base_pml_states_t ompi_crcp_base_pml_states_t;
struct ompi_crcp_base_pml_state_t {
OMPI_DECLSPEC struct ompi_crcp_base_pml_state_t {
ompi_free_list_item_t super;
ompi_crcp_base_pml_states_t state;
int error_code;
@ -94,7 +94,7 @@ struct ompi_crcp_base_pml_state_t {
mca_pml_base_module_t *wrapped_pml_module;
};
typedef struct ompi_crcp_base_pml_state_t ompi_crcp_base_pml_state_t;
OBJ_CLASS_DECLARATION(ompi_crcp_base_pml_state_t);
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_crcp_base_pml_state_t);
typedef ompi_crcp_base_pml_state_t* (*ompi_crcp_base_pml_enable_fn_t)
(bool enable, ompi_crcp_base_pml_state_t* );

View File

@ -615,9 +615,9 @@ int ompi_proc_refresh(void) {
}
}
rc = ompi_proc_publish_info();
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
rc = ompi_proc_publish_info();
return rc;
}

View File

@ -336,14 +336,6 @@ static int ompi_cr_coord_post_restart(void) {
opal_output_verbose(10, ompi_cr_output,
"ompi_cr: coord_post_restart: ompi_cr_coord_post_restart()");
#if 0
/* register myself to require that I finalize before exiting */
if (ORTE_SUCCESS != (ret = orte_register_sync())) {
exit_status = ret;
goto cleanup;
}
#endif
/*
* Notify PML
* - Will notify BML and BTL's

View File

@ -90,14 +90,14 @@ extern "C" {
int opal_crs_base_none_disable_checkpoint(void);
int opal_crs_base_none_enable_checkpoint(void);
int opal_crs_base_none_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
int opal_crs_base_none_reg_thread(void);
OPAL_DECLSPEC int opal_crs_base_none_prelaunch(int32_t rank,
char *base_snapshot_dir,
char **app,
char **cwd,
char ***argv,
char ***env);
OPAL_DECLSPEC int opal_crs_base_none_reg_thread(void);
/**
* Some utility functions

View File

@ -42,7 +42,8 @@ static int rte_finalize(void);
orte_ess_base_module_t orte_ess_alps_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort
orte_ess_base_app_abort,
NULL /* ft_event */
};

View File

@ -42,7 +42,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
orte_ess_base_module_t orte_ess_cnos_module = {
rte_init,
rte_finalize,
rte_abort
rte_abort,
NULL /* ft_event */
};
static int rte_init(char flags)

View File

@ -80,12 +80,15 @@ static int env_set_name(void);
static int rte_init(char flags);
static int rte_finalize(void);
static int rte_ft_event(int state);
static int ess_env_ft_event_update_process_info(orte_process_name_t proc, pid_t pid);
orte_ess_base_module_t orte_ess_env_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort
orte_ess_base_app_abort,
rte_ft_event
};
@ -209,3 +212,180 @@ static int env_set_name(void)
return ORTE_SUCCESS;
}
static int rte_ft_event(int state)
{
int ret, exit_status = ORTE_SUCCESS;
char * procid_str = NULL;
char * jobid_str = NULL;
/******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) {
/*
* Notify IOF
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
}
/******** Continue Recovery ********/
else if (OPAL_CRS_CONTINUE == state ) {
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
}
/******** Restart Recovery ********/
else if (OPAL_CRS_RESTART == state ) {
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* - Reset Contact information
*/
if( ORTE_SUCCESS != (ret = env_set_name() ) ) {
exit_status = ret;
}
/* Session directory stuff:
* orte_process_info.top_session_dir
* orte_process_info.universe_session_dir
* orte_process_info.job_session_dir
* orte_process_info.proc_session_dir
*/
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
exit_status = ret;
}
if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) {
exit_status = ret;
}
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename,
NULL, /* Batch ID -- Not used */
jobid_str,
procid_str))) {
exit_status = ret;
}
/*
* Re-enable communication through the RML
*/
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Re-exchange the routes
*/
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
exit_status = ret;
goto cleanup;
}
/*
* Send new PID to HNP/daemon
* The checkpointer could have used a proxy program to boot us
* so the pid that the orted got from fork() may not be the
* PID of this application.
* - Note: BLCR does this because it tries to preseve the PID
* of the program across checkpointes
*/
if( ORTE_SUCCESS != (ret = ess_env_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) {
exit_status = ret;
goto cleanup;
}
}
else if (OPAL_CRS_TERM == state ) {
/* Nothing */
}
else {
/* Error state = Nothing */
}
cleanup:
if (NULL != jobid_str) {
free(jobid_str);
jobid_str = NULL;
}
return exit_status;
}
static int ess_env_ft_event_update_process_info(orte_process_name_t proc, pid_t proc_pid)
{
int ret, exit_status = ORTE_SUCCESS;
opal_buffer_t buffer;
orte_snapc_cmd_flag_t command = ORTE_SNAPC_LOCAL_UPDATE_CMD;
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_CMD )) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc_pid, 1, OPAL_PID))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_SNAPC, 0))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;
}

View File

@ -72,6 +72,17 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void);
*/
typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report);
/**
* Handle fault tolerance updates
*
* Handle fault tolerance updates
*
* @param[in] state Fault tolerance state update
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecifed error occurred
*/
typedef int (*orte_ess_base_module_ft_event_fn_t)(int state);
/*
* the standard module data structure
@ -80,6 +91,7 @@ struct orte_ess_base_module_1_0_0_t {
orte_ess_base_module_init_fn_t init;
orte_ess_base_module_finalize_fn_t finalize;
orte_ess_base_module_abort_fn_t abort;
orte_ess_base_module_ft_event_fn_t ft_event;
};

View File

@ -77,7 +77,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
orte_ess_base_module_t orte_ess_hnp_module = {
rte_init,
rte_finalize,
rte_abort
rte_abort,
NULL /* ft_event */
};

View File

@ -54,8 +54,8 @@ orte_ess_base_component_t mca_ess_lsf_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
/* Initialization / querying functions */

View File

@ -52,7 +52,8 @@ static int rte_finalize(void);
orte_ess_base_module_t orte_ess_lsf_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort
orte_ess_base_app_abort,
NULL /* ft_event */
};
static int rte_init(char flags)

View File

@ -41,7 +41,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
orte_ess_base_module_t orte_ess_portals_utcp_module = {
rte_init,
rte_finalize,
rte_abort
rte_abort,
NULL /* ft_event */
};
static int rte_init(char flags)

View File

@ -59,8 +59,8 @@ orte_ess_base_component_t mca_ess_singleton_component = {
/* Next the MCA v1.0.0 component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
/* Initialization / querying functions */

View File

@ -69,7 +69,8 @@ static int rte_init(char flags);
orte_ess_base_module_t orte_ess_singleton_module = {
rte_init,
orte_ess_base_app_finalize,
orte_ess_base_app_abort
orte_ess_base_app_abort,
NULL /* ft_event */
};
static int rte_init(char flags)

View File

@ -53,7 +53,8 @@ static int rte_finalize(void);
orte_ess_base_module_t orte_ess_slurm_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort
orte_ess_base_app_abort,
NULL /* ft_event */
};

View File

@ -51,7 +51,8 @@ static void rte_abort(int status, bool report) __opal_attribute_noreturn__;
orte_ess_base_module_t orte_ess_tool_module = {
rte_init,
orte_ess_base_tool_finalize,
rte_abort
rte_abort,
NULL /* ft_event */
};

View File

@ -66,7 +66,11 @@ int orte_filem_base_comm_start(void)
{
int rc;
if (recv_issued || !orte_process_info.hnp) {
/* Only active in HNP and daemons */
if( !orte_process_info.hnp && !orte_process_info.daemon ) {
return ORTE_SUCCESS;
}
if ( recv_issued ) {
return ORTE_SUCCESS;
}
@ -91,8 +95,12 @@ int orte_filem_base_comm_start(void)
int orte_filem_base_comm_stop(void)
{
int rc;
if (!recv_issued || !orte_process_info.hnp) {
/* Only active in HNP and daemons */
if( !orte_process_info.hnp && !orte_process_info.daemon ) {
return ORTE_SUCCESS;
}
if ( recv_issued ) {
return ORTE_SUCCESS;
}

View File

@ -129,7 +129,7 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
ORTE_DECLSPEC extern char * orte_snapc_base_global_snapshot_loc;
ORTE_DECLSPEC extern bool orte_snapc_base_store_in_place;
ORTE_DECLSPEC extern bool orte_snapc_base_store_only_one_seq;
ORTE_DECLSPEC extern bool orte_snapc_base_establish_gloabl_snapshot_dir;
ORTE_DECLSPEC extern bool orte_snapc_base_establish_global_snapshot_dir;
ORTE_DECLSPEC extern size_t orte_snapc_base_snapshot_seq_number;

View File

@ -51,7 +51,7 @@ char * orte_snapc_base_global_snapshot_loc = NULL;
char * orte_snapc_base_global_snapshot_ref = NULL;
bool orte_snapc_base_store_in_place = true;
bool orte_snapc_base_store_only_one_seq = false;
bool orte_snapc_base_establish_gloabl_snapshot_dir = false;
bool orte_snapc_base_establish_global_snapshot_dir = false;
/**
* Function for finding and opening either all MCA components,
@ -147,11 +147,11 @@ int orte_snapc_base_open(void)
false, false,
0,
&value);
orte_snapc_base_establish_gloabl_snapshot_dir = OPAL_INT_TO_BOOL(value);
orte_snapc_base_establish_global_snapshot_dir = OPAL_INT_TO_BOOL(value);
OPAL_OUTPUT_VERBOSE((20, orte_snapc_base_output,
"snapc:base: open: base_establish_gloabl_snapshot_dir = %d",
orte_snapc_base_establish_gloabl_snapshot_dir));
"snapc:base: open: base_establish_global_snapshot_dir = %d",
orte_snapc_base_establish_global_snapshot_dir));
/*
* User defined global snapshot directory name for this job

View File

@ -45,6 +45,8 @@ typedef uint8_t orte_snapc_full_cmd_flag_t;
#define ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD 1
#define ORTE_SNAPC_FULL_UPDATE_PROC_STATE_CMD 2
#define ORTE_SNAPC_FULL_VPID_ASSOC_CMD 3
#define ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD 4
/*
* Local Component structures
*/

View File

@ -76,6 +76,8 @@ static void snapc_full_process_proc_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
static void snapc_full_process_vpid_assoc_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
static void snapc_full_process_establish_dir_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
static void snapc_full_process_cmdline_request_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
@ -224,7 +226,7 @@ int global_coord_setup_job(orte_jobid_t jobid) {
/*
* If requested pre-establish the global snapshot directory
*/
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
if(orte_snapc_base_establish_global_snapshot_dir) {
char *global_snapshot_handle = NULL;
char *global_dir = NULL;
@ -248,17 +250,6 @@ int global_coord_setup_job(orte_jobid_t jobid) {
goto cleanup;
}
/*
* Notify orted to finish up
*/
if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(jobid,
ORTE_SNAPC_CKPT_STATE_NONE,
global_snapshot_handle,
global_dir) ) ) {
exit_status = ret;
goto cleanup;
}
free(global_snapshot_handle);
global_snapshot_handle = NULL;
@ -423,7 +414,8 @@ void snapc_full_global_cmd_recv(int status,
int rc;
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
"Global) Receive a command message."));
"Global) Receive a command message from %s.",
ORTE_NAME_PRINT(sender)));
/*
* If this is a command line checkpoint request, handle directly
@ -468,6 +460,13 @@ void snapc_full_global_cmd_recv(int status,
snapc_full_process_vpid_assoc_cmd(sender, buffer);
break;
case ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD:
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Global) Command: Establish checkpoint directory"));
snapc_full_process_establish_dir_cmd(sender, buffer);
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
}
@ -573,7 +572,7 @@ int global_coord_job_state_update(orte_jobid_t jobid,
}
if(ORTE_SNAPC_CKPT_STATE_REQUEST == job_ckpt_state ) {
#if JJH_FIX_ME
#if 0
/*
* Start the checkpoint, now that we have the jobid
*/
@ -850,6 +849,46 @@ int global_coord_vpid_assoc_update(orte_process_name_t local_coord,
return ORTE_SUCCESS;
}
static void snapc_full_process_establish_dir_cmd(orte_process_name_t* sender,
opal_buffer_t* exbuf)
{
int ret, exit_status = ORTE_SUCCESS;
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD;
opal_buffer_t buffer;
/* Send back:
* - Reference
* - Local location
*/
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(global_snapshot.reference_name), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(orte_snapc_base_global_snapshot_loc), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(sender, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&buffer);
return;
}
static void snapc_full_process_cmdline_request_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer)
{

View File

@ -97,6 +97,7 @@ static orte_snapc_full_local_snapshot_t *find_vpid_snapshot(orte_process_name_t
static int snapc_full_local_get_vpids(void);
static int snapc_full_local_setup_snapshot_dir(char * snapshot_ref, char * sugg_dir, char **actual_dir);
static int snapc_full_establish_dir(void);
static int snapc_full_local_start_checkpoint_all(size_t ckpt_state);
static int snapc_full_local_start_ckpt_open_comm(orte_snapc_full_local_snapshot_t *vpid_snapshot);
@ -150,6 +151,17 @@ int local_coord_setup_job(orte_jobid_t jobid)
goto cleanup;
}
/*
* Wait for the snapshot directory to be established before registering
* the callbacks since they use the same tag.
*/
if(orte_snapc_base_establish_global_snapshot_dir) {
if( ORTE_SUCCESS != (ret = snapc_full_establish_dir() ) ) {
exit_status = ret;
goto cleanup;
}
}
/*
* Setup Global Coordinator listener
*/
@ -166,39 +178,9 @@ int local_coord_setup_job(orte_jobid_t jobid)
goto cleanup;
}
if(orte_snapc_base_establish_gloabl_snapshot_dir) {
#if JJH_FIX_ME
size_t ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
char * ckpt_snapshot_ref = NULL;
char * ckpt_snapshot_loc = NULL;
if( ORTE_SUCCESS != (ret = orte_snapc_base_get_job_ckpt_info(jobid,
&ckpt_state,
&ckpt_snapshot_ref,
&ckpt_snapshot_loc) ) ) {
exit_status = ret;
goto cleanup;
}
if( NULL != ckpt_snapshot_loc &&
(0 != strncmp(ckpt_snapshot_loc, "", strlen(""))) ) {
orte_snapc_base_global_snapshot_loc = strdup(ckpt_snapshot_loc);
}
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) The global snapshot directory has been established at [%s]\n",
orte_snapc_base_global_snapshot_loc));
if( NULL != ckpt_snapshot_ref ) {
free(ckpt_snapshot_ref);
ckpt_snapshot_ref = NULL;
}
if( NULL != ckpt_snapshot_loc ) {
free(ckpt_snapshot_loc);
ckpt_snapshot_loc = NULL;
}
#endif
}
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) Finished setting up job\n"));
cleanup:
return exit_status;
@ -332,7 +314,7 @@ static int snapc_full_local_start_proc_listener(void)
}
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
"Global) Receive (Command line): Start command recv"));
"Local) Receive (Command line): Start command recv"));
/*
* Coordinator command listener
@ -363,7 +345,7 @@ static int snapc_full_local_stop_proc_listener(void)
}
OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle,
"Global) Receive (Command Line) stop command"));
"Local) Receive (Command Line) stop command"));
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_SNAPC))) {
@ -769,6 +751,105 @@ static int snapc_full_local_send_vpid_assoc(void)
return exit_status;
}
static int snapc_full_establish_dir(void)
{
int ret, exit_status = ORTE_SUCCESS;
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_ESTABLISH_DIR_CMD;
opal_buffer_t buffer;
char * ckpt_snapshot_ref = NULL;
char * ckpt_snapshot_loc = NULL;
orte_std_cntr_t count;
/*
* Global Coordinator: Operate locally
*/
if( ORTE_SNAPC_GLOBAL_COORD_TYPE == (orte_snapc_coord_type & ORTE_SNAPC_GLOBAL_COORD_TYPE)) {
opal_output(0, "Error: Not supported!\n");
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) Contact the HNP for global snapshot directory information to establish\n"));
/* Notify HNP of request for information */
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
OBJ_DESTRUCT(&buffer);
/* Wait for the HNP to release us */
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) Wait for response to global snapshot directory information request\n"));
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if( ORTE_SUCCESS != (ret = orte_rml.recv_buffer(ORTE_PROC_MY_HNP,
&buffer,
ORTE_RML_TAG_SNAPC_FULL,
ORTE_RML_NON_PERSISTENT) ) ) {
OBJ_DESTRUCT(&buffer);
exit_status = ret;
goto cleanup;
}
/*
* Unpack the data
* - command
* - ckpt_reference
* - ckpt_location
*/
count = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&buffer, &command, &count, ORTE_SNAPC_FULL_CMD))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
count = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&buffer, &ckpt_snapshot_ref, &count, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
count = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&buffer, &ckpt_snapshot_loc, &count, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if( NULL != ckpt_snapshot_loc &&
(0 != strncmp(ckpt_snapshot_loc, "", strlen(""))) ) {
orte_snapc_base_global_snapshot_loc = strdup(ckpt_snapshot_loc);
}
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) The global snapshot directory has been established at [%s]\n",
orte_snapc_base_global_snapshot_loc));
cleanup:
OBJ_DESTRUCT(&buffer);
if( NULL != ckpt_snapshot_ref ) {
free(ckpt_snapshot_ref);
ckpt_snapshot_ref = NULL;
}
if( NULL != ckpt_snapshot_loc ) {
free(ckpt_snapshot_loc);
ckpt_snapshot_loc = NULL;
}
return exit_status;
}
static int snapc_full_local_setup_snapshot_dir(char * snapshot_ref, char * sugg_dir, char **actual_dir)
{
int ret, exit_status = ORTE_SUCCESS;

View File

@ -86,8 +86,6 @@
BEGIN_C_DECLS
#define JJH_FIX_ME 0
/**
* States that a process can be in while checkpointing
*/

View File

@ -85,8 +85,6 @@ static int orte_cr_coord_post_ckpt(void);
static int orte_cr_coord_post_restart(void);
static int orte_cr_coord_post_continue(void);
static int orte_cr_update_process_info(orte_process_name_t proc, pid_t pid);
/*************
* Local vars
*************/
@ -247,19 +245,13 @@ static int orte_cr_coord_pre_ckpt(void) {
"orte_cr: coord_pre_ckpt: orte_cr_coord_pre_ckpt()");
/*
* Notify IOF
* Notify the ESS
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
if( NULL != orte_ess.ft_event ) {
if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CHECKPOINT))) {
exit_status = ret;
goto cleanup;
}
}
cleanup:
@ -305,8 +297,6 @@ static int orte_cr_coord_post_ckpt(void) {
static int orte_cr_coord_post_restart(void) {
int ret, exit_status = ORTE_SUCCESS;
char * procid_str = NULL;
char * jobid_str = NULL;
opal_output_verbose(10, orte_cr_output,
"orte_cr: coord_post_restart: orte_cr_coord_post_restart()");
@ -379,28 +369,7 @@ static int orte_cr_coord_post_restart(void) {
orte_process_info.my_name = *ORTE_NAME_INVALID;
/*
* Notify RML & OOB
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Startup Discovery Service:
* - Connect to the universe
* Structure Elements Refreshed:
* orte_universe_info.name
* orte_universe_info.host
* orte_universe_info.uid
* orte_universe_info.persistence
* orte_universe_info.scope
* orte_universe_info.seed_uri
* orte_universe_info.console_connected
* orte_universe_info.scriptfile
*
* orte_process_info.ns_replica_uri
* orte_process_info.gpr_replica_uri
* Notify the ESS
*/
if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
exit_status = ret;
@ -410,102 +379,14 @@ static int orte_cr_coord_post_restart(void) {
exit_status = ret;
}
/** JJH XXX
* RHC: JOSH - the ess no longer has a "set_name" api as
* it performs that function as part of init'ing the rte.
* We can restore it, if needed - or perhaps much of this
* could go into the ess as part of a new "restore_rte"?
*/
#if 0
/*
* - Reset Contact information
*/
if( ORTE_SUCCESS != (ret = orte_ess.set_name() ) ) {
exit_status = ret;
}
#endif
orte_ess_base_close();
/* Session directory stuff:
* orte_process_info.top_session_dir
* orte_process_info.universe_session_dir
* orte_process_info.job_session_dir
* orte_process_info.proc_session_dir
*/
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
exit_status = ret;
if( NULL != orte_ess.ft_event ) {
if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
}
if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) {
exit_status = ret;
}
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename,
NULL, /* Batch ID -- Not used */
jobid_str,
procid_str))) {
exit_status = ret;
}
/*
* Re-enable communication through the RML
*/
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_RESTART))) {
exit_status = ret;
goto cleanup;
}
/*
* Re-exchange the routes
*/
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
exit_status = ret;
goto cleanup;
}
/** RHC: JOSH - you'll need to send this to the PLM. That framework already has
* a receive posted, so you'll just need to add an appropriate command flag to
* "update proc info" - see orte/mca/plm/base/plm_base_receive.c
*
* I don't believe we use the pid info in the HNP anywhere - but this seems
* like it could be really dangerous to have the pid in the HNP -not- be the
* actual pid of the process!! Are you sure you want to do this??
*/
/*
* Send new PID to HNP/daemon
* The checkpointer could have used a proxy program to boot us
* so the pid that the orted got from fork() may not be the
* PID of this application.
* - Note: BLCR does this because it tries to preseve the PID
* of the program across checkpointes
*/
if( ORTE_SUCCESS != (ret = orte_cr_update_process_info(orte_process_info.my_name, getpid())) ) {
exit_status = ret;
goto cleanup;
}
cleanup:
if (NULL != jobid_str) {
free(jobid_str);
jobid_str = NULL;
}
return exit_status;
}
@ -516,19 +397,13 @@ static int orte_cr_coord_post_continue(void) {
"orte_cr: coord_post_continue: orte_cr_coord_post_continue()\n");
/*
* Notify RML & OOB
* Notify the ESS
*/
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
/*
* Notify IOF
*/
if( ORTE_SUCCESS != (ret = orte_iof.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
if( NULL != orte_ess.ft_event ) {
if( ORTE_SUCCESS != (ret = orte_ess.ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
}
cleanup:
@ -557,41 +432,3 @@ int orte_cr_entry_point_finalize(void)
return ORTE_SUCCESS;
}
static int orte_cr_update_process_info(orte_process_name_t proc, pid_t proc_pid)
{
int ret, exit_status = ORTE_SUCCESS;
opal_buffer_t buffer;
orte_snapc_cmd_flag_t command = ORTE_SNAPC_LOCAL_UPDATE_CMD;
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_SNAPC_CMD )) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &proc_pid, 1, OPAL_PID))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_SNAPC, 0))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;
}