1
1

This commit distinguishes the file transfer stage from the finish stage.

This commit also cleans up the checkpoint-and-terminate case, making it more
precise than before. Previously, the application could make a small amount of
progress between checkpoint completion and application termination. Now the
application will make no progress at all in this time span.

Additional minor change:
 - Start using OPAL_INT_TO_BOOL instead of if/else logic

This commit was SVN r16952.
Этот коммит содержится в:
Josh Hursey 2007-12-13 14:37:17 +00:00
родитель ecd563b0fa
Коммит a287c9cb65
8 изменённых файлов: 73 добавлений и 50 удалений

Просмотреть файл

@ -112,12 +112,7 @@ static int crcp_coord_open(void)
false, false,
0,
&val);
if( 0 != val ) {
timing_enabled = true;
}
else {
timing_enabled = false;
}
timing_enabled = OPAL_INT_TO_BOOL(val);
/*
* Debug Output

Просмотреть файл

@ -1836,6 +1836,9 @@ char * orte_snapc_ckpt_state_str(size_t state)
case ORTE_SNAPC_CKPT_STATE_RUNNING:
return strdup("Running");
break;
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
return strdup("File Transfer");
break;
case ORTE_SNAPC_CKPT_STATE_FINISHED:
return strdup("Finished");
break;

Просмотреть файл

@ -103,12 +103,7 @@ int orte_snapc_base_open(void)
false, false,
1,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_base_store_in_place = true;
}
else { /* Disabled */
orte_snapc_base_store_in_place = false;
}
orte_snapc_base_store_in_place = OPAL_INT_TO_BOOL(value);
/*
* Reuse sequence numbers
@ -121,12 +116,7 @@ int orte_snapc_base_open(void)
false, false,
0,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_base_store_only_one_seq = true;
}
else { /* Disabled */
orte_snapc_base_store_only_one_seq = false;
}
orte_snapc_base_store_only_one_seq = OPAL_INT_TO_BOOL(value);
/*
* Pre-establish the global snapshot directory upon job registration
@ -137,12 +127,7 @@ int orte_snapc_base_open(void)
false, false,
0,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_base_establish_gloabl_snapshot_dir = true;
}
else { /* Disabled */
orte_snapc_base_establish_gloabl_snapshot_dir = false;
}
orte_snapc_base_establish_gloabl_snapshot_dir = OPAL_INT_TO_BOOL(value);
/*
* User defined global snapshot directory name for this job

Просмотреть файл

@ -512,6 +512,14 @@ static int snapc_full_app_ckpt_handshake_end(int cr_state)
goto cleanup;
}
/*
* If the last command is non-zero then we need to terminate instead of
* returning to computation.
*/
if( 0 != last_cmd ) {
exit(0);
}
cleanup:
return exit_status;
}

Просмотреть файл

@ -115,13 +115,8 @@ static int snapc_full_open(void)
false, false,
0,
&value);
if( 0 != value ) { /* Enabled */
orte_snapc_full_skip_filem = true;
}
else { /* Disabled */
orte_snapc_full_skip_filem = false;
}
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
/*
* Debug Output
*/

Просмотреть файл

@ -519,8 +519,9 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
goto cleanup;
}
}
else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state &&
ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) {
else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state &&
ORTE_SNAPC_CKPT_STATE_FILE_XFER != job_ckpt_state &&
ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) {
/*
* Update the orte-checkpoint cmd
*/
@ -865,8 +866,33 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
return exit_status;
}
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
/**********************************
* Update the job checkpoint state
**********************************/
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FILE_XFER;
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
cur_job_ckpt_state,
global_snapshot.reference_name,
global_dir) ) ) {
exit_status = ret;
goto cleanup;
}
/************************
* Update the orte_checkpoint command (File Transfer)
************************/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
global_snapshot.reference_name,
global_snapshot.seq_num,
cur_job_ckpt_state)) ) {
exit_status = ret;
goto cleanup;
}
/**********************
* Gather all of the files locally
* Note: We don't need to worry about the return code in as much since the
@ -876,12 +902,11 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
exit_status = ret;
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_ERROR;
}
/**********************************
* Update the job checkpoint state
**********************************/
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
cur_job_ckpt_state,
@ -914,7 +939,6 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
goto cleanup;
}
}
/************************
* Do the final handshake with the orte_checkpoint command
@ -940,6 +964,8 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
/********************************
* Terminate the job if requested
* At this point the application should have already exited, but do this
* just to make doubly sure that the job is terminated.
*********************************/
if( term_job ) {
orte_pls.terminate_job(jobid, &orte_abort_timeout, NULL);

Просмотреть файл

@ -404,9 +404,12 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v
goto cleanup;
}
}
else if( ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) {
else if( ORTE_SNAPC_CKPT_STATE_FILE_XFER == ckpt_state ||
ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) {
/*
* Release all checkpointed processes now that the checkpoint is complete.
* If the request was to checkpoint and then terminate, this command will tell
* the application to do so upon release.
*/
for(item = opal_list_get_first(&snapc_local_vpids);
item != opal_list_get_end(&snapc_local_vpids);
@ -425,13 +428,6 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v
goto cleanup;
}
}
/*
* If the PLS was able to actually allow for local calls to
* orte_pls.terminate_proc then we could terminate the processes
* from there, but since it is not implemented we need to do
* it from the HNP. :/
*/
}
cleanup:
@ -1127,6 +1123,19 @@ static int snapc_full_local_end_ckpt_handshake(orte_snapc_full_local_snapshot_t
int ret, exit_status = ORTE_SUCCESS;
int last_cmd = 0;
/*
* Make sure the pipe is open, so we do not try to do this twice
*/
if( 0 > vpid_snapshot->comm_pipe_w_fd ) {
return exit_status;
}
if( vpid_snapshot->super.term ) {
last_cmd = 999;
} else {
last_cmd = 0;
}
/*
* Finish the handshake.
*/

Просмотреть файл

@ -115,11 +115,13 @@ extern "C" {
/* Running the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 5
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
/* Finished the checkpoint */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
/* Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 6
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
/* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_ERROR 7
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
/**
* Definition of a orte local snapshot.