This commit distinguishes the file transfer stage from the finish stage.
This commit also cleans up the checkpoint-and-terminate case, making it more precise than before. Previously, the application could make a small amount of progress between checkpoint completion and application termination. Now the application makes no progress at all in this time span. Additional minor change: start using OPAL_INT_TO_BOOL instead of if/else logic. This commit was SVN r16952.
Этот коммит содержится в:
родитель
ecd563b0fa
Коммит
a287c9cb65
@ -112,12 +112,7 @@ static int crcp_coord_open(void)
|
||||
false, false,
|
||||
0,
|
||||
&val);
|
||||
if( 0 != val ) {
|
||||
timing_enabled = true;
|
||||
}
|
||||
else {
|
||||
timing_enabled = false;
|
||||
}
|
||||
timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
|
@ -1836,6 +1836,9 @@ char * orte_snapc_ckpt_state_str(size_t state)
|
||||
case ORTE_SNAPC_CKPT_STATE_RUNNING:
|
||||
return strdup("Running");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_FILE_XFER:
|
||||
return strdup("File Transfer");
|
||||
break;
|
||||
case ORTE_SNAPC_CKPT_STATE_FINISHED:
|
||||
return strdup("Finished");
|
||||
break;
|
||||
|
@ -103,12 +103,7 @@ int orte_snapc_base_open(void)
|
||||
false, false,
|
||||
1,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_base_store_in_place = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_base_store_in_place = false;
|
||||
}
|
||||
orte_snapc_base_store_in_place = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* Reuse sequence numbers
|
||||
@ -121,12 +116,7 @@ int orte_snapc_base_open(void)
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_base_store_only_one_seq = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_base_store_only_one_seq = false;
|
||||
}
|
||||
orte_snapc_base_store_only_one_seq = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* Pre-establish the global snapshot directory upon job registration
|
||||
@ -137,12 +127,7 @@ int orte_snapc_base_open(void)
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_base_establish_gloabl_snapshot_dir = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
||||
}
|
||||
orte_snapc_base_establish_gloabl_snapshot_dir = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* User defined global snapshot directory name for this job
|
||||
|
@ -512,6 +512,14 @@ static int snapc_full_app_ckpt_handshake_end(int cr_state)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the last command is non-zero then we need to terminate instead of
|
||||
* returning to computation.
|
||||
*/
|
||||
if( 0 != last_cmd ) {
|
||||
exit(0);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
@ -115,13 +115,8 @@ static int snapc_full_open(void)
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
if( 0 != value ) { /* Enabled */
|
||||
orte_snapc_full_skip_filem = true;
|
||||
}
|
||||
else { /* Disabled */
|
||||
orte_snapc_full_skip_filem = false;
|
||||
}
|
||||
|
||||
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
|
@ -519,8 +519,9 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state &&
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) {
|
||||
else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state &&
|
||||
ORTE_SNAPC_CKPT_STATE_FILE_XFER != job_ckpt_state &&
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) {
|
||||
/*
|
||||
* Update the orte-checkpoint cmd
|
||||
*/
|
||||
@ -865,8 +866,33 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
|
||||
|
||||
/**********************************
|
||||
* Update the job checkpoint state
|
||||
**********************************/
|
||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
|
||||
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FILE_XFER;
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||
cur_job_ckpt_state,
|
||||
global_snapshot.reference_name,
|
||||
global_dir) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/************************
|
||||
* Update the orte_checkpoint command (File Transfer)
|
||||
************************/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
|
||||
global_snapshot.reference_name,
|
||||
global_snapshot.seq_num,
|
||||
cur_job_ckpt_state)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/**********************
|
||||
* Gather all of the files locally
|
||||
* Note: We don't need to worry about the return code in as much since the
|
||||
@ -876,12 +902,11 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
exit_status = ret;
|
||||
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/**********************************
|
||||
* Update the job checkpoint state
|
||||
**********************************/
|
||||
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
|
||||
orte_snapc_base_global_snapshot_loc = strdup(global_dir);
|
||||
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
|
||||
cur_job_ckpt_state,
|
||||
@ -914,7 +939,6 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/************************
|
||||
* Do the final handshake with the orte_checkpoint command
|
||||
@ -940,6 +964,8 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
|
||||
/********************************
|
||||
* Terminate the job if requested
|
||||
* At this point the application should have already exited, but do this
|
||||
* just to make doubly sure that the job is terminated.
|
||||
*********************************/
|
||||
if( term_job ) {
|
||||
orte_pls.terminate_job(jobid, &orte_abort_timeout, NULL);
|
||||
|
@ -404,9 +404,12 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if( ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) {
|
||||
else if( ORTE_SNAPC_CKPT_STATE_FILE_XFER == ckpt_state ||
|
||||
ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) {
|
||||
/*
|
||||
* Release all checkpointed processes now that the checkpoint is complete
|
||||
* If the request was to checkpoint and then terminate, this command will tell
|
||||
* the application to do so upon release.
|
||||
*/
|
||||
for(item = opal_list_get_first(&snapc_local_vpids);
|
||||
item != opal_list_get_end(&snapc_local_vpids);
|
||||
@ -425,13 +428,6 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the PLS was able to actually allow for local calls to
|
||||
* orte_pls.terminate_proc then we could terminate the processes
|
||||
* from there, but since it is not implemented we need to do
|
||||
* it from the HNP. :/
|
||||
*/
|
||||
}
|
||||
|
||||
cleanup:
|
||||
@ -1127,6 +1123,19 @@ static int snapc_full_local_end_ckpt_handshake(orte_snapc_full_local_snapshot_t
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
int last_cmd = 0;
|
||||
|
||||
/*
|
||||
* Make sure the pipe is open, so we do not try to do this twice
|
||||
*/
|
||||
if( 0 > vpid_snapshot->comm_pipe_w_fd ) {
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
if( vpid_snapshot->super.term ) {
|
||||
last_cmd = 999;
|
||||
} else {
|
||||
last_cmd = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finish the handshake.
|
||||
*/
|
||||
|
@ -115,11 +115,13 @@ extern "C" {
|
||||
/* Running the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4
|
||||
/* Finished the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED 5
|
||||
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5
|
||||
/* Finished the checkpoint */
|
||||
#define ORTE_SNAPC_CKPT_STATE_FINISHED 6
|
||||
/* Unable to checkpoint this job */
|
||||
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 6
|
||||
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7
|
||||
/* Reached an error */
|
||||
#define ORTE_SNAPC_CKPT_STATE_ERROR 7
|
||||
#define ORTE_SNAPC_CKPT_STATE_ERROR 8
|
||||
|
||||
/**
|
||||
* Definition of a orte local snapshot.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user