From a287c9cb65c5efedbc8d6852b5ee4480b573585c Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Thu, 13 Dec 2007 14:37:17 +0000 Subject: [PATCH] This commit distinguishes the file transfer stage from the finish stage. This commit also cleans up the checkpoint and terminate case making it more precise than before. Previously the application could make a small amount of progress between checkpoint completion and application termination. Now the application will make no progress at all in this time span. Additional minor change: - Start using OPAL_INT_TO_BOOL instead of if/else logic This commit was SVN r16952. --- ompi/mca/crcp/coord/crcp_coord_component.c | 7 +--- orte/mca/snapc/base/snapc_base_fns.c | 3 ++ orte/mca/snapc/base/snapc_base_open.c | 21 ++--------- orte/mca/snapc/full/snapc_full_app.c | 8 +++++ orte/mca/snapc/full/snapc_full_component.c | 9 ++--- orte/mca/snapc/full/snapc_full_global.c | 42 +++++++++++++++++----- orte/mca/snapc/full/snapc_full_local.c | 25 ++++++++----- orte/mca/snapc/snapc.h | 8 +++-- 8 files changed, 73 insertions(+), 50 deletions(-) diff --git a/ompi/mca/crcp/coord/crcp_coord_component.c b/ompi/mca/crcp/coord/crcp_coord_component.c index e10eb37d1d..5a9401c660 100644 --- a/ompi/mca/crcp/coord/crcp_coord_component.c +++ b/ompi/mca/crcp/coord/crcp_coord_component.c @@ -112,12 +112,7 @@ static int crcp_coord_open(void) false, false, 0, &val); - if( 0 != val ) { - timing_enabled = true; - } - else { - timing_enabled = false; - } + timing_enabled = OPAL_INT_TO_BOOL(val); /* * Debug Output diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 8d8e314581..a07bd66eef 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -1836,6 +1836,9 @@ char * orte_snapc_ckpt_state_str(size_t state) case ORTE_SNAPC_CKPT_STATE_RUNNING: return strdup("Running"); break; + case ORTE_SNAPC_CKPT_STATE_FILE_XFER: + return strdup("File Transfer"); + break; case 
ORTE_SNAPC_CKPT_STATE_FINISHED: return strdup("Finished"); break; diff --git a/orte/mca/snapc/base/snapc_base_open.c b/orte/mca/snapc/base/snapc_base_open.c index 18cb523e75..7b63818112 100644 --- a/orte/mca/snapc/base/snapc_base_open.c +++ b/orte/mca/snapc/base/snapc_base_open.c @@ -103,12 +103,7 @@ int orte_snapc_base_open(void) false, false, 1, &value); - if( 0 != value ) { /* Enabled */ - orte_snapc_base_store_in_place = true; - } - else { /* Disabled */ - orte_snapc_base_store_in_place = false; - } + orte_snapc_base_store_in_place = OPAL_INT_TO_BOOL(value); /* * Reuse sequence numbers @@ -121,12 +116,7 @@ int orte_snapc_base_open(void) false, false, 0, &value); - if( 0 != value ) { /* Enabled */ - orte_snapc_base_store_only_one_seq = true; - } - else { /* Disabled */ - orte_snapc_base_store_only_one_seq = false; - } + orte_snapc_base_store_only_one_seq = OPAL_INT_TO_BOOL(value); /* * Pre-establish the global snapshot directory upon job registration @@ -137,12 +127,7 @@ int orte_snapc_base_open(void) false, false, 0, &value); - if( 0 != value ) { /* Enabled */ - orte_snapc_base_establish_gloabl_snapshot_dir = true; - } - else { /* Disabled */ - orte_snapc_base_establish_gloabl_snapshot_dir = false; - } + orte_snapc_base_establish_gloabl_snapshot_dir = OPAL_INT_TO_BOOL(value); /* * User defined global snapshot directory name for this job diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c index 381bf21725..9bcf89256d 100644 --- a/orte/mca/snapc/full/snapc_full_app.c +++ b/orte/mca/snapc/full/snapc_full_app.c @@ -512,6 +512,14 @@ static int snapc_full_app_ckpt_handshake_end(int cr_state) goto cleanup; } + /* + * If the last command is non-zero then we need to terminate instead of + * returning to computation. 
+ */ + if( 0 != last_cmd ) { + exit(0); + } + cleanup: return exit_status; } diff --git a/orte/mca/snapc/full/snapc_full_component.c b/orte/mca/snapc/full/snapc_full_component.c index 222462c774..0825d4ef50 100644 --- a/orte/mca/snapc/full/snapc_full_component.c +++ b/orte/mca/snapc/full/snapc_full_component.c @@ -115,13 +115,8 @@ static int snapc_full_open(void) false, false, 0, &value); - if( 0 != value ) { /* Enabled */ - orte_snapc_full_skip_filem = true; - } - else { /* Disabled */ - orte_snapc_full_skip_filem = false; - } - + orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value); + /* * Debug Output */ diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index 73e68376cb..5d119dca74 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ b/orte/mca/snapc/full/snapc_full_global.c @@ -519,8 +519,9 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata goto cleanup; } } - else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state && - ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) { + else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state && + ORTE_SNAPC_CKPT_STATE_FILE_XFER != job_ckpt_state && + ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) { /* * Update the orte-checkpoint cmd */ @@ -865,8 +866,33 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) { return exit_status; } - cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED; - + /********************************** + * Update the job checkpoint state + **********************************/ + global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name); + orte_snapc_base_global_snapshot_loc = strdup(global_dir); + + cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FILE_XFER; + + if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid, + cur_job_ckpt_state, + global_snapshot.reference_name, + global_dir) ) ) { + exit_status = ret; + goto cleanup; + } + + /************************ + * 
Update the orte_checkpoint command (File Transfer) + ************************/ + if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, + global_snapshot.reference_name, + global_snapshot.seq_num, + cur_job_ckpt_state)) ) { + exit_status = ret; + goto cleanup; + } + /********************** * Gather all of the files locally * Note: We don't need to worry about the return code in as much since the @@ -876,12 +902,11 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) { exit_status = ret; cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_ERROR; } - + /********************************** * Update the job checkpoint state **********************************/ - global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name); - orte_snapc_base_global_snapshot_loc = strdup(global_dir); + cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED; if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid, cur_job_ckpt_state, @@ -914,7 +939,6 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) { goto cleanup; } } - /************************ * Do the final handshake with the orte_checkpoint command @@ -940,6 +964,8 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) { /******************************** * Terminate the job if requested + * At this point the application should have already exited, but do this + * just to make doubly sure that the job is terminated. 
*********************************/ if( term_job ) { orte_pls.terminate_job(jobid, &orte_abort_timeout, NULL); diff --git a/orte/mca/snapc/full/snapc_full_local.c b/orte/mca/snapc/full/snapc_full_local.c index 1f5f4cb9cc..35b428859e 100644 --- a/orte/mca/snapc/full/snapc_full_local.c +++ b/orte/mca/snapc/full/snapc_full_local.c @@ -404,9 +404,12 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v goto cleanup; } } - else if( ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) { + else if( ORTE_SNAPC_CKPT_STATE_FILE_XFER == ckpt_state || + ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) { /* * Release all checkpointed processes now that the checkpoint is complete + * If the request was to checkpoint and then terminate, this command will tell + * the application to do so upon release. */ for(item = opal_list_get_first(&snapc_local_vpids); item != opal_list_get_end(&snapc_local_vpids); @@ -425,13 +428,6 @@ static void snapc_full_local_job_state_callback( orte_gpr_notify_data_t *data, v goto cleanup; } } - - /* - * If the PLS was able to actually allow for local calls to - * orte_pls.terminate_proc then we could terminate the processes - * from there, but since it is not implemented we need to do - * it from the HNP. :/ - */ } cleanup: @@ -1127,6 +1123,19 @@ static int snapc_full_local_end_ckpt_handshake(orte_snapc_full_local_snapshot_t int ret, exit_status = ORTE_SUCCESS; int last_cmd = 0; + /* + * Make sure the pipe is open, so we do not try to do this twice + */ + if( 0 > vpid_snapshot->comm_pipe_w_fd ) { + return exit_status; + } + + if( vpid_snapshot->super.term ) { + last_cmd = 999; + } else { + last_cmd = 0; + } + /* * Finish the handshake.
*/ diff --git a/orte/mca/snapc/snapc.h b/orte/mca/snapc/snapc.h index 2c40975117..e1962f5e66 100644 --- a/orte/mca/snapc/snapc.h +++ b/orte/mca/snapc/snapc.h @@ -115,11 +115,13 @@ extern "C" { /* Running the checkpoint */ #define ORTE_SNAPC_CKPT_STATE_RUNNING 4 /* Finished the checkpoint */ -#define ORTE_SNAPC_CKPT_STATE_FINISHED 5 +#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 5 +/* Finished the checkpoint */ +#define ORTE_SNAPC_CKPT_STATE_FINISHED 6 /* Unable to checkpoint this job */ -#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 6 +#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 7 /* Reached an error */ -#define ORTE_SNAPC_CKPT_STATE_ERROR 7 +#define ORTE_SNAPC_CKPT_STATE_ERROR 8 /** * Definition of a orte local snapshot.