From 6ee0c641fdd6024ceb2fcf0fd5beeb1cb42b849d Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Sun, 15 Apr 2007 14:28:56 +0000 Subject: [PATCH] Cleanup the output from orte-checkpoint so it is a bit more clear and references the sequence number. Before: [...] Finished - Global Snapshot Reference: ompi_global_snapshot_1234.ckpt After: Snashot Ref.: 1 ompi_global_snapshot_1234.ckpt This commit was SVN r14381. --- orte/mca/snapc/base/base.h | 2 +- orte/mca/snapc/base/snapc_base_fns.c | 10 ++++-- orte/mca/snapc/full/snapc_full_global.c | 3 ++ orte/tools/orte-checkpoint/orte-checkpoint.c | 33 ++++++++++++++++---- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/orte/mca/snapc/base/base.h b/orte/mca/snapc/base/base.h index 05769d2588..aa544697ca 100644 --- a/orte/mca/snapc/base/base.h +++ b/orte/mca/snapc/base/base.h @@ -143,7 +143,7 @@ extern "C" { *******************************/ /* Initial handshake with the orte_checkpoint command */ ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *term, orte_jobid_t *jobid); - ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status); + ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status); ORTE_DECLSPEC int orte_snapc_base_global_coord_recv_ack(orte_process_name_t* peer, bool *ack); ORTE_DECLSPEC int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack); diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 3063050fc2..008f398536 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -244,7 +244,7 @@ static void snapc_none_global_recv(int status, /* * Respond with an invalid response */ - if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, 
ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) { + if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) { exit_status = ret; goto cleanup; } @@ -454,7 +454,7 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack) return exit_status; } -int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status) +int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status) { int ret, exit_status = ORTE_SUCCESS; orte_buffer_t *loc_buffer = NULL; @@ -538,7 +538,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char } /******************** - * Send over the global snapshot handle + * Send over the global snapshot handle & sequence number ********************/ if(NULL != loc_buffer) { OBJ_RELEASE(loc_buffer); @@ -553,6 +553,10 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char exit_status = ret; goto cleanup; } + if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &seq_num, 1, ORTE_INT))) { + exit_status = ret; + goto cleanup; + } if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) { exit_status = ret; goto cleanup; diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index ed0c0e9bc8..7595a75162 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ b/orte/mca/snapc/full/snapc_full_global.c @@ -278,6 +278,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh updated_job_to_running = false; if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.reference_name, + global_snapshot.seq_num, ORTE_SNAPC_CKPT_STATE_REQUEST) ) ) { exit_status = ret; goto cleanup; @@ -362,6 +363,7 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void 
*cbdata */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.reference_name, + global_snapshot.seq_num, job_ckpt_state) ) ) { exit_status = ret; goto cleanup; @@ -751,6 +753,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) { ************************/ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.reference_name, + global_snapshot.seq_num, ckpt_status)) ) { exit_status = ret; goto cleanup; diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c index 28e8e69864..25664424ac 100644 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ b/orte/tools/orte-checkpoint/orte-checkpoint.c @@ -90,11 +90,12 @@ extern char** environ; static int ckpt_init(int argc, char *argv[]); /* Initalization routine */ static int ckpt_finalize(void); /* Finalization routine */ static int parse_args(int argc, char *argv[]); -static int notify_process_for_checkpoint(char **global_snapshot_handle, int term); +static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term); static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term); -static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status); +static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status); static int find_universe(void); static int pretty_print_status(int state, char * snapshot_ref); +static int pretty_print_reference(int seq, char * snapshot_ref); /***************************************** * Global Vars for Command line Arguments @@ -163,6 +164,7 @@ main(int argc, char *argv[]) { int ret, exit_status = ORTE_SUCCESS; char *global_snapshot_handle; + int seq_num = -1; /*************** * Initialize @@ -202,6 +204,7 @@ main(int argc, char *argv[]) } if(ORTE_SUCCESS != 
(ret = notify_process_for_checkpoint(&global_snapshot_handle, + &seq_num, orte_checkpoint_globals.term)) ) { opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, orte_checkpoint_globals.pid); @@ -209,8 +212,13 @@ main(int argc, char *argv[]) goto cleanup; } - if(!orte_checkpoint_globals.nowait) + if( orte_checkpoint_globals.status ) { pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle); + } + + if(!orte_checkpoint_globals.nowait) { + pretty_print_reference(seq_num, global_snapshot_handle); + } cleanup: /*************** @@ -316,7 +324,7 @@ static int parse_args(int argc, char *argv[]) { } static int -notify_process_for_checkpoint(char **global_snapshot_handle, int term) +notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term) { int ret, exit_status = ORTE_SUCCESS; orte_process_name_t peer; @@ -347,7 +355,7 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int term) * Wait for progress updates, stop waiting when 'Finished' status */ do { - if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, &ckpt_status)) ) { + if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num, &ckpt_status)) ) { exit_status = ORTE_ERROR; goto cleanup; } @@ -699,7 +707,7 @@ static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, return exit_status; } -static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status) { +static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status) { int ret, exit_status = ORTE_SUCCESS; orte_buffer_t *loc_buffer; orte_std_cntr_t n = 1; @@ -784,6 +792,11 @@ static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot exit_status = ret; goto cleanup; } + n = 1; + if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, seq_num, &n, ORTE_INT)) ) { + exit_status = ret; + goto cleanup; + } /* 
ACK */
     if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
@@ -811,3 +824,11 @@ static int pretty_print_status(int state, char * snapshot_ref) {
 
     return ORTE_SUCCESS;
 }
+
+/* Print the sequence number and snapshot reference on one stdout line; always returns ORTE_SUCCESS. */
+static int pretty_print_reference(int seq, char * snapshot_ref) {
+    /* %3d right-aligns the sequence number in a 3-character field. */
+    printf("Snapshot Ref.: %3d %s\n", seq, snapshot_ref);
+
+    return ORTE_SUCCESS;
+}