1
1

Clean up the output from orte-checkpoint so it is a bit clearer and references

the sequence number.

Before:
[...] Finished - Global Snapshot Reference: ompi_global_snapshot_1234.ckpt

After:
Snapshot Ref.:   1 ompi_global_snapshot_1234.ckpt

This commit was SVN r14381.
Этот коммит содержится в:
Josh Hursey 2007-04-15 14:28:56 +00:00
родитель 83ddd7a3c5
Коммит 6ee0c641fd
4 изменённых файлов: 38 добавлений и 10 удалений

Просмотреть файл

@ -143,7 +143,7 @@ extern "C" {
*******************************/
/* Initial handshake with the orte_checkpoint command */
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *term, orte_jobid_t *jobid);
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status);
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status);
ORTE_DECLSPEC int orte_snapc_base_global_coord_recv_ack(orte_process_name_t* peer, bool *ack);
ORTE_DECLSPEC int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack);

Просмотреть файл

@ -244,7 +244,7 @@ static void snapc_none_global_recv(int status,
/*
* Respond with an invalid response
*/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
exit_status = ret;
goto cleanup;
}
@ -454,7 +454,7 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack)
return exit_status;
}
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status)
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status)
{
int ret, exit_status = ORTE_SUCCESS;
orte_buffer_t *loc_buffer = NULL;
@ -538,7 +538,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
}
/********************
* Send over the global snapshot handle
* Send over the global snapshot handle & sequence number
********************/
if(NULL != loc_buffer) {
OBJ_RELEASE(loc_buffer);
@ -553,6 +553,10 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &seq_num, 1, ORTE_INT))) {
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
exit_status = ret;
goto cleanup;

Просмотреть файл

@ -278,6 +278,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
updated_job_to_running = false;
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
global_snapshot.reference_name,
global_snapshot.seq_num,
ORTE_SNAPC_CKPT_STATE_REQUEST) ) ) {
exit_status = ret;
goto cleanup;
@ -362,6 +363,7 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
*/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
global_snapshot.reference_name,
global_snapshot.seq_num,
job_ckpt_state) ) ) {
exit_status = ret;
goto cleanup;
@ -751,6 +753,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
************************/
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
global_snapshot.reference_name,
global_snapshot.seq_num,
ckpt_status)) ) {
exit_status = ret;
goto cleanup;

Просмотреть файл

@ -90,11 +90,12 @@ extern char** environ;
/* Forward declarations of the file-local (static) helper routines. */
static int ckpt_init(int argc, char *argv[]); /* Initialization routine */
static int ckpt_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]);
static int notify_process_for_checkpoint(char **global_snapshot_handle, int term);
static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term);
static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term);
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status);
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status);
static int find_universe(void);
static int pretty_print_status(int state, char * snapshot_ref);
static int pretty_print_reference(int seq, char * snapshot_ref);
/*****************************************
* Global Vars for Command line Arguments
@ -163,6 +164,7 @@ main(int argc, char *argv[])
{
int ret, exit_status = ORTE_SUCCESS;
char *global_snapshot_handle;
int seq_num = -1;
/***************
* Initialize
@ -202,6 +204,7 @@ main(int argc, char *argv[])
}
if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint(&global_snapshot_handle,
&seq_num,
orte_checkpoint_globals.term)) ) {
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
orte_checkpoint_globals.pid);
@ -209,8 +212,13 @@ main(int argc, char *argv[])
goto cleanup;
}
if(!orte_checkpoint_globals.nowait)
if( orte_checkpoint_globals.status ) {
pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle);
}
if(!orte_checkpoint_globals.nowait) {
pretty_print_reference(seq_num, global_snapshot_handle);
}
cleanup:
/***************
@ -316,7 +324,7 @@ static int parse_args(int argc, char *argv[]) {
}
static int
notify_process_for_checkpoint(char **global_snapshot_handle, int term)
notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term)
{
int ret, exit_status = ORTE_SUCCESS;
orte_process_name_t peer;
@ -347,7 +355,7 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int term)
* Wait for progress updates, stop waiting when 'Finished' status
*/
do {
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, &ckpt_status)) ) {
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num, &ckpt_status)) ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -699,7 +707,7 @@ static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle,
return exit_status;
}
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status) {
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status) {
int ret, exit_status = ORTE_SUCCESS;
orte_buffer_t *loc_buffer;
orte_std_cntr_t n = 1;
@ -784,6 +792,11 @@ static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot
exit_status = ret;
goto cleanup;
}
n = 1;
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, seq_num, &n, ORTE_INT)) ) {
exit_status = ret;
goto cleanup;
}
/* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
@ -811,3 +824,11 @@ static int pretty_print_status(int state, char * snapshot_ref) {
return ORTE_SUCCESS;
}
/*
 * Print the global snapshot reference together with its sequence number
 * on stdout, as a single line of the form:
 *
 *     Snapshot Ref.: <seq> <snapshot_ref>
 *
 * seq          - sequence number, printed right-aligned in a field at
 *                least 3 characters wide.
 * snapshot_ref - snapshot reference string. A NULL pointer is printed as
 *                "(null)" rather than being handed to "%s", which would
 *                be undefined behavior.
 *
 * Always returns ORTE_SUCCESS.
 */
static int pretty_print_reference(int seq, char * snapshot_ref) {
    /* Guard against a missing reference so printf never sees a NULL "%s" argument. */
    const char *ref_to_print = (NULL != snapshot_ref) ? snapshot_ref : "(null)";

    printf("Snapshot Ref.: %3d %s\n",
           seq, ref_to_print);

    return ORTE_SUCCESS;
}