Clean up the output from orte-checkpoint so it is a bit clearer and references
the sequence number. Before: "[...] Finished - Global Snapshot Reference: ompi_global_snapshot_1234.ckpt". After: "Snashot Ref.: 1 ompi_global_snapshot_1234.ckpt". This commit was SVN r14381.
Этот коммит содержится в:
родитель
83ddd7a3c5
Коммит
6ee0c641fd
@ -143,7 +143,7 @@ extern "C" {
|
||||
*******************************/
|
||||
/* Initial handshake with the orte_checkpoint command */
|
||||
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *term, orte_jobid_t *jobid);
|
||||
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status);
|
||||
ORTE_DECLSPEC int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status);
|
||||
|
||||
ORTE_DECLSPEC int orte_snapc_base_global_coord_recv_ack(orte_process_name_t* peer, bool *ack);
|
||||
ORTE_DECLSPEC int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack);
|
||||
|
@ -244,7 +244,7 @@ static void snapc_none_global_recv(int status,
|
||||
/*
|
||||
* Respond with an invalid response
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, NULL, -1, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -454,7 +454,7 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack)
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int ckpt_status)
|
||||
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_buffer_t *loc_buffer = NULL;
|
||||
@ -538,7 +538,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
|
||||
}
|
||||
|
||||
/********************
|
||||
* Send over the global snapshot handle
|
||||
* Send over the global snapshot handle & sequence number
|
||||
********************/
|
||||
if(NULL != loc_buffer) {
|
||||
OBJ_RELEASE(loc_buffer);
|
||||
@ -553,6 +553,10 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &seq_num, 1, ORTE_INT))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
|
@ -278,6 +278,7 @@ snapc_full_global_checkpoint(orte_jobid_t jobid, bool term, char **global_snapsh
|
||||
updated_job_to_running = false;
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
|
||||
global_snapshot.reference_name,
|
||||
global_snapshot.seq_num,
|
||||
ORTE_SNAPC_CKPT_STATE_REQUEST) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -362,6 +363,7 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
|
||||
global_snapshot.reference_name,
|
||||
global_snapshot.seq_num,
|
||||
job_ckpt_state) ) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -751,6 +753,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
|
||||
************************/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
|
||||
global_snapshot.reference_name,
|
||||
global_snapshot.seq_num,
|
||||
ckpt_status)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
|
@ -90,11 +90,12 @@ extern char** environ;
|
||||
static int ckpt_init(int argc, char *argv[]); /* Initalization routine */
|
||||
static int ckpt_finalize(void); /* Finalization routine */
|
||||
static int parse_args(int argc, char *argv[]);
|
||||
static int notify_process_for_checkpoint(char **global_snapshot_handle, int term);
|
||||
static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term);
|
||||
static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term);
|
||||
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status);
|
||||
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status);
|
||||
static int find_universe(void);
|
||||
static int pretty_print_status(int state, char * snapshot_ref);
|
||||
static int pretty_print_reference(int seq, char * snapshot_ref);
|
||||
|
||||
/*****************************************
|
||||
* Global Vars for Command line Arguments
|
||||
@ -163,6 +164,7 @@ main(int argc, char *argv[])
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char *global_snapshot_handle;
|
||||
int seq_num = -1;
|
||||
|
||||
/***************
|
||||
* Initialize
|
||||
@ -202,6 +204,7 @@ main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint(&global_snapshot_handle,
|
||||
&seq_num,
|
||||
orte_checkpoint_globals.term)) ) {
|
||||
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
|
||||
orte_checkpoint_globals.pid);
|
||||
@ -209,8 +212,13 @@ main(int argc, char *argv[])
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(!orte_checkpoint_globals.nowait)
|
||||
if( orte_checkpoint_globals.status ) {
|
||||
pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle);
|
||||
}
|
||||
|
||||
if(!orte_checkpoint_globals.nowait) {
|
||||
pretty_print_reference(seq_num, global_snapshot_handle);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/***************
|
||||
@ -316,7 +324,7 @@ static int parse_args(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
static int
|
||||
notify_process_for_checkpoint(char **global_snapshot_handle, int term)
|
||||
notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_process_name_t peer;
|
||||
@ -347,7 +355,7 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int term)
|
||||
* Wait for progress updates, stop waiting when 'Finished' status
|
||||
*/
|
||||
do {
|
||||
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, &ckpt_status)) ) {
|
||||
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num, &ckpt_status)) ) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -699,7 +707,7 @@ static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle,
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *ckpt_status) {
|
||||
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status) {
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_buffer_t *loc_buffer;
|
||||
orte_std_cntr_t n = 1;
|
||||
@ -784,6 +792,11 @@ static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
n = 1;
|
||||
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, seq_num, &n, ORTE_INT)) ) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* ACK */
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
|
||||
@ -811,3 +824,11 @@ static int pretty_print_status(int state, char * snapshot_ref) {
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pretty_print_reference(int seq, char * snapshot_ref) {
|
||||
|
||||
printf("Snashot Ref.: %3d %s\n",
|
||||
seq, snapshot_ref);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user