1
1

Fix and clean up the checkpoint error propagation through the SnapC Full component.

This commit was SVN r15175.
Этот коммит содержится в:
Josh Hursey 2007-06-22 16:14:25 +00:00
родитель 5528e0ca60
Коммит 84f102c343
4 изменённых файлов: 76 добавлений и 26 удалений

Просмотреть файл

@ -456,7 +456,8 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack)
return exit_status; return exit_status;
} }
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle, int seq_num, int ckpt_status) int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char *global_snapshot_handle,
int seq_num, int ckpt_status)
{ {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
orte_buffer_t *loc_buffer = NULL; orte_buffer_t *loc_buffer = NULL;
@ -1406,6 +1407,10 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
/* Extract the checkpointer */ /* Extract the checkpointer */
crs_comp = opal_crs_base_extract_expected_component(snapshot_location, &prev_pid); crs_comp = opal_crs_base_extract_expected_component(snapshot_location, &prev_pid);
if( NULL == crs_comp ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
/* get the base of the location */ /* get the base of the location */
local_dir = strdup(snapshot_location); local_dir = strdup(snapshot_location);

Просмотреть файл

@ -85,6 +85,8 @@ static orte_snapc_base_global_snapshot_t global_snapshot;
static orte_process_name_t orte_checkpoint_sender; static orte_process_name_t orte_checkpoint_sender;
static bool updated_job_to_running; static bool updated_job_to_running;
static size_t cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
/************************ /************************
* Function Definitions * Function Definitions
************************/ ************************/
@ -348,6 +350,8 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
} }
} }
cur_job_ckpt_state = job_ckpt_state;
if(ORTE_SNAPC_CKPT_STATE_REQUEST == job_ckpt_state ) { if(ORTE_SNAPC_CKPT_STATE_REQUEST == job_ckpt_state ) {
/* /*
* Start the checkpoint, now that we have the jobid * Start the checkpoint, now that we have the jobid
@ -357,7 +361,8 @@ static void job_ckpt_request_callback(orte_gpr_notify_data_t *data, void *cbdata
goto cleanup; goto cleanup;
} }
} }
else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state) { else if( ORTE_SNAPC_CKPT_STATE_FINISHED != job_ckpt_state &&
ORTE_SNAPC_CKPT_STATE_ERROR != job_ckpt_state ) {
/* /*
* Update the orte-checkpoint cmd * Update the orte-checkpoint cmd
*/ */
@ -422,7 +427,8 @@ static void vpid_ckpt_state_callback(orte_gpr_notify_data_t *data, void *cbdata)
vpid_snapshot->crs_snapshot_super.reference_name = strdup(ckpt_ref); vpid_snapshot->crs_snapshot_super.reference_name = strdup(ckpt_ref);
vpid_snapshot->crs_snapshot_super.remote_location = strdup(ckpt_loc); vpid_snapshot->crs_snapshot_super.remote_location = strdup(ckpt_loc);
if(ckpt_state == ORTE_SNAPC_CKPT_STATE_FINISHED) { if(ckpt_state == ORTE_SNAPC_CKPT_STATE_FINISHED ||
ckpt_state == ORTE_SNAPC_CKPT_STATE_ERROR ) {
snapc_full_global_check_for_done(vpid_snapshot->process_name.jobid); snapc_full_global_check_for_done(vpid_snapshot->process_name.jobid);
} }
break; break;
@ -692,7 +698,6 @@ static int snapc_full_global_notify_checkpoint( char * global_snapshot_handle,
static int snapc_full_global_check_for_done(orte_jobid_t jobid) { static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
size_t ckpt_status = ORTE_SNAPC_CKPT_STATE_FINISHED;
opal_list_item_t* item = NULL; opal_list_item_t* item = NULL;
char * global_dir = NULL; char * global_dir = NULL;
bool term_job = false; bool term_job = false;
@ -702,12 +707,16 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
return exit_status; return exit_status;
} }
cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
/********************** /**********************
* Gather all of the files locally * Gather all of the files locally
* Note: We don't need to worry much about the return code here, since the
* rest of the functions know what to do with an error scenario.
**********************/ **********************/
if( ORTE_SUCCESS != (ret = snapc_full_global_gather_all_files()) ) { if( ORTE_SUCCESS != (ret = snapc_full_global_gather_all_files()) ) {
exit_status = ret; exit_status = ret;
goto cleanup; cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_ERROR;
} }
/********************************** /**********************************
@ -716,7 +725,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name); global_dir = orte_snapc_base_get_global_snapshot_directory(global_snapshot.reference_name);
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid, if( ORTE_SUCCESS != (ret = orte_snapc_base_set_job_ckpt_info(jobid,
ORTE_SNAPC_CKPT_STATE_FINISHED, cur_job_ckpt_state,
global_snapshot.reference_name, global_snapshot.reference_name,
global_dir) ) ) { global_dir) ) ) {
exit_status = ret; exit_status = ret;
@ -754,7 +763,7 @@ static int snapc_full_global_check_for_done(orte_jobid_t jobid) {
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender,
global_snapshot.reference_name, global_snapshot.reference_name,
global_snapshot.seq_num, global_snapshot.seq_num,
ckpt_status)) ) { cur_job_ckpt_state)) ) {
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -798,7 +807,8 @@ static bool snapc_full_global_is_done_yet(void) {
vpid_snapshot = (orte_snapc_base_snapshot_t*)item; vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
/* If they are working, then we are not done yet */ /* If they are working, then we are not done yet */
if(ORTE_SNAPC_CKPT_STATE_FINISHED != vpid_snapshot->state) { if(ORTE_SNAPC_CKPT_STATE_FINISHED != vpid_snapshot->state &&
ORTE_SNAPC_CKPT_STATE_ERROR != vpid_snapshot->state ) {
done_yet = false; done_yet = false;
return done_yet; return done_yet;
} }
@ -814,7 +824,6 @@ static int snapc_full_global_gather_all_files(void) {
orte_filem_base_request_t *filem_request = OBJ_NEW(orte_filem_base_request_t); orte_filem_base_request_t *filem_request = OBJ_NEW(orte_filem_base_request_t);
int tmp_argc = 0; int tmp_argc = 0;
/* /*
* If it is stored in place, then we do not need to transfer anything * If it is stored in place, then we do not need to transfer anything
*/ */
@ -831,6 +840,13 @@ static int snapc_full_global_gather_all_files(void) {
"global) Remote Location: (%s)\n", vpid_snapshot->crs_snapshot_super.remote_location); "global) Remote Location: (%s)\n", vpid_snapshot->crs_snapshot_super.remote_location);
opal_output_verbose(20, mca_snapc_full_component.super.output_handle, opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Local Location: (%s)\n", vpid_snapshot->crs_snapshot_super.local_location); "global) Local Location: (%s)\n", vpid_snapshot->crs_snapshot_super.local_location);
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Status: (%d)\n", (int)vpid_snapshot->state);
if( ORTE_SNAPC_CKPT_STATE_ERROR == vpid_snapshot->state ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
/* /*
* Update the metadata file * Update the metadata file
@ -874,6 +890,13 @@ static int snapc_full_global_gather_all_files(void) {
"global) Remote Location: (%s)\n", vpid_snapshot->crs_snapshot_super.remote_location); "global) Remote Location: (%s)\n", vpid_snapshot->crs_snapshot_super.remote_location);
opal_output_verbose(20, mca_snapc_full_component.super.output_handle, opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Local Location: (%s)\n", vpid_snapshot->crs_snapshot_super.local_location); "global) Local Location: (%s)\n", vpid_snapshot->crs_snapshot_super.local_location);
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
"global) Status: (%d)\n", (int)vpid_snapshot->state);
if( ORTE_SNAPC_CKPT_STATE_ERROR == vpid_snapshot->state ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
/* /*
* Construct the process information * Construct the process information

Просмотреть файл

@ -407,7 +407,8 @@ static void snapc_full_local_vpid_state_callback(orte_gpr_notify_data_t *data, v
/* /*
* This process has finished their checkpoint, see if we are done yet * This process has finished their checkpoint, see if we are done yet
*/ */
if( ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ) { if( ORTE_SNAPC_CKPT_STATE_FINISHED == ckpt_state ||
ORTE_SNAPC_CKPT_STATE_ERROR == ckpt_state ) {
if(local_checkpoint_finished()) { if(local_checkpoint_finished()) {
/* /*
* Currently we don't need to do anything when done * Currently we don't need to do anything when done
@ -904,7 +905,8 @@ static bool local_checkpoint_finished(void)
vpid_snapshot = (orte_snapc_base_snapshot_t*)item; vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
/* Searching for any vpid's that have not completed */ /* Searching for any vpid's that have not completed */
if(ORTE_SNAPC_CKPT_STATE_FINISHED != vpid_snapshot->state) { if(ORTE_SNAPC_CKPT_STATE_FINISHED != vpid_snapshot->state &&
ORTE_SNAPC_CKPT_STATE_ERROR != vpid_snapshot->state ) {
is_done = false; is_done = false;
break; break;
} }
@ -919,6 +921,14 @@ static void snapc_full_local_wait_ckpt_cb(pid_t pid, int status, void* cbdata)
opal_list_item_t* item = NULL; opal_list_item_t* item = NULL;
orte_snapc_base_snapshot_t *vpid_snapshot = NULL; orte_snapc_base_snapshot_t *vpid_snapshot = NULL;
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
size_t loc_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
if( status == OPAL_SUCCESS ) {
loc_state = ORTE_SNAPC_CKPT_STATE_FINISHED;
}
else {
loc_state = ORTE_SNAPC_CKPT_STATE_ERROR;
}
/* /*
* Find the process in the list * Find the process in the list
@ -930,7 +940,7 @@ static void snapc_full_local_wait_ckpt_cb(pid_t pid, int status, void* cbdata)
if( 0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, proc_name, &vpid_snapshot->process_name) ) { if( 0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, proc_name, &vpid_snapshot->process_name) ) {
/* Update it's state */ /* Update it's state */
vpid_snapshot->state = ORTE_SNAPC_CKPT_STATE_FINISHED; vpid_snapshot->state = loc_state;
break; break;
} }
} }
@ -940,7 +950,7 @@ static void snapc_full_local_wait_ckpt_cb(pid_t pid, int status, void* cbdata)
* Update our status information * Update our status information
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc_base_set_vpid_ckpt_info( vpid_snapshot->process_name, if( ORTE_SUCCESS != (ret = orte_snapc_base_set_vpid_ckpt_info( vpid_snapshot->process_name,
ORTE_SNAPC_CKPT_STATE_FINISHED, loc_state,
vpid_snapshot->crs_snapshot_super.reference_name, vpid_snapshot->crs_snapshot_super.reference_name,
vpid_snapshot->crs_snapshot_super.local_location ) ) ) { vpid_snapshot->crs_snapshot_super.local_location ) ) ) {
exit_status = ret; exit_status = ret;

Просмотреть файл

@ -87,7 +87,7 @@ static int ckpt_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]); static int parse_args(int argc, char *argv[]);
static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term); static int notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int term);
static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term); static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle, int term);
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status); static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num);
static int find_universe(void); static int find_universe(void);
static int pretty_print_status(int state, char * snapshot_ref); static int pretty_print_status(int state, char * snapshot_ref);
static int pretty_print_reference(int seq, char * snapshot_ref); static int pretty_print_reference(int seq, char * snapshot_ref);
@ -115,6 +115,7 @@ typedef struct {
bool nowait; /* Do not wait for checkpoint to complete before returning */ bool nowait; /* Do not wait for checkpoint to complete before returning */
bool status; /* Display status messages while checkpoint is progressing */ bool status; /* Display status messages while checkpoint is progressing */
int output; int output;
int ckpt_status;
} orte_checkpoint_globals_t; } orte_checkpoint_globals_t;
orte_checkpoint_globals_t orte_checkpoint_globals; orte_checkpoint_globals_t orte_checkpoint_globals;
@ -217,6 +218,13 @@ main(int argc, char *argv[])
goto cleanup; goto cleanup;
} }
if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) {
opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
orte_checkpoint_globals.pid, ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
if( orte_checkpoint_globals.status ) { if( orte_checkpoint_globals.status ) {
pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle); pretty_print_status(ORTE_SNAPC_CKPT_STATE_FINISHED, global_snapshot_handle);
} }
@ -252,6 +260,7 @@ static int parse_args(int argc, char *argv[]) {
orte_checkpoint_globals.nowait = false; orte_checkpoint_globals.nowait = false;
orte_checkpoint_globals.status = false; orte_checkpoint_globals.status = false;
orte_checkpoint_globals.output = -1; orte_checkpoint_globals.output = -1;
orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
/* Parse the command line options */ /* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts); opal_cmd_line_create(&cmd_line, cmd_line_opts);
@ -333,7 +342,6 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int t
{ {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
orte_process_name_t peer; orte_process_name_t peer;
int ckpt_status = ORTE_SUCCESS;
peer = *ORTE_PROC_MY_HNP; peer = *ORTE_PROC_MY_HNP;
@ -360,7 +368,7 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int t
* Wait for progress updates, stop waiting when 'Finished' status * Wait for progress updates, stop waiting when 'Finished' status
*/ */
do { do {
if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num, &ckpt_status)) ) { if( ORTE_SUCCESS != (ret = wait_for_checkpoint(&peer, global_snapshot_handle, seq_num) ) ) {
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -369,7 +377,7 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int t
* If process said that it cannot checkpoint at this time return a * If process said that it cannot checkpoint at this time return a
* pretty message. * pretty message.
*/ */
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == ckpt_status ) { if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ) {
opal_show_help("help-orte-checkpoint.txt", "non-ckptable", opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
true, true,
orte_checkpoint_globals.pid); orte_checkpoint_globals.pid);
@ -380,8 +388,8 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int t
* If we are to display the status progression * If we are to display the status progression
*/ */
if( orte_checkpoint_globals.status ) { if( orte_checkpoint_globals.status ) {
if(ORTE_SNAPC_CKPT_STATE_FINISHED != ckpt_status) if(ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status)
pretty_print_status(ckpt_status, *global_snapshot_handle); pretty_print_status(orte_checkpoint_globals.ckpt_status, *global_snapshot_handle);
} }
/* /*
* Otherwise only display it if we are going to be terminated soon * Otherwise only display it if we are going to be terminated soon
@ -391,11 +399,12 @@ notify_process_for_checkpoint(char **global_snapshot_handle, int *seq_num, int t
* print out the global snapshot handle when we start running * print out the global snapshot handle when we start running
*/ */
if(orte_checkpoint_globals.term && if(orte_checkpoint_globals.term &&
ORTE_SNAPC_CKPT_STATE_RUNNING == ckpt_status ) { ORTE_SNAPC_CKPT_STATE_RUNNING == orte_checkpoint_globals.ckpt_status ) {
pretty_print_status(ckpt_status, *global_snapshot_handle); pretty_print_status(orte_checkpoint_globals.ckpt_status, *global_snapshot_handle);
} }
} }
} while(ORTE_SNAPC_CKPT_STATE_FINISHED != ckpt_status ); } while(ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status &&
ORTE_SNAPC_CKPT_STATE_ERROR != orte_checkpoint_globals.ckpt_status );
} }
cleanup: cleanup:
@ -729,11 +738,12 @@ static int contact_hnp(orte_process_name_t *peer, char *global_snapshot_handle,
return exit_status; return exit_status;
} }
static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num, int *ckpt_status) { static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot_handle, int *seq_num) {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
orte_buffer_t *loc_buffer; orte_buffer_t *loc_buffer;
orte_std_cntr_t n = 1; orte_std_cntr_t n = 1;
size_t str_len = 0; size_t str_len = 0;
int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) { if (NULL == (loc_buffer = OBJ_NEW(orte_buffer_t))) {
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
@ -754,11 +764,13 @@ static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot
goto cleanup; goto cleanup;
} }
n = 1; n = 1;
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, ckpt_status, &n, ORTE_INT)) ) { if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, &ckpt_status, &n, ORTE_INT)) ) {
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
orte_checkpoint_globals.ckpt_status = ckpt_status;
/* ACK */ /* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) { if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_send_ack(peer, true)) ) {
exit_status = ret; exit_status = ret;
@ -766,7 +778,7 @@ static int wait_for_checkpoint(orte_process_name_t *peer, char **global_snapshot
} }
/* If we cannot checkpoint, then just skip to the end */ /* If we cannot checkpoint, then just skip to the end */
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == *ckpt_status) { if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == ckpt_status) {
*global_snapshot_handle = NULL; *global_snapshot_handle = NULL;
goto cleanup; goto cleanup;
} }