* If we are not able to checkpoint at this time (say, because we are already checkpointing or restarting), then make sure to reset the listener so that we can checkpoint later.
* Work around duplicate node names in the map. This should not happen normally, but if the rmaps component gets this wrong, provide a workaround. Ralph is working on an rmaps fix for this, so we will likely remove/comment out the workaround later. This commit was SVN r25572.
Этот коммит содержится в:
родитель
cc57840b53
Коммит
b5ac320826
@ -403,6 +403,7 @@ static void snapc_none_global_cmdline_request(int status,
|
||||
* Unknown command
|
||||
*/
|
||||
else {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
@ -102,7 +102,7 @@ static void snapc_full_process_request_op_cmd(orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer);
|
||||
|
||||
/*** Command Line Interactions */
|
||||
static orte_process_name_t orte_checkpoint_sender = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
||||
static orte_process_name_t orte_checkpoint_sender;
|
||||
static bool snapc_cmdline_recv_issued = false;
|
||||
static int snapc_full_global_start_cmdline_listener(void);
|
||||
static int snapc_full_global_stop_cmdline_listener(void);
|
||||
@ -212,6 +212,8 @@ int global_coord_init(void)
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
orte_snapc_base_snapshot_seq_number = -1;
|
||||
|
||||
orte_checkpoint_sender = orte_name_invalid;
|
||||
|
||||
SNAPC_FULL_CLEAR_TIMERS();
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -585,6 +587,7 @@ static int global_init_job_structs(void)
|
||||
{
|
||||
orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL;
|
||||
orte_snapc_base_local_snapshot_t *app_snapshot = NULL;
|
||||
opal_list_item_t* orted_item = NULL;
|
||||
orte_node_t *cur_node = NULL;
|
||||
orte_job_map_t *map = NULL;
|
||||
orte_job_t *jdata = NULL;
|
||||
@ -592,6 +595,7 @@ static int global_init_job_structs(void)
|
||||
orte_std_cntr_t i = 0;
|
||||
orte_vpid_t p = 0;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
bool found = false;
|
||||
|
||||
/* look up job data object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) {
|
||||
@ -610,6 +614,32 @@ static int global_init_job_structs(void)
|
||||
|
||||
procs = (orte_proc_t**)cur_node->procs->addr;
|
||||
|
||||
/*
|
||||
* Look out for duplicates
|
||||
* JJH: Should not happen, but does if rmaps get a bug in setting up the map.
|
||||
*/
|
||||
found = false;
|
||||
for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots));
|
||||
orted_item != opal_list_get_end(&(global_snapshot.local_snapshots));
|
||||
orted_item = opal_list_get_next(orted_item) ) {
|
||||
orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item;
|
||||
/*
|
||||
* Is in list?
|
||||
*/
|
||||
if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
||||
&(cur_node->daemon->name),
|
||||
&(orted_snapshot->process_name) )) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( found ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, mca_snapc_full_component.super.output_handle,
|
||||
"Global) [%d] Found Daemon %s with %d procs - Duplicate!! - Should not happen!",
|
||||
i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs));
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||
"Global) [%d] Found Daemon %s with %d procs",
|
||||
i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs));
|
||||
@ -1090,8 +1120,15 @@ static void snapc_full_process_cmdline_request_cmd(int fd, short event, void *cb
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
orte_checkpoint_sender = orte_name_invalid;
|
||||
is_orte_checkpoint_connected = false;
|
||||
|
||||
/* Reset the listener */
|
||||
if( ORTE_SUCCESS != (ret = snapc_full_global_start_cmdline_listener() ) ){
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user