Some checkpoint/restart fixes in response to r18338 (changes in modex).
Things should be working now.

This commit was SVN r18348.

The following SVN revision numbers were found above:
  r18338 --> open-mpi/ompi@3e55fe6f6d
Этот коммит содержится в:
родитель
7154b232bb
Коммит
dcd21d7d07
@ -536,17 +536,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
#if 0
|
||||
/*
|
||||
* Fill in remote proc information
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = ompi_proc_get_info())) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed ompi_proc_get_info() = %d",
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
* Add the new procs (BTLs redo modex recv's)
|
||||
|
@ -278,6 +278,9 @@ int ompi_proc_refresh(void) {
|
||||
item = opal_list_get_next(item), ++i ) {
|
||||
proc = (ompi_proc_t*)item;
|
||||
|
||||
/* Does not change: proc->proc_name.vpid */
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||
ompi_proc_local_proc = proc;
|
||||
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
|
||||
|
39
orte/mca/ess/env/ess_env_module.c
поставляемый
39
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -109,7 +109,7 @@ orte_ess_base_module_t orte_ess_env_module = {
|
||||
};
|
||||
|
||||
static opal_pointer_array_t nidmap;
|
||||
-static orte_pmap_t *pmap;
|
||||
+static orte_pmap_t *pmap = NULL;
|
||||
static orte_vpid_t nprocs;
|
||||
|
||||
static int rte_init(char flags)
|
||||
@ -198,11 +198,13 @@ static int rte_finalize(void)
|
||||
}
|
||||
if (NULL != nids[i]->name) {
|
||||
free(nids[i]->name);
|
||||
nids[i]->name = NULL;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&nidmap);
|
||||
free(pmap);
|
||||
|
||||
pmap = NULL;
|
||||
|
||||
/* use the default procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -341,6 +343,8 @@ static int rte_ft_event(int state)
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char * procid_str = NULL;
|
||||
char * jobid_str = NULL;
|
||||
orte_nid_t **nids = NULL;
|
||||
int32_t i;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
@ -416,6 +420,23 @@ static int rte_ft_event(int state)
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
|
||||
/*
|
||||
* Clear nidmap
|
||||
*/
|
||||
nids = (orte_nid_t**)nidmap.addr;
|
||||
for (i=0; i < nidmap.size; i++) {
|
||||
if (NULL == nids[i]) {
|
||||
break;
|
||||
}
|
||||
if (NULL != nids[i]->name) {
|
||||
free(nids[i]->name);
|
||||
nids[i]->name = NULL;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&nidmap);
|
||||
free(pmap);
|
||||
pmap = NULL;
|
||||
|
||||
/*
|
||||
* - Reset Contact information
|
||||
*/
|
||||
@ -541,6 +562,20 @@ static int rte_ft_event(int state)
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Refresh nidmap structure
|
||||
*/
|
||||
OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8);
|
||||
|
||||
/* if one was provided, build my nidmap */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf,
|
||||
&nidmap, &pmap, &nprocs))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
/* Nothing */
|
||||
|
@ -208,6 +208,9 @@ int orte_proc_info_finalize(void)
|
||||
orte_process_info.singleton = false;
|
||||
orte_process_info.daemon = false;
|
||||
|
||||
OBJ_RELEASE(orte_process_info.sync_buf);
|
||||
orte_process_info.sync_buf = NULL;
|
||||
|
||||
init = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user