1
1

Checkpoint/restart fixes in response to the modex changes made in r18338.

Checkpoint/restart should be working again now.

This commit was SVN r18348.

The following SVN revision numbers were found above:
  r18338 --> open-mpi/ompi@3e55fe6f6d
Этот коммит содержится в:
Josh Hursey 2008-05-01 17:48:13 +00:00
родитель 7154b232bb
Коммит dcd21d7d07
4 изменённых файлов: 44 добавлений и 13 удалений

Просмотреть файл

@ -536,17 +536,7 @@ int mca_pml_ob1_ft_event( int state )
ret);
return ret;
}
#if 0
/*
* Fill in remote proc information
*/
if (OMPI_SUCCESS != (ret = ompi_proc_get_info())) {
opal_output(0,
"pml:ob1: ft_event(Restart): Failed ompi_proc_get_info() = %d",
ret);
return ret;
}
#endif
/*
* Startup the PML stack now that the modex is running again
* Add the new procs (BTLs redo modex recv's)

Просмотреть файл

@ -278,6 +278,9 @@ int ompi_proc_refresh(void) {
item = opal_list_get_next(item), ++i ) {
proc = (ompi_proc_t*)item;
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
if (i == ORTE_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;

39
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -109,7 +109,7 @@ orte_ess_base_module_t orte_ess_env_module = {
};
static opal_pointer_array_t nidmap;
static orte_pmap_t *pmap;
static orte_pmap_t *pmap = NULL;
static orte_vpid_t nprocs;
static int rte_init(char flags)
@ -198,11 +198,13 @@ static int rte_finalize(void)
}
if (NULL != nids[i]->name) {
free(nids[i]->name);
nids[i]->name = NULL;
}
}
OBJ_DESTRUCT(&nidmap);
free(pmap);
pmap = NULL;
/* use the default procedure to finish */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
@ -341,6 +343,8 @@ static int rte_ft_event(int state)
int ret, exit_status = ORTE_SUCCESS;
char * procid_str = NULL;
char * jobid_str = NULL;
orte_nid_t **nids = NULL;
int32_t i;
/******** Checkpoint Prep ********/
if(OPAL_CRS_CHECKPOINT == state) {
@ -416,6 +420,23 @@ static int rte_ft_event(int state)
* This should follow the ess init() function
*/
/*
* Clear nidmap
*/
nids = (orte_nid_t**)nidmap.addr;
for (i=0; i < nidmap.size; i++) {
if (NULL == nids[i]) {
break;
}
if (NULL != nids[i]->name) {
free(nids[i]->name);
nids[i]->name = NULL;
}
}
OBJ_DESTRUCT(&nidmap);
free(pmap);
pmap = NULL;
/*
* - Reset Contact information
*/
@ -541,6 +562,20 @@ static int rte_ft_event(int state)
exit_status = ret;
goto cleanup;
}
/*
* Refresh nidmap structure
*/
OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t);
opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8);
/* if one was provided, build my nidmap */
if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf,
&nidmap, &pmap, &nprocs))) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
}
else if (OPAL_CRS_TERM == state ) {
/* Nothing */

Просмотреть файл

@ -208,6 +208,9 @@ int orte_proc_info_finalize(void)
orte_process_info.singleton = false;
orte_process_info.daemon = false;
OBJ_RELEASE(orte_process_info.sync_buf);
orte_process_info.sync_buf = NULL;
init = false;
return ORTE_SUCCESS;
}