1
1

Fixes Ticket #125 for both the trunk and v1.1 branch.

This commit will apply cleanly to the v1.1 branch, and should
be moved over once I get someone to verify it.

The problem is outlined in the bug. The fix was to move the
setting of the app context index (idx) before we put it in the
GPR so that it is propagated to the GPR.

The reason this hasn't bitten us before is that we init
app->idx to 0, which is correct most of the time. The exception
is MPI_Comm_spawn_multiple, in which we put in more than
one app context and thus care about correct indexing.

This was causing memory corruption down the line by overrunning
the mapping array. This commit also puts in a check to make
sure that we error out if we ever try to do that again.

This commit was SVN r10380.
Этот коммит содержится в:
Josh Hursey 2006-06-15 22:14:07 +00:00
родитель ad1065d572
Коммит 58110f9fc9
2 изменённых файлов: 25 добавлений и 5 удалений

Просмотреть файл

@ -48,8 +48,10 @@ static void orte_rmaps_base_node_construct(orte_rmaps_base_node_t* node)
static void orte_rmaps_base_node_destruct(orte_rmaps_base_node_t* node)
{
opal_list_item_t* item;
if(NULL != node->node)
if(NULL != node->node) {
OBJ_RELEASE(node->node);
node->node = NULL;
}
while(NULL != (item = opal_list_remove_first(&node->node_procs))) {
OBJ_RELEASE(item);
}
@ -76,7 +78,10 @@ static void orte_rmaps_base_proc_construct(orte_rmaps_base_proc_t* proc)
static void orte_rmaps_base_proc_destruct(orte_rmaps_base_proc_t* proc)
{
if (NULL != proc->app) free(proc->app);
if (NULL != proc->app) {
free(proc->app);
proc->app = NULL;
}
}
OBJ_CLASS_INSTANCE(
@ -110,9 +115,11 @@ static void orte_rmaps_base_map_destruct(orte_rmaps_base_map_t* map)
OBJ_RELEASE(item);
if(NULL != map->procs) {
free(map->procs);
map->procs = NULL;
}
if(NULL != map->app) {
OBJ_RELEASE(map->app);
map->app = NULL;
}
OBJ_DESTRUCT(&map->nodes);
}
@ -450,9 +457,20 @@ int orte_rmaps_base_get_map(orte_jobid_t jobid, opal_list_t* mapping_list)
OBJ_RELEASE(proc);
continue;
}
/*
* This seems like a dummy check, but it ensures that we fail
* rather than overrun our array. This can happen if the
* indices on the app schemas are incorrect
*/
if(map->num_procs < map->app->num_procs) {
map->procs[map->num_procs++] = proc;
proc->proc_node = orte_rmaps_lookup_node(&map->nodes, &nodes, node_name, proc);
}
else {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto cleanup;
}
}
/* cleanup any nodes allocated and not mapped */
while(NULL != (item = opal_list_remove_first(&nodes))) {

Просмотреть файл

@ -78,6 +78,8 @@ int orte_rmgr_base_put_app_context(
for(i=0; i<num_context; i++) {
orte_app_context_t* app = app_context[i];
app->idx = i;
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[i]),
ORTE_JOB_APP_CONTEXT_KEY,
ORTE_APP_CONTEXT,
@ -85,8 +87,8 @@ int orte_rmgr_base_put_app_context(
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OBJ_RETAIN(app);
app->idx = i;
job_slots += app->num_procs;
}