Fixes Ticket #125 for both the trunk and v1.1 branch.
This commit will apply cleanly to the v1.1 branch, and should be moved over once I get someone to verify it. The problem is outlined in the bug. The fix was to move the setting of the app context index (idx) before we put it in the GPR so that it is propogated to the gpr. The reason this hasn't bitten us before is because we init app->idx to 0, which is true most of the time. Except that is when MPI_Comm_spawn_multiple in which we put in more than one app context, thus care about correct indexing. This was causing down the line memory corruption by overrunning the mapping array. This commit also puts in a check to make sure that we error out if we ever try to do that again. This commit was SVN r10380.
Этот коммит содержится в:
родитель
ad1065d572
Коммит
58110f9fc9
@ -48,8 +48,10 @@ static void orte_rmaps_base_node_construct(orte_rmaps_base_node_t* node)
|
||||
static void orte_rmaps_base_node_destruct(orte_rmaps_base_node_t* node)
|
||||
{
|
||||
opal_list_item_t* item;
|
||||
if(NULL != node->node)
|
||||
if(NULL != node->node) {
|
||||
OBJ_RELEASE(node->node);
|
||||
node->node = NULL;
|
||||
}
|
||||
while(NULL != (item = opal_list_remove_first(&node->node_procs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
@ -76,7 +78,10 @@ static void orte_rmaps_base_proc_construct(orte_rmaps_base_proc_t* proc)
|
||||
|
||||
static void orte_rmaps_base_proc_destruct(orte_rmaps_base_proc_t* proc)
|
||||
{
|
||||
if (NULL != proc->app) free(proc->app);
|
||||
if (NULL != proc->app) {
|
||||
free(proc->app);
|
||||
proc->app = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
@ -110,9 +115,11 @@ static void orte_rmaps_base_map_destruct(orte_rmaps_base_map_t* map)
|
||||
OBJ_RELEASE(item);
|
||||
if(NULL != map->procs) {
|
||||
free(map->procs);
|
||||
map->procs = NULL;
|
||||
}
|
||||
if(NULL != map->app) {
|
||||
OBJ_RELEASE(map->app);
|
||||
map->app = NULL;
|
||||
}
|
||||
OBJ_DESTRUCT(&map->nodes);
|
||||
}
|
||||
@ -450,9 +457,20 @@ int orte_rmaps_base_get_map(orte_jobid_t jobid, opal_list_t* mapping_list)
|
||||
OBJ_RELEASE(proc);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* This seems like a dummy check, but it ensures that we fail
|
||||
* rather than overrun our array. This can happen if the
|
||||
* indicies on the app schemas are incorrect
|
||||
*/
|
||||
if(map->num_procs < map->app->num_procs) {
|
||||
map->procs[map->num_procs++] = proc;
|
||||
proc->proc_node = orte_rmaps_lookup_node(&map->nodes, &nodes, node_name, proc);
|
||||
}
|
||||
else {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* cleanup any nodes allocated and not mapped */
|
||||
while(NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
|
@ -78,6 +78,8 @@ int orte_rmgr_base_put_app_context(
|
||||
|
||||
for(i=0; i<num_context; i++) {
|
||||
orte_app_context_t* app = app_context[i];
|
||||
app->idx = i;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[i]),
|
||||
ORTE_JOB_APP_CONTEXT_KEY,
|
||||
ORTE_APP_CONTEXT,
|
||||
@ -85,8 +87,8 @@ int orte_rmgr_base_put_app_context(
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
OBJ_RETAIN(app);
|
||||
app->idx = i;
|
||||
job_slots += app->num_procs;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user