From 58110f9fc96faddcab1760c4ff2e4df56b3f7934 Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Thu, 15 Jun 2006 22:14:07 +0000 Subject: [PATCH] Fixes Ticket #125 for both the trunk and v1.1 branch. This commit will apply cleanly to the v1.1 branch, and should be moved over once I get someone to verify it. The problem is outlined in the bug. The fix was to move the setting of the app context index (idx) before we put it in the GPR so that it is propogated to the gpr. The reason this hasn't bitten us before is because we init app->idx to 0, which is true most of the time. Except that is when MPI_Comm_spawn_multiple in which we put in more than one app context, thus care about correct indexing. This was causing down the line memory corruption by overrunning the mapping array. This commit also puts in a check to make sure that we error out if we ever try to do that again. This commit was SVN r10380. --- orte/mca/rmaps/base/rmaps_base_map.c | 26 ++++++++++++++++++++++---- orte/mca/rmgr/base/rmgr_base_context.c | 4 +++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_map.c b/orte/mca/rmaps/base/rmaps_base_map.c index df45d24479..73a09b13da 100644 --- a/orte/mca/rmaps/base/rmaps_base_map.c +++ b/orte/mca/rmaps/base/rmaps_base_map.c @@ -48,8 +48,10 @@ static void orte_rmaps_base_node_construct(orte_rmaps_base_node_t* node) static void orte_rmaps_base_node_destruct(orte_rmaps_base_node_t* node) { opal_list_item_t* item; - if(NULL != node->node) + if(NULL != node->node) { OBJ_RELEASE(node->node); + node->node = NULL; + } while(NULL != (item = opal_list_remove_first(&node->node_procs))) { OBJ_RELEASE(item); } @@ -76,7 +78,10 @@ static void orte_rmaps_base_proc_construct(orte_rmaps_base_proc_t* proc) static void orte_rmaps_base_proc_destruct(orte_rmaps_base_proc_t* proc) { - if (NULL != proc->app) free(proc->app); + if (NULL != proc->app) { + free(proc->app); + proc->app = NULL; + } } OBJ_CLASS_INSTANCE( @@ -110,9 +115,11 @@ static void orte_rmaps_base_map_destruct(orte_rmaps_base_map_t* map) OBJ_RELEASE(item); if(NULL != map->procs) { free(map->procs); + map->procs = NULL; } if(NULL != map->app) { OBJ_RELEASE(map->app); + map->app = NULL; } OBJ_DESTRUCT(&map->nodes); } @@ -450,8 +457,19 @@ int orte_rmaps_base_get_map(orte_jobid_t jobid, opal_list_t* mapping_list) OBJ_RELEASE(proc); continue; } - map->procs[map->num_procs++] = proc; - proc->proc_node = orte_rmaps_lookup_node(&map->nodes, &nodes, node_name, proc); + /* + * This seems like a dummy check, but it ensures that we fail + * rather than overrun our array. This can happen if the + * indicies on the app schemas are incorrect + */ + if(map->num_procs < map->app->num_procs) { + map->procs[map->num_procs++] = proc; + proc->proc_node = orte_rmaps_lookup_node(&map->nodes, &nodes, node_name, proc); + } + else { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + goto cleanup; + } } /* cleanup any nodes allocated and not mapped */ diff --git a/orte/mca/rmgr/base/rmgr_base_context.c b/orte/mca/rmgr/base/rmgr_base_context.c index e43ac8b706..8f4024db77 100644 --- a/orte/mca/rmgr/base/rmgr_base_context.c +++ b/orte/mca/rmgr/base/rmgr_base_context.c @@ -78,6 +78,8 @@ int orte_rmgr_base_put_app_context( for(i=0; iidx = i; + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[i]), ORTE_JOB_APP_CONTEXT_KEY, ORTE_APP_CONTEXT, @@ -85,8 +87,8 @@ int orte_rmgr_base_put_app_context( ORTE_ERROR_LOG(rc); goto cleanup; } + OBJ_RETAIN(app); - app->idx = i; job_slots += app->num_procs; }