coll/ml: fix leaks in coll/ml resources
This patch fixes two leaks: - Fix typo in fallback collective code that caused coll/ml to retain the ibcast module twice but only release it once. One of those ibcast saves was supposed to be bcast. - Do not check for module initialization in the module destructor. It is possible to destruct a module that is partially setup. cmr=v1.8:reviewer=manjugv This commit was SVN r31187.
Этот коммит содержится в:
родитель
c49a5d1e12
Коммит
08bbdcbf61
@ -1,6 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -151,7 +154,6 @@ int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module)
|
||||
/* NOTE: as part of initialization each routine needs to make sure that
|
||||
* the module element max_dag_size is set large enough - space for
|
||||
* tracking collective progress is allocated based on this value. */
|
||||
OBJ_CONSTRUCT(&(ml_module->coll_ml_collective_descriptors), ompi_free_list_t);
|
||||
|
||||
/* figure out what the size of the ml buffer is */
|
||||
ml_per_proc_buffer_size=ml_module->payload_block->size_buffer;
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -138,8 +138,13 @@ mca_coll_ml_module_construct(mca_coll_ml_module_t *module)
|
||||
module->small_message_thresholds[coll_i] = BCOL_THRESHOLD_UNLIMITED;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&(module->active_bcols_list), opal_list_t);
|
||||
OBJ_CONSTRUCT(&(module->waiting_for_memory_list), opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->active_bcols_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->waiting_for_memory_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->fragment_descriptors, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&module->message_descriptors, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&module->coll_ml_collective_descriptors, ompi_free_list_t);
|
||||
|
||||
memset (&module->fallback, 0, sizeof (module->fallback));
|
||||
}
|
||||
|
||||
#define ML_RELEASE_FALLBACK(_coll_ml, _coll) \
|
||||
@ -158,81 +163,79 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
|
||||
|
||||
ML_VERBOSE(4, ("ML module destruct"));
|
||||
|
||||
if (module->initialized) {
|
||||
for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) {
|
||||
topo = &module->topo_list[index_topo];
|
||||
if (COLL_ML_TOPO_DISABLED == topo->status) {
|
||||
/* skip the topology */
|
||||
continue;
|
||||
}
|
||||
for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) {
|
||||
topo = &module->topo_list[index_topo];
|
||||
if (COLL_ML_TOPO_DISABLED == topo->status) {
|
||||
/* skip the topology */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (NULL != topo->component_pairs) {
|
||||
for(i = 0; i < topo->n_levels; ++i) {
|
||||
for(j = 0; j < topo->component_pairs[i].num_bcol_modules; ++j) {
|
||||
OBJ_RELEASE(topo->component_pairs[i].bcol_modules[j]);
|
||||
}
|
||||
/* free the array of bcol module */
|
||||
free(topo->component_pairs[i].bcol_modules);
|
||||
|
||||
OBJ_RELEASE(topo->component_pairs[i].subgroup_module);
|
||||
if (NULL != topo->component_pairs) {
|
||||
for(i = 0; i < topo->n_levels; ++i) {
|
||||
for(j = 0; j < topo->component_pairs[i].num_bcol_modules; ++j) {
|
||||
OBJ_RELEASE(topo->component_pairs[i].bcol_modules[j]);
|
||||
}
|
||||
/* free the array of bcol module */
|
||||
free(topo->component_pairs[i].bcol_modules);
|
||||
|
||||
free(topo->component_pairs);
|
||||
OBJ_RELEASE(topo->component_pairs[i].subgroup_module);
|
||||
}
|
||||
|
||||
/* gvm Leak FIX Free collective algorithms structure */
|
||||
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
|
||||
if (NULL != topo->hierarchical_algorithms[fnc]){
|
||||
free(topo->hierarchical_algorithms[fnc]);
|
||||
}
|
||||
}
|
||||
free(topo->component_pairs);
|
||||
}
|
||||
|
||||
/* free up the route vector memory */
|
||||
if (NULL != topo->route_vector) {
|
||||
free(topo->route_vector);
|
||||
}
|
||||
/* free resrouce description */
|
||||
if(NULL != topo->array_of_all_subgroups) {
|
||||
for( k=0 ; k < topo->number_of_all_subgroups ; k++ ) {
|
||||
if(0 < topo->array_of_all_subgroups[k].n_ranks) {
|
||||
free(topo->array_of_all_subgroups[k].rank_data);
|
||||
topo->array_of_all_subgroups[k].rank_data = NULL;
|
||||
}
|
||||
}
|
||||
free(topo->array_of_all_subgroups);
|
||||
topo->array_of_all_subgroups = NULL;
|
||||
/* gvm Leak FIX Free collective algorithms structure */
|
||||
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
|
||||
if (NULL != topo->hierarchical_algorithms[fnc]){
|
||||
free(topo->hierarchical_algorithms[fnc]);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&(module->active_bcols_list));
|
||||
OBJ_DESTRUCT(&(module->waiting_for_memory_list));
|
||||
|
||||
/* gvm Leak FIX Remove fragment free list */
|
||||
OBJ_DESTRUCT(&(module->fragment_descriptors));
|
||||
OBJ_DESTRUCT(&(module->message_descriptors));
|
||||
/* push mca_bcol_base_memory_block_desc_t back on list manager */
|
||||
mca_coll_ml_free_block(module->payload_block);
|
||||
/* release the cinvertor if it was allocated */
|
||||
if (NULL != module->reference_convertor) {
|
||||
OBJ_RELEASE(module->reference_convertor);
|
||||
/* free up the route vector memory */
|
||||
if (NULL != topo->route_vector) {
|
||||
free(topo->route_vector);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&(module->coll_ml_collective_descriptors));
|
||||
|
||||
if (NULL != module->coll_ml_barrier_function) {
|
||||
free(module->coll_ml_barrier_function);
|
||||
/* free resrouce description */
|
||||
if(NULL != topo->array_of_all_subgroups) {
|
||||
for( k=0 ; k < topo->number_of_all_subgroups ; k++ ) {
|
||||
if(0 < topo->array_of_all_subgroups[k].n_ranks) {
|
||||
free(topo->array_of_all_subgroups[k].rank_data);
|
||||
topo->array_of_all_subgroups[k].rank_data = NULL;
|
||||
}
|
||||
}
|
||||
free(topo->array_of_all_subgroups);
|
||||
topo->array_of_all_subgroups = NULL;
|
||||
}
|
||||
|
||||
/* release saved collectives */
|
||||
ML_RELEASE_FALLBACK(module, allreduce);
|
||||
ML_RELEASE_FALLBACK(module, allgather);
|
||||
ML_RELEASE_FALLBACK(module, reduce);
|
||||
ML_RELEASE_FALLBACK(module, ibcast);
|
||||
ML_RELEASE_FALLBACK(module, iallreduce);
|
||||
ML_RELEASE_FALLBACK(module, iallgather);
|
||||
ML_RELEASE_FALLBACK(module, ireduce);
|
||||
ML_RELEASE_FALLBACK(module, ibcast);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&(module->active_bcols_list));
|
||||
OBJ_DESTRUCT(&(module->waiting_for_memory_list));
|
||||
|
||||
/* gvm Leak FIX Remove fragment free list */
|
||||
OBJ_DESTRUCT(&(module->fragment_descriptors));
|
||||
OBJ_DESTRUCT(&(module->message_descriptors));
|
||||
/* push mca_bcol_base_memory_block_desc_t back on list manager */
|
||||
mca_coll_ml_free_block(module->payload_block);
|
||||
/* release the cinvertor if it was allocated */
|
||||
if (NULL != module->reference_convertor) {
|
||||
OBJ_RELEASE(module->reference_convertor);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&(module->coll_ml_collective_descriptors));
|
||||
|
||||
if (NULL != module->coll_ml_barrier_function) {
|
||||
free(module->coll_ml_barrier_function);
|
||||
}
|
||||
|
||||
/* release saved collectives */
|
||||
ML_RELEASE_FALLBACK(module, allreduce);
|
||||
ML_RELEASE_FALLBACK(module, allgather);
|
||||
ML_RELEASE_FALLBACK(module, reduce);
|
||||
ML_RELEASE_FALLBACK(module, bcast);
|
||||
ML_RELEASE_FALLBACK(module, iallreduce);
|
||||
ML_RELEASE_FALLBACK(module, iallgather);
|
||||
ML_RELEASE_FALLBACK(module, ireduce);
|
||||
ML_RELEASE_FALLBACK(module, ibcast);
|
||||
}
|
||||
|
||||
|
||||
@ -2658,8 +2661,6 @@ static int init_lists(mca_coll_ml_module_t *ml_module)
|
||||
/* initialize full message descriptors - moving this to the
|
||||
* module, as the fragment has resrouce requirements that
|
||||
* are communicator dependent */
|
||||
OBJ_CONSTRUCT(&(ml_module->message_descriptors), ompi_free_list_t);
|
||||
|
||||
/* no data associated with the message descriptor */
|
||||
|
||||
length = sizeof(mca_coll_ml_descriptor_t);
|
||||
@ -2679,8 +2680,6 @@ static int init_lists(mca_coll_ml_module_t *ml_module)
|
||||
* small message latency */
|
||||
|
||||
/* create a free list of fragment descriptors */
|
||||
OBJ_CONSTRUCT(&(ml_module->fragment_descriptors), ompi_free_list_t);
|
||||
|
||||
/*length_payload=sizeof(something);*/
|
||||
length = sizeof(mca_coll_ml_fragment_t);
|
||||
ret = ompi_free_list_init_ex_new(&(ml_module->fragment_descriptors), length,
|
||||
@ -3132,7 +3131,7 @@ static void ml_save_fallback_colls (mca_coll_ml_module_t *coll_ml,
|
||||
ML_SAVE_FALLBACK(coll_ml, allreduce);
|
||||
ML_SAVE_FALLBACK(coll_ml, allgather);
|
||||
ML_SAVE_FALLBACK(coll_ml, reduce);
|
||||
ML_SAVE_FALLBACK(coll_ml, ibcast);
|
||||
ML_SAVE_FALLBACK(coll_ml, bcast);
|
||||
ML_SAVE_FALLBACK(coll_ml, iallreduce);
|
||||
ML_SAVE_FALLBACK(coll_ml, iallgather);
|
||||
ML_SAVE_FALLBACK(coll_ml, ireduce);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user