1
1

coll/ml: fix issues identified by the clang static analyser and fix

a segmentation fault in the reduce cleanup

Some of the changes address false warnings produced by scan-build. I
added asserts and changed some malloc calls to calloc to silence these
warnings.

There was one issue in cleanup for reduce, since the component_functions
member is changed by the allreduce call. There may be other issues
with how this code works, but releasing the allocated
component_functions after setting up the static functions addresses
the primary issue (SIGSEGV).

cmr=v1.8.1:reviewer=manjugv

This commit was SVN r31417.
Этот коммит содержится в:
Nathan Hjelm 2014-04-16 22:43:35 +00:00
родитель f80aece271
Коммит 484a3f6147
15 изменённых файлов: 157 добавлений и 96 удалений

Просмотреть файл

@ -456,14 +456,14 @@ int mca_coll_ml_allgather_start (void *sbuf, int scount,
mca_coll_ml_convertor_pack(
(void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
(topo_info->hier_layout_info[0].offset +
topo_info->hier_layout_info[0].level_one_index)),
frag_len, &coll_op->full_message.send_convertor);
} else {
/* change 6 */
memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
(topo_info->hier_layout_info[0].offset +
topo_info->hier_layout_info[0].level_one_index)),
sbuf, frag_len);
}

Просмотреть файл

@ -88,6 +88,10 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
uint64_t addr_offset = 0;
mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,*pbuff_desc = NULL;
if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
return OMPI_ERR_BAD_PARAM;
}
if (NULL == ml_memblock){
ML_ERROR(("Memory block not initialized"));
ret = OMPI_ERROR;
@ -102,6 +106,9 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*) malloc(sizeof(mca_bcol_base_payload_buffer_desc_t)
* num_banks * num_buffers);
if (NULL == pbuff_descs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(bank_loop = 0; bank_loop < num_banks; bank_loop++)
for(buff_loop = 0; buff_loop < num_buffers; buff_loop++){

Просмотреть файл

@ -2,6 +2,8 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -466,6 +468,7 @@ do {
for (i = 0; i < (schedule)->n_fns; ++i) { \
mca_bcol_base_module_t *current_bcol = \
(schedule)->component_functions[i].constant_group_data.bcol_module; \
assert (NULL != current_bcol); \
if (current_bcol->bcol_component->need_ordering) { \
(schedule)->n_fns_need_ordering++; \
} \

Просмотреть файл

@ -182,6 +182,10 @@ static int coll_ml_progress()
}
} else {
rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op);
if (OMPI_SUCCESS != rc) {
mca_coll_ml_abort_ml("Failed to run sequential task setup");
}
seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
continue;
}

Просмотреть файл

@ -2,7 +2,7 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -43,6 +43,7 @@ typedef struct coll_config_t {
static int algorithm_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN"))
return ML_BCAST_SMALL_DATA_KNOWN;
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN"))
@ -110,6 +111,7 @@ static int algorithm_name_to_id(char *name)
static int hierarchy_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "FULL_HR")) {
return COLL_ML_HR_FULL;
}
@ -128,6 +130,7 @@ static int hierarchy_name_to_id(char *name)
static int section_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "SMALL")) {
return ML_SMALL_MSG;
}
@ -141,6 +144,7 @@ static int section_name_to_id(char *name)
static int coll_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "ALLGATHER")) {
return ML_ALLGATHER;
}
@ -339,6 +343,8 @@ static int parse_algorithm_key(section_config_t *section, char *value)
static int parse_threshold_key(section_config_t *section, char *value)
{
assert (NULL != value);
if(!strcasecmp(value, "unlimited")) {
section->config.threshold = -1;
} else {
@ -364,6 +370,8 @@ static int parse_hierarchy_key(section_config_t *section, char *value)
static int parse_fragmentation_key(section_config_t *section, char *value)
{
assert (NULL != value);
if(!strcasecmp(value, "enable")) {
section->config.fragmentation_enabled = 1;
} else if (!strcasecmp(value, "disable")) {

Просмотреть файл

@ -1,6 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -68,6 +71,8 @@ int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_m
if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
ML_ERROR(("Memory allocation failed."));
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM, true);
/* not reached but causes a clang warning to not return here */
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < comm_size; ++i) {
@ -114,12 +119,9 @@ int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_m
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
{
mca_base_component_list_item_t *bcol_comp;
bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
ML_VERBOSE(10, ("Loop over bcol components"));
for ( bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
bcol_comp != (mca_base_component_list_item_t *) opal_list_get_end(&mca_bcol_base_components_in_use);
bcol_comp = (mca_base_component_list_item_t *) opal_list_get_next(bcol_comp)) {
OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
if(0 == strcmp(component_name,
((mca_bcol_base_component_2_0_0_t *)
bcol_comp->cli_component)->bcol_version.mca_component_name)) {

Просмотреть файл

@ -1,6 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,13 +32,12 @@ static int mca_coll_ml_build_memsync_schedule(
mca_coll_ml_collective_operation_description_t *schedule;
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (OPAL_UNLIKELY(NULL == schedule)) {
ML_ERROR(("Can't allocate memory."));
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto Barrier_Setup_Error;
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (topo_info->global_highest_hier_group_index ==
@ -168,6 +170,9 @@ Barrier_Setup_Error:
schedule->component_functions = NULL;
}
free (schedule);
*coll_desc = NULL;
return rc;
}

Просмотреть файл

@ -106,9 +106,7 @@ static int mca_coll_ml_build_allgather_schedule(mca_coll_ml_topology_t *topo_inf
if (NULL != scratch_num) {
free(scratch_num);
}
if (NULL != schedule->component_functions) {
free(schedule->component_functions);
}
return ret;
}

Просмотреть файл

@ -65,15 +65,14 @@ static int mca_coll_ml_build_allreduce_schedule(
}
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (NULL == schedule) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto Allreduce_Setup_Error;
return OMPI_ERR_OUT_OF_RESOURCE;
}
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers * 2));
scratch_indx = (int *) calloc(n_hiers * 2, sizeof (int));
if (NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -93,7 +92,6 @@ static int mca_coll_ml_build_allreduce_schedule(
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
} else {
scratch_indx[cnt] = 0;
prev_bcol = GET_BCOL(topo_info, i);
}
}
@ -103,7 +101,6 @@ static int mca_coll_ml_build_allreduce_schedule(
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) {
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
} else {
scratch_indx[cnt] = 0;
prev_bcol = GET_BCOL(topo_info, n_hiers - 1);
}
@ -115,7 +112,6 @@ static int mca_coll_ml_build_allreduce_schedule(
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
} else {
scratch_indx[cnt] = 0;
prev_bcol = GET_BCOL(topo_info, i);
}
}
@ -282,6 +278,8 @@ Allreduce_Setup_Error:
if (NULL != schedule->component_functions) {
free(schedule->component_functions);
}
*coll_desc = NULL;
free (schedule);
return ret;
}

Просмотреть файл

@ -89,6 +89,7 @@ static int mca_coll_ml_build_barrier_schedule(
if (NULL == comp_fn->bcol_function) {
ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN"));
rc = OMPI_ERR_NOT_AVAILABLE;
goto Barrier_Setup_Error;
}
@ -105,6 +106,7 @@ static int mca_coll_ml_build_barrier_schedule(
if (NULL == comp_fn->bcol_function) {
ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN"));
rc = OMPI_ERR_NOT_AVAILABLE;
goto Barrier_Setup_Error;
}
@ -125,6 +127,7 @@ static int mca_coll_ml_build_barrier_schedule(
if (NULL == comp_fn->bcol_function) {
ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN"));
rc = OMPI_ERR_NOT_AVAILABLE;
goto Barrier_Setup_Error;
}

Просмотреть файл

@ -145,15 +145,14 @@ static int mca_coll_ml_build_bcast_dynamic_schedule_no_attributes(
*bcol_module;
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (NULL == schedule) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto Bcast_Setup_Error;
return OMPI_ERR_OUT_OF_RESOURCE;
}
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
if (NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -315,15 +314,14 @@ static int mca_coll_ml_build_bcast_sequential_schedule_no_attributes(
*bcol_module;
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (NULL == schedule) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto Bcast_Setup_Error;
return OMPI_ERR_OUT_OF_RESOURCE;
}
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
if (NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -520,6 +518,8 @@ Bcast_Setup_Error:
if (NULL != schedule->comp_fn_arr) {
free(schedule->comp_fn_arr);
}
free (schedule);
*coll_desc = NULL;
return ret;
}
@ -569,15 +569,14 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
*bcol_module;
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (NULL == schedule) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto Bcast_Setup_Error;
return OMPI_ERR_OUT_OF_RESOURCE;
}
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
if (NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -598,7 +597,6 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) {
scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1;
} else {
scratch_indx[i_hier] = 0;
prev_bcol = GET_BCOL(topo_info, i_hier);
}
}
@ -725,6 +723,8 @@ Bcast_Setup_Error:
if (NULL != schedule->component_functions) {
free(schedule->component_functions);
}
free (schedule);
*coll_desc = NULL;
return ret;
}
@ -813,6 +813,8 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
int topo_index = 0;
mca_coll_ml_topology_t *topo_info = ml_module->topo_list;
assert (NULL != ml_module);
for (i = 0; i < ML_NUM_MSG; i++) {
switch (i) {
@ -832,30 +834,18 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
return;
}
if (NULL == ml_module->coll_ml_bcast_functions[alg]) {
continue;
}
switch (alg) {
case ML_BCAST_SMALL_DATA_KNOWN:
case ML_BCAST_LARGE_DATA_KNOWN:
case ML_BCAST_SMALL_DATA_UNKNOWN:
case ML_BCAST_LARGE_DATA_UNKNOWN:
case ML_BCAST_SMALL_DATA_SEQUENTIAL:
case ML_BCAST_LARGE_DATA_SEQUENTIAL:
if (NULL != ml_module->coll_ml_bcast_functions[alg]) {
if (ML_BCAST_SMALL_DATA_KNOWN <= alg && ML_BCAST_LARGE_DATA_SEQUENTIAL >= alg) {
if (ml_module->coll_ml_bcast_functions[alg]->component_functions) {
free(ml_module->coll_ml_bcast_functions[alg]->component_functions);
ml_module->coll_ml_bcast_functions[alg]->component_functions = NULL;
}
if (ml_module->coll_ml_bcast_functions[alg]) {
free(ml_module->coll_ml_bcast_functions[alg]);
ml_module->coll_ml_bcast_functions[alg] = NULL;
}
break;
default:
free(ml_module->coll_ml_bcast_functions[alg]);
ml_module->coll_ml_bcast_functions[alg] = NULL;
} else {
topo_info->hierarchical_algorithms[ML_BCAST] = NULL;
return;
}
}
}
}

Просмотреть файл

@ -74,7 +74,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
mca_coll_ml_collective_operation_description_t *schedule = NULL;
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
schedule = *coll_desc;
if (OPAL_UNLIKELY(NULL == schedule)) {
@ -83,7 +83,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
goto Error;
}
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
scratch_indx = (int *) calloc (n_hiers, sizeof (int));
if (NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -267,15 +267,32 @@ static int mca_coll_ml_build_static_reduce_schedule(
MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);
/* reduce does not use the component functions so we no longer need this. see
* coll_ml_reduce.c:442 */
free (schedule->component_functions);
schedule->component_functions = NULL;
free(scratch_num);
free(scratch_indx);
return OMPI_SUCCESS;
Error:
if (NULL != schedule->component_functions) {
free(schedule->component_functions);
schedule->component_functions = NULL;
if (NULL != scratch_num) {
free (scratch_num);
}
if (NULL != scratch_indx) {
free (scratch_indx);
}
if (NULL != schedule) {
if (NULL != schedule->component_functions) {
free(schedule->component_functions);
schedule->component_functions = NULL;
}
free (schedule);
*coll_desc = NULL;
}
return ret;
@ -335,25 +352,20 @@ void ml_coll_hier_reduce_cleanup(mca_coll_ml_module_t *ml_module)
return;
}
for (i=0; i<ml_module->topo_list[topo_index].n_levels; i++) {
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
}
}
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr) {
for (i=0; i<ml_module->topo_list[topo_index].n_levels; i++) {
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
}
}
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr);
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr = NULL;
}
if (ml_module->coll_ml_reduce_functions[alg]->component_functions) {
free(ml_module->coll_ml_reduce_functions[alg]->component_functions);
ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;
}
ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;
if (ml_module->coll_ml_reduce_functions[alg]) {
free(ml_module->coll_ml_reduce_functions[alg]);
ml_module->coll_ml_reduce_functions[alg] = NULL;
}
free(ml_module->coll_ml_reduce_functions[alg]);
ml_module->coll_ml_reduce_functions[alg] = NULL;
}

Просмотреть файл

@ -1,6 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -76,7 +79,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
/* allocate space for the functions */
collective_alg->functions = (mca_bcol_base_function_t *)
malloc(sizeof(mca_bcol_base_function_t) * collective_alg->n_functions);
calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t));
if( NULL == collective_alg->functions) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -98,7 +101,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
/* Figure out how many of the same bcols are called in a row.
* The index of the bcol in row we store in scratch_indx and
* the total number of bcols in the row we store in scratch_num */
scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
if(NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -373,7 +376,7 @@ int ml_coll_barrier_constant_group_data_setup(
/* Figure out how many of the same bcols are called in a row.
* The index of the bcol in row we store in scratch_indx and
* the total number of bcols in the row we store in scratch_num */
scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
if(NULL == scratch_indx) {
ML_ERROR(("Can't allocate memory."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
@ -498,6 +501,10 @@ int ml_coll_barrier_constant_group_data_setup(
mca_bcol_base_module_t *current_bcol =
component_functions[i].constant_group_data.bcol_module;
/* silence clang warning about possible NULL dereference of component_functions.
* this case is a developer error if it occurs */
assert (NULL != component_functions && NULL != constant_group_data);
cnt = 0;
for (j = 0; j < n_functions; ++j) {
if (current_bcol ==

Просмотреть файл

@ -251,6 +251,9 @@ int mca_coll_ml_register_params(void)
"Algorithm to use for broadcast", MCA_BASE_VAR_TYPE_INT,
new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_coll_ml_component.bcast_algorithm);
if (0 > tmp) {
ret = tmp;
}
CHECK(reg_bool("disable_allgather", NULL, "Disable Allgather", false,
&mca_coll_ml_component.disable_allgather));

Просмотреть файл

@ -143,8 +143,6 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
ML_VERBOSE(4, ("ML module destruct"));
ml_coll_hier_reduce_cleanup(module);
for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) {
topo = &module->topo_list[index_topo];
if (COLL_ML_TOPO_DISABLED == topo->status) {
@ -230,6 +228,7 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
ml_coll_hier_allreduce_cleanup_new(module);
ml_coll_hier_allgather_cleanup(module);
ml_coll_hier_bcast_cleanup(module);
ml_coll_hier_reduce_cleanup(module);
/* release saved collectives */
ML_RELEASE_FALLBACK(module, allreduce);
@ -629,7 +628,7 @@ static int check_global_view_of_subgroups( int n_procs_selected,
return ret;
}
static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
static int ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
{
int *list_n_connected;
int group_size, rank, i, j, knt, offset, k, my_sbgp = 0;
@ -643,6 +642,10 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
mca_coll_ml_leader_offset_info_t *loc_leader = (mca_coll_ml_leader_offset_info_t *)
malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1));
if (NULL == loc_leader) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* first thing I want to know is where does the first level end */
level_one_knt = 0;
@ -692,7 +695,11 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
/* malloc some memory for the new list to cache
on the bcol module
*/
list_n_connected = (int *) malloc(sizeof(int)*group_size);
list_n_connected = (int *) calloc(group_size, sizeof (int));
if (NULL == list_n_connected) {
free (loc_leader);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* next thing to do is to find out which subgroup I'm in
* at this particular level
@ -803,6 +810,8 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
/* what other goodies do I want to cache on the ml-module? */
topo->hier_layout_info = loc_leader;
return OMPI_SUCCESS;
}
static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
@ -857,7 +866,6 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
}
if( my_rank == root ) {
sum=0;
for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
scratch_space[4*i]=(*array_of_all_subgroup_ranks)[i].root_rank_in_comm;
scratch_space[4*i+1]=(*array_of_all_subgroup_ranks)[i].n_ranks;
@ -899,7 +907,7 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
sum+=(*array_of_all_subgroup_ranks)[i].n_ranks;
}
if( in_num_total_subgroups != (*num_total_subgroups) ) {
if( in_num_total_subgroups != (*num_total_subgroups) && sum > 0 ) {
(*list_of_ranks_in_all_subgroups)=(int *)
realloc((*list_of_ranks_in_all_subgroups),sizeof(int)*sum);
if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) {
@ -1040,11 +1048,16 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
PROVIDE_SUFFICIENT_MEMORY((*sub_group_meta_data), dummy1,
(*size_of_sub_group_meta_data),
sub_group_params_t, (*num_total_subgroups), 1, 5);
if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
/* do this for the temporary memory slots */
PROVIDE_SUFFICIENT_MEMORY(temp, dummy2,
knt1, int32_t *, knt2, 1, 5);
if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
if (OPAL_UNLIKELY(NULL == temp)) {
ML_VERBOSE(10, ("Cannot allocate memory for temporary storage"));
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
@ -1053,7 +1066,7 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
/* no need for this here - use a temporary ptr */
temp[knt2]=
(int *)malloc(sizeof(int)*size_of_all_selected);
(int *)calloc(size_of_all_selected, sizeof(int));
if (OPAL_UNLIKELY(NULL == temp[knt2] ) ){
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
rc = OMPI_ERR_OUT_OF_RESOURCE;
@ -1102,12 +1115,12 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
}
/* clean up temporary storage */
if(NULL != temp) {
exit_ERROR:
if (NULL != temp) {
free(temp);
}
/* return */
exit_ERROR:
return rc;
}
@ -1716,6 +1729,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
/* number of processes selected with this sbgp on all ranks */
int global_n_procs_selected;
/* silence clang warnings */
assert (NULL != bcol_cli && NULL != sbgp_cli);
/*
** obtain the list of ranks in the current level
*/
@ -1985,6 +2001,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
* set largest power of 2 for this group
*/
module->n_levels_pow2 = ml_fls(module->group_size);
/* silence a clang warning */
assert (module->n_levels_pow2 > 0 && module->n_levels_pow2 < 32);
module->pow_2 = 1 << module->n_levels_pow2;
n_hier++;
@ -2113,7 +2131,11 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
topo->number_of_all_subgroups = num_total_subgroups;
topo->array_of_all_subgroups = array_of_all_subgroup_ranks;
ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
ret = ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto exit_ERROR;
}
/* Set the route table if know-root type of algorithms is used */
if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) {
ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm);
@ -2203,15 +2225,12 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
sbgp_cli = (sbgp_base_component_keyval_t *)
opal_list_get_first(&mca_sbgp_base_components_in_use);
for (bcol_cli = (mca_base_component_list_item_t *)
opal_list_get_first(&mca_bcol_base_components_in_use);
(opal_list_item_t *) bcol_cli !=
opal_list_get_end(&mca_bcol_base_components_in_use);
bcol_cli = (mca_base_component_list_item_t *)
opal_list_get_next((opal_list_item_t *) bcol_cli),
sbgp_cli = (sbgp_base_component_keyval_t *)
opal_list_get_next((opal_list_item_t *) sbgp_cli)) {
OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
/* silence false-positive clang warning */
assert (NULL != sbgp_cli);
if (NULL != bcol_component->coll_support_all_types &&
!bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) {
mca_base_component_list_item_t *bcol_cli_next;
@ -2256,6 +2275,8 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
&ml_module->topo_list[COLL_ML_HR_ALLREDUCE],
n_hierarchies, sbgp_component->sbgp_version.mca_component_name, NULL);
}
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
}
return OMPI_SUCCESS;