diff --git a/ompi/mca/coll/ml/coll_ml_allgather.c b/ompi/mca/coll/ml/coll_ml_allgather.c
index c67b84db87..9501112627 100644
--- a/ompi/mca/coll/ml/coll_ml_allgather.c
+++ b/ompi/mca/coll/ml/coll_ml_allgather.c
@@ -456,14 +456,14 @@ int mca_coll_ml_allgather_start (void *sbuf, int scount,
             mca_coll_ml_convertor_pack(
                 (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
-                          (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
-                           coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
+                          (topo_info->hier_layout_info[0].offset +
+                           topo_info->hier_layout_info[0].level_one_index)),
                 frag_len, &coll_op->full_message.send_convertor);
         } else {
             /* change 6 */
             memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
-                            (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
-                             coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
+                            (topo_info->hier_layout_info[0].offset +
+                             topo_info->hier_layout_info[0].level_one_index)),
                    sbuf, frag_len);
         }
diff --git a/ompi/mca/coll/ml/coll_ml_allocation.c b/ompi/mca/coll/ml/coll_ml_allocation.c
index 2a91d948d7..555c5e9aff 100644
--- a/ompi/mca/coll/ml/coll_ml_allocation.c
+++ b/ompi/mca/coll/ml/coll_ml_allocation.c
@@ -88,6 +88,10 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
     uint64_t addr_offset = 0;
     mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL, *pbuff_desc = NULL;

+    if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
     if (NULL == ml_memblock){
         ML_ERROR(("Memory block not initialized"));
         ret = OMPI_ERROR;
@@ -102,6 +106,9 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
     pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*)
         malloc(sizeof(mca_bcol_base_payload_buffer_desc_t) * num_banks * num_buffers);
+    if (NULL == pbuff_descs) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }

     for(bank_loop = 0; bank_loop < num_banks; bank_loop++)
         for(buff_loop = 0; buff_loop < num_buffers; buff_loop++){
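The coll_ml_allocation.c hunks above harden mca_coll_ml_initialize_block in two ways: degenerate bank/buffer counts are rejected up front, and the pbuff_descs allocation is checked before the descriptor loops dereference it. A minimal standalone sketch of the same guard-then-check pattern (the ex_* error codes and init_block name are invented for illustration, not OMPI API):

    #include <stdio.h>
    #include <stdlib.h>

    #define EX_SUCCESS          0
    #define EX_ERR_BAD_PARAM   -1
    #define EX_ERR_NO_RESOURCE -2

    /* Allocate num_banks * num_buffers descriptors, rejecting degenerate
     * sizes up front and checking the allocation before first use. */
    static int init_block(size_t num_banks, size_t num_buffers,
                          size_t buffer_size, void **descs_out)
    {
        if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
            return EX_ERR_BAD_PARAM;
        }

        void *descs = malloc(num_banks * num_buffers * buffer_size);
        if (NULL == descs) {
            return EX_ERR_NO_RESOURCE; /* caller sees an error, not a crash */
        }

        *descs_out = descs;
        return EX_SUCCESS;
    }

    int main(void)
    {
        void *descs = NULL;
        /* zero banks must fail cleanly instead of looping over malloc(0) */
        printf("%d\n", init_block(0, 8, 256, &descs)); /* -1 */
        printf("%d\n", init_block(2, 8, 256, &descs)); /*  0 */
        free(descs);
        return 0;
    }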
diff --git a/ompi/mca/coll/ml/coll_ml_colls.h b/ompi/mca/coll/ml/coll_ml_colls.h
index c65df08fac..f5f8b5d822 100644
--- a/ompi/mca/coll/ml/coll_ml_colls.h
+++ b/ompi/mca/coll/ml/coll_ml_colls.h
@@ -2,6 +2,8 @@
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -466,6 +468,7 @@ do {
     for (i = 0; i < (schedule)->n_fns; ++i) {                           \
         mca_bcol_base_module_t *current_bcol =                          \
             (schedule)->component_functions[i].constant_group_data.bcol_module; \
+        assert (NULL != current_bcol);                                  \
         if (current_bcol->bcol_component->need_ordering) {              \
             (schedule)->n_fns_need_ordering++;                          \
         }                                                               \
diff --git a/ompi/mca/coll/ml/coll_ml_component.c b/ompi/mca/coll/ml/coll_ml_component.c
index 661250b712..eb1ed6365c 100644
--- a/ompi/mca/coll/ml/coll_ml_component.c
+++ b/ompi/mca/coll/ml/coll_ml_component.c
@@ -182,6 +182,10 @@ static int coll_ml_progress()
                 }
             } else {
                 rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op);
+                if (OMPI_SUCCESS != rc) {
+                    mca_coll_ml_abort_ml("Failed to run sequential task setup");
+                }
+
                 seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
                 continue;
             }
diff --git a/ompi/mca/coll/ml/coll_ml_config.c b/ompi/mca/coll/ml/coll_ml_config.c
index bd3c35593f..aedc4aaf81 100644
--- a/ompi/mca/coll/ml/coll_ml_config.c
+++ b/ompi/mca/coll/ml/coll_ml_config.c
@@ -2,7 +2,7 @@
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -43,6 +43,7 @@ typedef struct coll_config_t {

 static int algorithm_name_to_id(char *name)
 {
+    assert (NULL != name);
     if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN"))
         return ML_BCAST_SMALL_DATA_KNOWN;
     if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN"))
@@ -110,6 +111,7 @@

 static int hierarchy_name_to_id(char *name)
 {
+    assert (NULL != name);
     if (!strcasecmp(name, "FULL_HR")) {
         return COLL_ML_HR_FULL;
     }
@@ -128,6 +130,7 @@

 static int section_name_to_id(char *name)
 {
+    assert (NULL != name);
     if (!strcasecmp(name, "SMALL")) {
         return ML_SMALL_MSG;
     }
@@ -141,6 +144,7 @@

 static int coll_name_to_id(char *name)
 {
+    assert (NULL != name);
     if (!strcasecmp(name, "ALLGATHER")) {
         return ML_ALLGATHER;
     }
@@ -339,6 +343,8 @@ static int parse_algorithm_key(section_config_t *section, char *value)

 static int parse_threshold_key(section_config_t *section, char *value)
 {
+    assert (NULL != value);
+
     if(!strcasecmp(value, "unlimited")) {
         section->config.threshold = -1;
     } else {
@@ -364,6 +370,8 @@ static int parse_hierarchy_key(section_config_t *section, char *value)

 static int parse_fragmentation_key(section_config_t *section, char *value)
 {
+    assert (NULL != value);
+
     if(!strcasecmp(value, "enable")) {
         section->config.fragmentation_enabled = 1;
     } else if (!strcasecmp(value, "disable")) {
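The coll_ml_config.c changes put an assert at the top of every name-to-id lookup so a NULL key from the parser fails loudly in debug builds instead of crashing inside strcasecmp. A small self-contained sketch of that convention (hypothetical section_name_to_id stand-in, not the ml parser itself):

    #include <assert.h>
    #include <stdio.h>
    #include <strings.h>   /* strcasecmp */

    enum { EX_SMALL_MSG, EX_LARGE_MSG, EX_UNKNOWN = -1 };

    /* Map a configuration keyword to an id; the assert documents (and, in
     * debug builds, enforces) the precondition that the caller never
     * passes a NULL pointer. */
    static int section_name_to_id(const char *name)
    {
        assert(NULL != name);
        if (!strcasecmp(name, "SMALL")) {
            return EX_SMALL_MSG;
        }
        if (!strcasecmp(name, "LARGE")) {
            return EX_LARGE_MSG;
        }
        return EX_UNKNOWN;
    }

    int main(void)
    {
        printf("%d %d %d\n", section_name_to_id("small"),
               section_name_to_id("LARGE"), section_name_to_id("huge"));
        return 0;
    }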
diff --git a/ompi/mca/coll/ml/coll_ml_custom_utils.c b/ompi/mca/coll/ml/coll_ml_custom_utils.c
index 701367a6de..b9bd6d442f 100644
--- a/ompi/mca/coll/ml/coll_ml_custom_utils.c
+++ b/ompi/mca/coll/ml/coll_ml_custom_utils.c
@@ -1,6 +1,9 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -68,6 +71,8 @@ int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_m
     if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
         ML_ERROR(("Memory allocation failed."));
         ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM, true);
+        /* not reached but causes a clang warning to not return here */
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

     for (i = 0; i < comm_size; ++i) {
@@ -114,12 +119,9 @@
 int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
 {
     mca_base_component_list_item_t *bcol_comp;
-    bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);

     ML_VERBOSE(10, ("Loop over bcol components"));

-    for ( bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
-          bcol_comp != (mca_base_component_list_item_t *) opal_list_get_end(&mca_bcol_base_components_in_use);
-          bcol_comp = (mca_base_component_list_item_t *) opal_list_get_next(bcol_comp)) {
+    OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
         if(0 == strcmp(component_name,
                        ((mca_bcol_base_component_2_0_0_t *) bcol_comp->cli_component)->bcol_version.mca_component_name)) {
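Replacing the hand-rolled get_first/get_end/get_next loop with OPAL_LIST_FOREACH removes several lines of casting boilerplate per loop. A toy illustration of the idea on a plain singly-linked list (EX_LIST_FOREACH and ex_item are invented for this sketch; OPAL's real macro additionally takes the item type and operates on an opal_list_t):

    #include <stdio.h>

    /* A minimal intrusive list and foreach macro in the spirit of
     * OPAL_LIST_FOREACH; illustrative only, not the OPAL code. */
    struct ex_item {
        const char *name;
        struct ex_item *next;
    };

    #define EX_LIST_FOREACH(item, head) \
        for ((item) = (head); NULL != (item); (item) = (item)->next)

    int main(void)
    {
        struct ex_item c = { "basesmuma", NULL };
        struct ex_item b = { "ptpcoll", &c };
        struct ex_item a = { "iboffload", &b };
        struct ex_item *item;

        /* one line replaces the get_first/get_end/get_next boilerplate */
        EX_LIST_FOREACH(item, &a) {
            printf("%s\n", item->name);
        }
        return 0;
    }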
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c
index 089c3a2e56..f50d040f61 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c
@@ -1,6 +1,9 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -29,13 +32,12 @@ static int mca_coll_ml_build_memsync_schedule(
     mca_coll_ml_collective_operation_description_t *schedule;

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));

     schedule = *coll_desc;
     if (OPAL_UNLIKELY(NULL == schedule)) {
         ML_ERROR(("Can't allocate memory."));
-        rc = OMPI_ERR_OUT_OF_RESOURCE;
-        goto Barrier_Setup_Error;
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

     if (topo_info->global_highest_hier_group_index ==
@@ -168,6 +170,9 @@ Barrier_Setup_Error:
         schedule->component_functions = NULL;
     }

+    free (schedule);
+    *coll_desc = NULL;
+
     return rc;
 }
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c
index 3b4a900edf..cd964d41dd 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c
@@ -106,9 +106,7 @@ static int mca_coll_ml_build_allgather_schedule(mca_coll_ml_topology_t *topo_inf
     if (NULL != scratch_num) {
         free(scratch_num);
     }
-    if (NULL != schedule->component_functions) {
-        free(schedule->component_functions);
-    }
+
     return ret;
 }
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c
index e1078fbba6..a371d51b7a 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c
@@ -65,15 +65,14 @@ static int mca_coll_ml_build_allreduce_schedule(
     }

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
     schedule = *coll_desc;
     if (NULL == schedule) {
         ML_ERROR(("Can't allocate memory."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-        goto Allreduce_Setup_Error;
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers * 2));
+    scratch_indx = (int *) calloc(n_hiers * 2, sizeof (int));
     if (NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -93,7 +92,6 @@ static int mca_coll_ml_build_allreduce_schedule(
         if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
             scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
         } else {
-            scratch_indx[cnt] = 0;
             prev_bcol = GET_BCOL(topo_info, i);
         }
     }
@@ -103,7 +101,6 @@ static int mca_coll_ml_build_allreduce_schedule(
     if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) {
         scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
     } else {
-        scratch_indx[cnt] = 0;
         prev_bcol = GET_BCOL(topo_info, n_hiers - 1);
     }
@@ -115,7 +112,6 @@ static int mca_coll_ml_build_allreduce_schedule(
         if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
             scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
         } else {
-            scratch_indx[cnt] = 0;
             prev_bcol = GET_BCOL(topo_info, i);
         }
     }
@@ -282,6 +278,8 @@ Allreduce_Setup_Error:
     if (NULL != schedule->component_functions) {
         free(schedule->component_functions);
     }
+    *coll_desc = NULL;
+    free (schedule);

     return ret;
 }
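The allocation changes in the memsync and allreduce schedules follow one pattern: the schedule is created with calloc(1, ...) so every pointer member starts NULL, failures before anything else is allocated return directly, and the error label frees whatever was built and resets *coll_desc to NULL so the caller never sees a half-constructed schedule. A compact sketch of that ownership discipline (the ex_* names are invented, not OMPI API):

    #include <stdlib.h>

    #define EX_SUCCESS          0
    #define EX_ERR_NO_RESOURCE -2

    struct ex_schedule {
        int  n_fns;
        int *component_functions;
    };

    /* Build a schedule; on failure release everything allocated so far
     * and hand the caller a NULL descriptor, never a half-built one. */
    static int build_schedule(int n_fns, struct ex_schedule **sched_out)
    {
        /* calloc(1, ...) zeroes every field, so the error path below can
         * free members unconditionally without reading garbage pointers */
        struct ex_schedule *sched = calloc(1, sizeof(*sched));
        if (NULL == sched) {
            return EX_ERR_NO_RESOURCE;
        }

        sched->component_functions = calloc(n_fns, sizeof(int));
        if (NULL == sched->component_functions) {
            goto error;
        }

        sched->n_fns = n_fns;
        *sched_out = sched;
        return EX_SUCCESS;

    error:
        free(sched->component_functions);  /* free(NULL) is a no-op */
        free(sched);
        *sched_out = NULL;  /* caller must not see a dangling pointer */
        return EX_ERR_NO_RESOURCE;
    }

    int main(void)
    {
        struct ex_schedule *sched = NULL;
        if (EX_SUCCESS == build_schedule(4, &sched)) {
            free(sched->component_functions);
            free(sched);
        }
        return 0;
    }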
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c
index 1047984656..54aeac6f86 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c
@@ -89,6 +89,7 @@ static int mca_coll_ml_build_barrier_schedule(
         if (NULL == comp_fn->bcol_function) {
             ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN"));
+            rc = OMPI_ERR_NOT_AVAILABLE;
             goto Barrier_Setup_Error;
         }
@@ -105,6 +106,7 @@ static int mca_coll_ml_build_barrier_schedule(
         if (NULL == comp_fn->bcol_function) {
             ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN"));
+            rc = OMPI_ERR_NOT_AVAILABLE;
             goto Barrier_Setup_Error;
         }
@@ -125,6 +127,7 @@ static int mca_coll_ml_build_barrier_schedule(
         if (NULL == comp_fn->bcol_function) {
             ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN"));
+            rc = OMPI_ERR_NOT_AVAILABLE;
             goto Barrier_Setup_Error;
         }
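All three barrier hunks fix the same latent bug: the code jumped to Barrier_Setup_Error without assigning rc, so the cleanup path could return a stale value. Assigning the error code at every failing branch before the goto is the standard cure; a trivial self-contained illustration (ex_* names invented):

    #include <stdio.h>

    #define EX_SUCCESS            0
    #define EX_ERR_NOT_AVAILABLE -3

    typedef void (*ex_fn_t)(void);

    /* Look up a function for each level; every failing branch assigns rc
     * before jumping, so the error label can never return a stale
     * "success" value. */
    static int build(ex_fn_t fanin, ex_fn_t barrier, ex_fn_t fanout)
    {
        int rc = EX_SUCCESS;

        if (NULL == fanin) {
            rc = EX_ERR_NOT_AVAILABLE; /* previously left as EX_SUCCESS */
            goto error;
        }
        if (NULL == barrier) {
            rc = EX_ERR_NOT_AVAILABLE;
            goto error;
        }
        if (NULL == fanout) {
            rc = EX_ERR_NOT_AVAILABLE;
            goto error;
        }
        return EX_SUCCESS;

    error:
        /* ... cleanup would go here ... */
        return rc;
    }

    static void noop(void) { }

    int main(void)
    {
        printf("%d\n", build(noop, NULL, noop)); /* -3, not 0 */
        return 0;
    }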
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c
index 3f6c172d0a..627baf6344 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c
@@ -145,15 +145,14 @@ static int mca_coll_ml_build_bcast_dynamic_schedule_no_attributes(
         *bcol_module;

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
     schedule = *coll_desc;
     if (NULL == schedule) {
         ML_ERROR(("Can't allocate memory."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-        goto Bcast_Setup_Error;
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
+    scratch_indx = (int *) calloc(n_hiers, sizeof (int));
     if (NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -315,15 +314,14 @@ static int mca_coll_ml_build_bcast_sequential_schedule_no_attributes(
         *bcol_module;

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
     schedule = *coll_desc;
     if (NULL == schedule) {
         ML_ERROR(("Can't allocate memory."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-        goto Bcast_Setup_Error;
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
+    scratch_indx = (int *) calloc(n_hiers, sizeof (int));
     if (NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -520,6 +518,8 @@ Bcast_Setup_Error:
     if (NULL != schedule->comp_fn_arr) {
         free(schedule->comp_fn_arr);
     }
+    free (schedule);
+    *coll_desc = NULL;

     return ret;
 }
@@ -569,15 +569,14 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
         *bcol_module;

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
     schedule = *coll_desc;
     if (NULL == schedule) {
         ML_ERROR(("Can't allocate memory."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-        goto Bcast_Setup_Error;
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
+    scratch_indx = (int *) calloc(n_hiers, sizeof (int));
     if (NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -598,7 +597,6 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
         if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) {
             scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1;
         } else {
-            scratch_indx[i_hier] = 0;
             prev_bcol = GET_BCOL(topo_info, i_hier);
         }
     }
@@ -725,6 +723,8 @@ Bcast_Setup_Error:
     if (NULL != schedule->component_functions) {
         free(schedule->component_functions);
     }
+    free (schedule);
+    *coll_desc = NULL;

     return ret;
 }
@@ -813,6 +813,8 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
     int topo_index = 0;
     mca_coll_ml_topology_t *topo_info = ml_module->topo_list;

+    assert (NULL != ml_module);
+
     for (i = 0; i < ML_NUM_MSG; i++) {

         switch (i) {
@@ -832,30 +834,18 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
             return;
         }

-        if (NULL == ml_module->coll_ml_bcast_functions[alg]) {
-            continue;
-        }
-
-        switch (alg) {
-        case ML_BCAST_SMALL_DATA_KNOWN:
-        case ML_BCAST_LARGE_DATA_KNOWN:
-        case ML_BCAST_SMALL_DATA_UNKNOWN:
-        case ML_BCAST_LARGE_DATA_UNKNOWN:
-        case ML_BCAST_SMALL_DATA_SEQUENTIAL:
-        case ML_BCAST_LARGE_DATA_SEQUENTIAL:
+        if (NULL != ml_module->coll_ml_bcast_functions[alg]) {
+            if (ML_BCAST_SMALL_DATA_KNOWN <= alg && ML_BCAST_LARGE_DATA_SEQUENTIAL >= alg) {
                 if (ml_module->coll_ml_bcast_functions[alg]->component_functions) {
                     free(ml_module->coll_ml_bcast_functions[alg]->component_functions);
                     ml_module->coll_ml_bcast_functions[alg]->component_functions = NULL;
                 }

-            if (ml_module->coll_ml_bcast_functions[alg]) {
-                free(ml_module->coll_ml_bcast_functions[alg]);
-                ml_module->coll_ml_bcast_functions[alg] = NULL;
-            }
-            break;
-        default:
+                free(ml_module->coll_ml_bcast_functions[alg]);
+                ml_module->coll_ml_bcast_functions[alg] = NULL;
+            } else {
                 topo_info->hierarchical_algorithms[ML_BCAST] = NULL;
-            return;
+            }
         }
     }
 }
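In ml_coll_hier_bcast_cleanup the six-case switch collapses into a single range test, which works because the standard bcast algorithm ids form one contiguous run in the enum. A sketch of the trade-off (the ex_bcast_alg enum is invented to mirror the shape of the real one); note that a range test silently breaks if a non-standard id is ever inserted inside the run, which is the usual argument for keeping explicit cases:

    #include <stdbool.h>
    #include <stdio.h>

    /* The algorithm ids form one contiguous run, so two comparisons can
     * stand in for a six-case switch, assuming the values stay
     * consecutive. */
    enum ex_bcast_alg {
        EX_BCAST_SMALL_DATA_KNOWN,
        EX_BCAST_LARGE_DATA_KNOWN,
        EX_BCAST_SMALL_DATA_UNKNOWN,
        EX_BCAST_LARGE_DATA_UNKNOWN,
        EX_BCAST_SMALL_DATA_SEQUENTIAL,
        EX_BCAST_LARGE_DATA_SEQUENTIAL,
        EX_BCAST_NUM_ALGS
    };

    static bool is_standard_alg(enum ex_bcast_alg alg)
    {
        return EX_BCAST_SMALL_DATA_KNOWN <= alg &&
               EX_BCAST_LARGE_DATA_SEQUENTIAL >= alg;
    }

    int main(void)
    {
        printf("%d %d\n", is_standard_alg(EX_BCAST_LARGE_DATA_UNKNOWN),
               is_standard_alg(EX_BCAST_NUM_ALGS));   /* 1 0 */
        return 0;
    }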
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c
index 465de228c1..579f77d12b 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c
@@ -74,7 +74,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
     mca_coll_ml_collective_operation_description_t *schedule = NULL;

     *coll_desc = (mca_coll_ml_collective_operation_description_t *)
-        malloc(sizeof(mca_coll_ml_collective_operation_description_t));
+        calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));

     schedule = *coll_desc;
     if (OPAL_UNLIKELY(NULL == schedule)) {
@@ -83,7 +83,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
         goto Error;
     }

-    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
+    scratch_indx = (int *) calloc (n_hiers, sizeof (int));
     if (NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -267,15 +267,32 @@ static int mca_coll_ml_build_static_reduce_schedule(

     MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);

+    /* reduce does not use the component functions so we no longer need this. see
+     * coll_ml_reduce.c:442 */
+    free (schedule->component_functions);
+    schedule->component_functions = NULL;
+
     free(scratch_num);
     free(scratch_indx);

     return OMPI_SUCCESS;

 Error:
-    if (NULL != schedule->component_functions) {
-        free(schedule->component_functions);
-        schedule->component_functions = NULL;
+    if (NULL != scratch_num) {
+        free (scratch_num);
+    }
+
+    if (NULL != scratch_indx) {
+        free (scratch_indx);
+    }
+
+    if (NULL != schedule) {
+        if (NULL != schedule->component_functions) {
+            free(schedule->component_functions);
+            schedule->component_functions = NULL;
+        }
+        free (schedule);
+        *coll_desc = NULL;
     }

     return ret;
@@ -335,25 +352,20 @@ void ml_coll_hier_reduce_cleanup(mca_coll_ml_module_t *ml_module)
         return;
     }

-    for (i=0; i < ml_module->topo_list[topo_index].n_levels; i++) {
-        if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
-            free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
-            ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
-        }
-    }
-
     if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr) {
+        for (i=0; i < ml_module->topo_list[topo_index].n_levels; i++) {
+            if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
+                free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
+                ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
+            }
+        }
+
         free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr);
         ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr = NULL;
     }

-    if (ml_module->coll_ml_reduce_functions[alg]->component_functions) {
-        free(ml_module->coll_ml_reduce_functions[alg]->component_functions);
-        ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;
-    }
+    ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;

-    if (ml_module->coll_ml_reduce_functions[alg]) {
-        free(ml_module->coll_ml_reduce_functions[alg]);
-        ml_module->coll_ml_reduce_functions[alg] = NULL;
-    }
+    free(ml_module->coll_ml_reduce_functions[alg]);
+    ml_module->coll_ml_reduce_functions[alg] = NULL;
 }
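The reworked ml_coll_hier_reduce_cleanup frees the per-level comp_fn_arr entries only under the comp_fn_arr guard, then releases the array, and finally the table itself; since component_functions is already freed at build time (see the @@ -267 hunk), it is merely NULLed here. A standalone sketch of that inside-out teardown order (ex_reduce_fns is invented for illustration):

    #include <stdlib.h>

    struct ex_reduce_fns {
        int   n_levels;
        int **comp_fn_arr;   /* one array per hierarchy level */
    };

    /* Release the per-level arrays before the table that owns them; the
     * comp_fn_arr guard protects the inner loop, and since free(NULL) is
     * defined as a no-op the per-element checks could even be dropped. */
    static void reduce_cleanup(struct ex_reduce_fns *fns)
    {
        if (NULL == fns) {
            return;
        }

        if (fns->comp_fn_arr) {
            for (int i = 0; i < fns->n_levels; i++) {
                free(fns->comp_fn_arr[i]);
                fns->comp_fn_arr[i] = NULL;
            }
            free(fns->comp_fn_arr);
            fns->comp_fn_arr = NULL;
        }

        free(fns);
    }

    int main(void)
    {
        struct ex_reduce_fns *fns = calloc(1, sizeof(*fns));
        if (NULL != fns) {
            fns->n_levels = 2;
            fns->comp_fn_arr = calloc(fns->n_levels, sizeof(int *));
            if (fns->comp_fn_arr) {
                fns->comp_fn_arr[0] = calloc(4, sizeof(int));
            }
            reduce_cleanup(fns);
        }
        return 0;
    }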
diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c
index 98fe4afca8..8751a25ab9 100644
--- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c
+++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c
@@ -1,6 +1,9 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -76,7 +79,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,

     /* allocate space for the functions */
     collective_alg->functions = (mca_bcol_base_function_t *)
-        malloc(sizeof(mca_bcol_base_function_t) * collective_alg->n_functions);
+        calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t));
     if( NULL == collective_alg->functions) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -98,7 +101,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
     /* Figure out how many of the same bcols are called in a row.
      * The index of the bcol in row we store in scratch_indx and
      * the total number of bcols in the row we store in scratch_num */
-    scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
+    scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
     if(NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -373,7 +376,7 @@ int ml_coll_barrier_constant_group_data_setup(
     /* Figure out how many of the same bcols are called in a row.
      * The index of the bcol in row we store in scratch_indx and
      * the total number of bcols in the row we store in scratch_num */
-    scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
+    scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
     if(NULL == scratch_indx) {
         ML_ERROR(("Can't allocate memory."));
         ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -498,6 +501,10 @@ int ml_coll_barrier_constant_group_data_setup(
         mca_bcol_base_module_t *current_bcol =
             component_functions[i].constant_group_data.bcol_module;

+        /* silence clang warning about possible NULL dereference of component_functions.
+         * this case is a developer error if it occurs */
+        assert (NULL != component_functions && NULL != constant_group_data);
+
         cnt = 0;
         for (j = 0; j < n_functions; ++j) {
             if (current_bcol ==
diff --git a/ompi/mca/coll/ml/coll_ml_mca.c b/ompi/mca/coll/ml/coll_ml_mca.c
index 0ee75e569c..618521caa9 100644
--- a/ompi/mca/coll/ml/coll_ml_mca.c
+++ b/ompi/mca/coll/ml/coll_ml_mca.c
@@ -251,6 +251,9 @@ int mca_coll_ml_register_params(void)
                                      "Algorithm to use for broadcast", MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                      OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                      &mca_coll_ml_component.bcast_algorithm);
+    if (0 > tmp) {
+        ret = tmp;
+    }

     CHECK(reg_bool("disable_allgather", NULL, "Disable Allgather", false,
                    &mca_coll_ml_component.disable_allgather));
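Several files in this patch switch scratch arrays from malloc to calloc, which is what lets the earlier hunks delete the explicit `scratch_indx[...] = 0` stores in the else branches: zero is now the default state. A small sketch of the idiom (same_bcol_as_previous is a stand-in predicate for IS_BCOL_TYPE_IDENTICAL):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        int n_hiers = 4;

        /* calloc returns zero-filled memory, so the branches that used to
         * store 0 into scratch_indx can simply disappear */
        int *scratch_indx = calloc(2 * n_hiers, sizeof(int));
        if (NULL == scratch_indx) {
            return 1;
        }

        /* only the "same bcol repeated" case needs a store now */
        for (int i = 1; i < 2 * n_hiers; ++i) {
            int same_bcol_as_previous = (0 == i % 2); /* stand-in predicate */
            if (same_bcol_as_previous) {
                scratch_indx[i] = scratch_indx[i - 1] + 1;
            }
            /* else: already 0 from calloc */
        }

        for (int i = 0; i < 2 * n_hiers; ++i) {
            printf("%d ", scratch_indx[i]);
        }
        printf("\n");

        free(scratch_indx);
        return 0;
    }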
diff --git a/ompi/mca/coll/ml/coll_ml_module.c b/ompi/mca/coll/ml/coll_ml_module.c
index 0bf217361b..dd7b8a8641 100644
--- a/ompi/mca/coll/ml/coll_ml_module.c
+++ b/ompi/mca/coll/ml/coll_ml_module.c
@@ -143,8 +141,6 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)

     ML_VERBOSE(4, ("ML module destruct"));

-    ml_coll_hier_reduce_cleanup(module);
-
     for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) {
         topo = &module->topo_list[index_topo];
         if (COLL_ML_TOPO_DISABLED == topo->status) {
@@ -230,6 +228,7 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
         ml_coll_hier_allreduce_cleanup_new(module);
         ml_coll_hier_allgather_cleanup(module);
         ml_coll_hier_bcast_cleanup(module);
+        ml_coll_hier_reduce_cleanup(module);

         /* release saved collectives */
         ML_RELEASE_FALLBACK(module, allreduce);
@@ -629,7 +628,7 @@ static int check_global_view_of_subgroups( int n_procs_selected,
     return ret;
 }

-static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
+static int ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
 {
     int *list_n_connected;
     int group_size, rank, i, j, knt, offset, k, my_sbgp = 0;
@@ -643,6 +642,10 @@ static int ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
     mca_coll_ml_leader_offset_info_t *loc_leader = (mca_coll_ml_leader_offset_info_t *)
         malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1));

+    if (NULL == loc_leader) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
     /* first thing I want to know is where does the first level end */
     level_one_knt = 0;
@@ -692,7 +695,11 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra

         /* malloc some memory for the new list to cache on the bcol module */
-        list_n_connected = (int *) malloc(sizeof(int)*group_size);
+        list_n_connected = (int *) calloc(group_size, sizeof (int));
+        if (NULL == list_n_connected) {
+            free (loc_leader);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }

         /* next thing to do is to find out which subgroup I'm in
          * at this particular level
@@ -803,6 +810,8 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra

     /* what other goodies do I want to cache on the ml-module? */
     topo->hier_layout_info = loc_leader;
+
+    return OMPI_SUCCESS;
 }

 static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
@@ -857,7 +866,6 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
     }

     if( my_rank == root ) {
-        sum=0;
         for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
             scratch_space[4*i]=(*array_of_all_subgroup_ranks)[i].root_rank_in_comm;
             scratch_space[4*i+1]=(*array_of_all_subgroup_ranks)[i].n_ranks;
@@ -899,7 +907,7 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
     for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
         sum+=(*array_of_all_subgroup_ranks)[i].n_ranks;
     }
-    if( in_num_total_subgroups != (*num_total_subgroups) ) {
+    if( in_num_total_subgroups != (*num_total_subgroups) && sum > 0 ) {
         (*list_of_ranks_in_all_subgroups)=(int *)
             realloc((*list_of_ranks_in_all_subgroups),sizeof(int)*sum);
         if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) {
@@ -1040,11 +1048,16 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
             PROVIDE_SUFFICIENT_MEMORY((*sub_group_meta_data), dummy1,
                                       (*size_of_sub_group_meta_data),
                                       sub_group_params_t, (*num_total_subgroups), 1, 5);
+            if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
+                ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
+                rc = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit_ERROR;
+            }
             /* do this for the temporary memory slots */
             PROVIDE_SUFFICIENT_MEMORY(temp, dummy2,
                                       knt1, int32_t *, knt2, 1, 5);
-            if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
-                ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
+            if (OPAL_UNLIKELY(NULL == temp)) {
+                ML_VERBOSE(10, ("Cannot allocate memory for temporary storage"));
                 rc = OMPI_ERR_OUT_OF_RESOURCE;
                 goto exit_ERROR;
             }
@@ -1053,7 +1066,7 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
         /* no need for this here - use a temporary ptr */
         temp[knt2]=
-            (int *)malloc(sizeof(int)*size_of_all_selected);
+            (int *)calloc(size_of_all_selected, sizeof(int));
         if (OPAL_UNLIKELY(NULL == temp[knt2] ) ){
             ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
             rc = OMPI_ERR_OUT_OF_RESOURCE;
@@ -1102,12 +1115,12 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
     }

     /* clean up temporary storage */
-    if(NULL != temp) {
+ exit_ERROR:
+    if (NULL != temp) {
         free(temp);
     }

     /* return */
- exit_ERROR:
     return rc;
 }
@@ -1716,6 +1729,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
     /* number of processes selected with this sbgp on all ranks */
     int global_n_procs_selected;

+    /* silence clang warnings */
+    assert (NULL != bcol_cli && NULL != sbgp_cli);
+
     /*
     ** obtain the list of ranks in the current level
     */
@@ -1985,6 +2001,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
              * set largest power of 2 for this group
              */
             module->n_levels_pow2 = ml_fls(module->group_size);
+            /* silence a clang warning */
+            assert (module->n_levels_pow2 > 0 && module->n_levels_pow2 < 32);
             module->pow_2 = 1 << module->n_levels_pow2;

             n_hier++;
@@ -2113,7 +2131,11 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
     topo->number_of_all_subgroups = num_total_subgroups;
     topo->array_of_all_subgroups = array_of_all_subgroup_ranks;

-    ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
+    ret = ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+        goto exit_ERROR;
+    }
+
     /* Set the route table if know-root type of algorithms is used */
     if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) {
         ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm);
@@ -2203,15 +2225,12 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
     sbgp_cli = (sbgp_base_component_keyval_t *)
         opal_list_get_first(&mca_sbgp_base_components_in_use);

-    for (bcol_cli = (mca_base_component_list_item_t *)
-             opal_list_get_first(&mca_bcol_base_components_in_use);
-         (opal_list_item_t *) bcol_cli !=
-             opal_list_get_end(&mca_bcol_base_components_in_use);
-         bcol_cli = (mca_base_component_list_item_t *)
-             opal_list_get_next((opal_list_item_t *) bcol_cli),
-         sbgp_cli = (sbgp_base_component_keyval_t *)
-             opal_list_get_next((opal_list_item_t *) sbgp_cli)) {
+    OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
         bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
+
+        /* silence false-positive clang warning */
+        assert (NULL != sbgp_cli);
+
         if (NULL != bcol_component->coll_support_all_types &&
             !bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) {
             mca_base_component_list_item_t *bcol_cli_next;
@@ -2256,6 +2275,8 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
                 &ml_module->topo_list[COLL_ML_HR_ALLREDUCE],
                 n_hierarchies, sbgp_component->sbgp_version.mca_component_name, NULL);
         }
+
+        sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
     }

     return OMPI_SUCCESS;
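The final two hunks keep the bcol and sbgp lists walking in lockstep: OPAL_LIST_FOREACH drives the bcol list while sbgp_cli is advanced by hand at the bottom of each iteration. A toy version of that parallel-iteration shape (EX_LIST_FOREACH and the ex_item lists are invented, assuming both lists have equal length):

    #include <stdio.h>

    struct ex_item {
        const char *name;
        struct ex_item *next;
    };

    #define EX_LIST_FOREACH(item, head) \
        for ((item) = (head); NULL != (item); (item) = (item)->next)

    int main(void)
    {
        struct ex_item b2 = { "p2p", NULL },      b1 = { "basesmuma", &b2 };
        struct ex_item s2 = { "p2p-sbgp", NULL }, s1 = { "basesmsocket", &s2 };
        struct ex_item *bcol, *sbgp = &s1;

        /* walk the bcol list with the foreach; step the parallel sbgp
         * list by hand at the bottom of each iteration, as the last hunk
         * above does with opal_list_get_next() */
        EX_LIST_FOREACH(bcol, &b1) {
            printf("%s / %s\n", bcol->name, sbgp->name);
            sbgp = sbgp->next;
        }
        return 0;
    }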