coll/ml: fix issues identified by the clang static analyser and fix
a segmentation fault in the reduce cleanup. Some of the changes address false warnings produced by scan-build. I added asserts and changed some malloc calls to calloc to silence these warnings. There was one issue in cleanup for reduce since the component_functions member is changed by the allreduce call. There may be other issues with how this code works, but releasing the allocated component_functions after setting up the static functions addresses the primary issue (SIGSEGV). cmr=v1.8.1:reviewer=manjugv This commit was SVN r31417.
Этот коммит содержится в:
родитель
f80aece271
Коммит
484a3f6147
@ -456,14 +456,14 @@ int mca_coll_ml_allgather_start (void *sbuf, int scount,
|
||||
|
||||
mca_coll_ml_convertor_pack(
|
||||
(void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
|
||||
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
|
||||
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
|
||||
(topo_info->hier_layout_info[0].offset +
|
||||
topo_info->hier_layout_info[0].level_one_index)),
|
||||
frag_len, &coll_op->full_message.send_convertor);
|
||||
} else {
|
||||
/* change 6 */
|
||||
memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
|
||||
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
|
||||
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
|
||||
(topo_info->hier_layout_info[0].offset +
|
||||
topo_info->hier_layout_info[0].level_one_index)),
|
||||
sbuf, frag_len);
|
||||
}
|
||||
|
||||
|
@ -88,6 +88,10 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
|
||||
uint64_t addr_offset = 0;
|
||||
mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,*pbuff_desc = NULL;
|
||||
|
||||
if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
|
||||
return OMPI_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
if (NULL == ml_memblock){
|
||||
ML_ERROR(("Memory block not initialized"));
|
||||
ret = OMPI_ERROR;
|
||||
@ -102,6 +106,9 @@ int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
|
||||
|
||||
pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*) malloc(sizeof(mca_bcol_base_payload_buffer_desc_t)
|
||||
* num_banks * num_buffers);
|
||||
if (NULL == pbuff_descs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for(bank_loop = 0; bank_loop < num_banks; bank_loop++)
|
||||
for(buff_loop = 0; buff_loop < num_buffers; buff_loop++){
|
||||
|
@ -2,6 +2,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -466,6 +468,7 @@ do {
|
||||
for (i = 0; i < (schedule)->n_fns; ++i) { \
|
||||
mca_bcol_base_module_t *current_bcol = \
|
||||
(schedule)->component_functions[i].constant_group_data.bcol_module; \
|
||||
assert (NULL != current_bcol); \
|
||||
if (current_bcol->bcol_component->need_ordering) { \
|
||||
(schedule)->n_fns_need_ordering++; \
|
||||
} \
|
||||
|
@ -182,6 +182,10 @@ static int coll_ml_progress()
|
||||
}
|
||||
} else {
|
||||
rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
mca_coll_ml_abort_ml("Failed to run sequential task setup");
|
||||
}
|
||||
|
||||
seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
|
||||
continue;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -43,6 +43,7 @@ typedef struct coll_config_t {
|
||||
|
||||
static int algorithm_name_to_id(char *name)
|
||||
{
|
||||
assert (NULL != name);
|
||||
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN"))
|
||||
return ML_BCAST_SMALL_DATA_KNOWN;
|
||||
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN"))
|
||||
@ -110,6 +111,7 @@ static int algorithm_name_to_id(char *name)
|
||||
|
||||
static int hierarchy_name_to_id(char *name)
|
||||
{
|
||||
assert (NULL != name);
|
||||
if (!strcasecmp(name, "FULL_HR")) {
|
||||
return COLL_ML_HR_FULL;
|
||||
}
|
||||
@ -128,6 +130,7 @@ static int hierarchy_name_to_id(char *name)
|
||||
|
||||
static int section_name_to_id(char *name)
|
||||
{
|
||||
assert (NULL != name);
|
||||
if (!strcasecmp(name, "SMALL")) {
|
||||
return ML_SMALL_MSG;
|
||||
}
|
||||
@ -141,6 +144,7 @@ static int section_name_to_id(char *name)
|
||||
|
||||
static int coll_name_to_id(char *name)
|
||||
{
|
||||
assert (NULL != name);
|
||||
if (!strcasecmp(name, "ALLGATHER")) {
|
||||
return ML_ALLGATHER;
|
||||
}
|
||||
@ -339,6 +343,8 @@ static int parse_algorithm_key(section_config_t *section, char *value)
|
||||
|
||||
static int parse_threshold_key(section_config_t *section, char *value)
|
||||
{
|
||||
assert (NULL != value);
|
||||
|
||||
if(!strcasecmp(value, "unlimited")) {
|
||||
section->config.threshold = -1;
|
||||
} else {
|
||||
@ -364,6 +370,8 @@ static int parse_hierarchy_key(section_config_t *section, char *value)
|
||||
|
||||
static int parse_fragmentation_key(section_config_t *section, char *value)
|
||||
{
|
||||
assert (NULL != value);
|
||||
|
||||
if(!strcasecmp(value, "enable")) {
|
||||
section->config.fragmentation_enabled = 1;
|
||||
} else if (!strcasecmp(value, "disable")) {
|
||||
|
@ -1,6 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -68,6 +71,8 @@ int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_m
|
||||
if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
|
||||
ML_ERROR(("Memory allocation failed."));
|
||||
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM, true);
|
||||
/* not reached but causes a clang warning to not return here */
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (i = 0; i < comm_size; ++i) {
|
||||
@ -114,12 +119,9 @@ int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_m
|
||||
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
|
||||
{
|
||||
mca_base_component_list_item_t *bcol_comp;
|
||||
bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
|
||||
ML_VERBOSE(10, ("Loop over bcol components"));
|
||||
for ( bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
bcol_comp != (mca_base_component_list_item_t *) opal_list_get_end(&mca_bcol_base_components_in_use);
|
||||
bcol_comp = (mca_base_component_list_item_t *) opal_list_get_next(bcol_comp)) {
|
||||
OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
|
||||
if(0 == strcmp(component_name,
|
||||
((mca_bcol_base_component_2_0_0_t *)
|
||||
bcol_comp->cli_component)->bcol_version.mca_component_name)) {
|
||||
|
@ -1,6 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,13 +32,12 @@ static int mca_coll_ml_build_memsync_schedule(
|
||||
mca_coll_ml_collective_operation_description_t *schedule;
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
|
||||
schedule = *coll_desc;
|
||||
if (OPAL_UNLIKELY(NULL == schedule)) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Barrier_Setup_Error;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (topo_info->global_highest_hier_group_index ==
|
||||
@ -168,6 +170,9 @@ Barrier_Setup_Error:
|
||||
schedule->component_functions = NULL;
|
||||
}
|
||||
|
||||
free (schedule);
|
||||
*coll_desc = NULL;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -106,9 +106,7 @@ static int mca_coll_ml_build_allgather_schedule(mca_coll_ml_topology_t *topo_inf
|
||||
if (NULL != scratch_num) {
|
||||
free(scratch_num);
|
||||
}
|
||||
if (NULL != schedule->component_functions) {
|
||||
free(schedule->component_functions);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -65,15 +65,14 @@ static int mca_coll_ml_build_allreduce_schedule(
|
||||
}
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
schedule = *coll_desc;
|
||||
if (NULL == schedule) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Allreduce_Setup_Error;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers * 2));
|
||||
scratch_indx = (int *) calloc(n_hiers * 2, sizeof (int));
|
||||
if (NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -93,7 +92,6 @@ static int mca_coll_ml_build_allreduce_schedule(
|
||||
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
|
||||
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
|
||||
} else {
|
||||
scratch_indx[cnt] = 0;
|
||||
prev_bcol = GET_BCOL(topo_info, i);
|
||||
}
|
||||
}
|
||||
@ -103,7 +101,6 @@ static int mca_coll_ml_build_allreduce_schedule(
|
||||
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) {
|
||||
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
|
||||
} else {
|
||||
scratch_indx[cnt] = 0;
|
||||
prev_bcol = GET_BCOL(topo_info, n_hiers - 1);
|
||||
}
|
||||
|
||||
@ -115,7 +112,6 @@ static int mca_coll_ml_build_allreduce_schedule(
|
||||
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
|
||||
scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
|
||||
} else {
|
||||
scratch_indx[cnt] = 0;
|
||||
prev_bcol = GET_BCOL(topo_info, i);
|
||||
}
|
||||
}
|
||||
@ -282,6 +278,8 @@ Allreduce_Setup_Error:
|
||||
if (NULL != schedule->component_functions) {
|
||||
free(schedule->component_functions);
|
||||
}
|
||||
*coll_desc = NULL;
|
||||
free (schedule);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -89,6 +89,7 @@ static int mca_coll_ml_build_barrier_schedule(
|
||||
|
||||
if (NULL == comp_fn->bcol_function) {
|
||||
ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN"));
|
||||
rc = OMPI_ERR_NOT_AVAILABLE;
|
||||
goto Barrier_Setup_Error;
|
||||
}
|
||||
|
||||
@ -105,6 +106,7 @@ static int mca_coll_ml_build_barrier_schedule(
|
||||
|
||||
if (NULL == comp_fn->bcol_function) {
|
||||
ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN"));
|
||||
rc = OMPI_ERR_NOT_AVAILABLE;
|
||||
goto Barrier_Setup_Error;
|
||||
}
|
||||
|
||||
@ -125,6 +127,7 @@ static int mca_coll_ml_build_barrier_schedule(
|
||||
|
||||
if (NULL == comp_fn->bcol_function) {
|
||||
ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN"));
|
||||
rc = OMPI_ERR_NOT_AVAILABLE;
|
||||
goto Barrier_Setup_Error;
|
||||
}
|
||||
|
||||
|
@ -145,15 +145,14 @@ static int mca_coll_ml_build_bcast_dynamic_schedule_no_attributes(
|
||||
*bcol_module;
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
schedule = *coll_desc;
|
||||
if (NULL == schedule) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Bcast_Setup_Error;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
|
||||
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
|
||||
if (NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -315,15 +314,14 @@ static int mca_coll_ml_build_bcast_sequential_schedule_no_attributes(
|
||||
*bcol_module;
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
schedule = *coll_desc;
|
||||
if (NULL == schedule) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Bcast_Setup_Error;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
|
||||
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
|
||||
if (NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -520,6 +518,8 @@ Bcast_Setup_Error:
|
||||
if (NULL != schedule->comp_fn_arr) {
|
||||
free(schedule->comp_fn_arr);
|
||||
}
|
||||
free (schedule);
|
||||
*coll_desc = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -569,15 +569,14 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
|
||||
*bcol_module;
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
schedule = *coll_desc;
|
||||
if (NULL == schedule) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Bcast_Setup_Error;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
|
||||
scratch_indx = (int *) calloc(n_hiers, sizeof (int));
|
||||
if (NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -598,7 +597,6 @@ static int mca_coll_ml_build_bcast_known_schedule_no_attributes(
|
||||
if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) {
|
||||
scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1;
|
||||
} else {
|
||||
scratch_indx[i_hier] = 0;
|
||||
prev_bcol = GET_BCOL(topo_info, i_hier);
|
||||
}
|
||||
}
|
||||
@ -725,6 +723,8 @@ Bcast_Setup_Error:
|
||||
if (NULL != schedule->component_functions) {
|
||||
free(schedule->component_functions);
|
||||
}
|
||||
free (schedule);
|
||||
*coll_desc = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -813,6 +813,8 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
|
||||
int topo_index = 0;
|
||||
mca_coll_ml_topology_t *topo_info = ml_module->topo_list;
|
||||
|
||||
assert (NULL != ml_module);
|
||||
|
||||
for (i = 0; i < ML_NUM_MSG; i++) {
|
||||
|
||||
switch (i) {
|
||||
@ -832,30 +834,18 @@ void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module)
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL == ml_module->coll_ml_bcast_functions[alg]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (alg) {
|
||||
case ML_BCAST_SMALL_DATA_KNOWN:
|
||||
case ML_BCAST_LARGE_DATA_KNOWN:
|
||||
case ML_BCAST_SMALL_DATA_UNKNOWN:
|
||||
case ML_BCAST_LARGE_DATA_UNKNOWN:
|
||||
case ML_BCAST_SMALL_DATA_SEQUENTIAL:
|
||||
case ML_BCAST_LARGE_DATA_SEQUENTIAL:
|
||||
if (NULL != ml_module->coll_ml_bcast_functions[alg]) {
|
||||
if (ML_BCAST_SMALL_DATA_KNOWN <= alg && ML_BCAST_LARGE_DATA_SEQUENTIAL >= alg) {
|
||||
if (ml_module->coll_ml_bcast_functions[alg]->component_functions) {
|
||||
free(ml_module->coll_ml_bcast_functions[alg]->component_functions);
|
||||
ml_module->coll_ml_bcast_functions[alg]->component_functions = NULL;
|
||||
}
|
||||
|
||||
if (ml_module->coll_ml_bcast_functions[alg]) {
|
||||
free(ml_module->coll_ml_bcast_functions[alg]);
|
||||
ml_module->coll_ml_bcast_functions[alg] = NULL;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
free(ml_module->coll_ml_bcast_functions[alg]);
|
||||
ml_module->coll_ml_bcast_functions[alg] = NULL;
|
||||
} else {
|
||||
topo_info->hierarchical_algorithms[ML_BCAST] = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -74,7 +74,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
|
||||
mca_coll_ml_collective_operation_description_t *schedule = NULL;
|
||||
|
||||
*coll_desc = (mca_coll_ml_collective_operation_description_t *)
|
||||
malloc(sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
calloc(1, sizeof(mca_coll_ml_collective_operation_description_t));
|
||||
|
||||
schedule = *coll_desc;
|
||||
if (OPAL_UNLIKELY(NULL == schedule)) {
|
||||
@ -83,7 +83,7 @@ static int mca_coll_ml_build_static_reduce_schedule(
|
||||
goto Error;
|
||||
}
|
||||
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
|
||||
scratch_indx = (int *) calloc (n_hiers, sizeof (int));
|
||||
if (NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -267,15 +267,32 @@ static int mca_coll_ml_build_static_reduce_schedule(
|
||||
|
||||
MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);
|
||||
|
||||
/* reduce does not use the component functions so we no longer need this. see
|
||||
* coll_ml_reduce.c:442 */
|
||||
free (schedule->component_functions);
|
||||
schedule->component_functions = NULL;
|
||||
|
||||
free(scratch_num);
|
||||
free(scratch_indx);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
Error:
|
||||
if (NULL != schedule->component_functions) {
|
||||
free(schedule->component_functions);
|
||||
schedule->component_functions = NULL;
|
||||
if (NULL != scratch_num) {
|
||||
free (scratch_num);
|
||||
}
|
||||
|
||||
if (NULL != scratch_indx) {
|
||||
free (scratch_indx);
|
||||
}
|
||||
|
||||
if (NULL != schedule) {
|
||||
if (NULL != schedule->component_functions) {
|
||||
free(schedule->component_functions);
|
||||
schedule->component_functions = NULL;
|
||||
}
|
||||
free (schedule);
|
||||
*coll_desc = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -335,25 +352,20 @@ void ml_coll_hier_reduce_cleanup(mca_coll_ml_module_t *ml_module)
|
||||
return;
|
||||
}
|
||||
|
||||
for (i=0; i<ml_module->topo_list[topo_index].n_levels; i++) {
|
||||
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
|
||||
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
|
||||
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr) {
|
||||
for (i=0; i<ml_module->topo_list[topo_index].n_levels; i++) {
|
||||
if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) {
|
||||
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]);
|
||||
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr);
|
||||
ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr = NULL;
|
||||
}
|
||||
|
||||
if (ml_module->coll_ml_reduce_functions[alg]->component_functions) {
|
||||
free(ml_module->coll_ml_reduce_functions[alg]->component_functions);
|
||||
ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;
|
||||
}
|
||||
ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL;
|
||||
|
||||
if (ml_module->coll_ml_reduce_functions[alg]) {
|
||||
free(ml_module->coll_ml_reduce_functions[alg]);
|
||||
ml_module->coll_ml_reduce_functions[alg] = NULL;
|
||||
}
|
||||
free(ml_module->coll_ml_reduce_functions[alg]);
|
||||
ml_module->coll_ml_reduce_functions[alg] = NULL;
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -76,7 +79,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
|
||||
|
||||
/* allocate space for the functions */
|
||||
collective_alg->functions = (mca_bcol_base_function_t *)
|
||||
malloc(sizeof(mca_bcol_base_function_t) * collective_alg->n_functions);
|
||||
calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t));
|
||||
if( NULL == collective_alg->functions) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -98,7 +101,7 @@ int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
|
||||
/* Figure out how many of the same bcols are called in a row.
|
||||
* The index of the bcol in row we store in scratch_indx and
|
||||
* the total number of bcols in the row we store in scratch_num */
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
|
||||
scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
|
||||
if(NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -373,7 +376,7 @@ int ml_coll_barrier_constant_group_data_setup(
|
||||
/* Figure out how many of the same bcols are called in a row.
|
||||
* The index of the bcol in row we store in scratch_indx and
|
||||
* the total number of bcols in the row we store in scratch_num */
|
||||
scratch_indx = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
|
||||
scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
|
||||
if(NULL == scratch_indx) {
|
||||
ML_ERROR(("Can't allocate memory."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -498,6 +501,10 @@ int ml_coll_barrier_constant_group_data_setup(
|
||||
mca_bcol_base_module_t *current_bcol =
|
||||
component_functions[i].constant_group_data.bcol_module;
|
||||
|
||||
/* silence clang warning about possible NULL dereference of component_functions.
|
||||
* this case is a developer error if it occurs */
|
||||
assert (NULL != component_functions && NULL != constant_group_data);
|
||||
|
||||
cnt = 0;
|
||||
for (j = 0; j < n_functions; ++j) {
|
||||
if (current_bcol ==
|
||||
|
@ -251,6 +251,9 @@ int mca_coll_ml_register_params(void)
|
||||
"Algorithm to use for broadcast", MCA_BASE_VAR_TYPE_INT,
|
||||
new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_coll_ml_component.bcast_algorithm);
|
||||
if (0 > tmp) {
|
||||
ret = tmp;
|
||||
}
|
||||
|
||||
CHECK(reg_bool("disable_allgather", NULL, "Disable Allgather", false,
|
||||
&mca_coll_ml_component.disable_allgather));
|
||||
|
@ -143,8 +143,6 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
|
||||
|
||||
ML_VERBOSE(4, ("ML module destruct"));
|
||||
|
||||
ml_coll_hier_reduce_cleanup(module);
|
||||
|
||||
for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) {
|
||||
topo = &module->topo_list[index_topo];
|
||||
if (COLL_ML_TOPO_DISABLED == topo->status) {
|
||||
@ -230,6 +228,7 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module)
|
||||
ml_coll_hier_allreduce_cleanup_new(module);
|
||||
ml_coll_hier_allgather_cleanup(module);
|
||||
ml_coll_hier_bcast_cleanup(module);
|
||||
ml_coll_hier_reduce_cleanup(module);
|
||||
|
||||
/* release saved collectives */
|
||||
ML_RELEASE_FALLBACK(module, allreduce);
|
||||
@ -629,7 +628,7 @@ static int check_global_view_of_subgroups( int n_procs_selected,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
|
||||
static int ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list)
|
||||
{
|
||||
int *list_n_connected;
|
||||
int group_size, rank, i, j, knt, offset, k, my_sbgp = 0;
|
||||
@ -643,6 +642,10 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
|
||||
mca_coll_ml_leader_offset_info_t *loc_leader = (mca_coll_ml_leader_offset_info_t *)
|
||||
malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1));
|
||||
|
||||
if (NULL == loc_leader) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* first thing I want to know is where does the first level end */
|
||||
level_one_knt = 0;
|
||||
|
||||
@ -692,7 +695,11 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
|
||||
/* malloc some memory for the new list to cache
|
||||
on the bcol module
|
||||
*/
|
||||
list_n_connected = (int *) malloc(sizeof(int)*group_size);
|
||||
list_n_connected = (int *) calloc(group_size, sizeof (int));
|
||||
if (NULL == list_n_connected) {
|
||||
free (loc_leader);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* next thing to do is to find out which subgroup I'm in
|
||||
* at this particular level
|
||||
@ -803,6 +810,8 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
|
||||
|
||||
/* what other goodies do I want to cache on the ml-module? */
|
||||
topo->hier_layout_info = loc_leader;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
|
||||
@ -857,7 +866,6 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
|
||||
}
|
||||
|
||||
if( my_rank == root ) {
|
||||
sum=0;
|
||||
for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
|
||||
scratch_space[4*i]=(*array_of_all_subgroup_ranks)[i].root_rank_in_comm;
|
||||
scratch_space[4*i+1]=(*array_of_all_subgroup_ranks)[i].n_ranks;
|
||||
@ -899,7 +907,7 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo,
|
||||
for(i=0 ; i < (*num_total_subgroups) ; i++ ) {
|
||||
sum+=(*array_of_all_subgroup_ranks)[i].n_ranks;
|
||||
}
|
||||
if( in_num_total_subgroups != (*num_total_subgroups) ) {
|
||||
if( in_num_total_subgroups != (*num_total_subgroups) && sum > 0 ) {
|
||||
(*list_of_ranks_in_all_subgroups)=(int *)
|
||||
realloc((*list_of_ranks_in_all_subgroups),sizeof(int)*sum);
|
||||
if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) {
|
||||
@ -1040,11 +1048,16 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
|
||||
PROVIDE_SUFFICIENT_MEMORY((*sub_group_meta_data), dummy1,
|
||||
(*size_of_sub_group_meta_data),
|
||||
sub_group_params_t, (*num_total_subgroups), 1, 5);
|
||||
if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
|
||||
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto exit_ERROR;
|
||||
}
|
||||
/* do this for the temporary memory slots */
|
||||
PROVIDE_SUFFICIENT_MEMORY(temp, dummy2,
|
||||
knt1, int32_t *, knt2, 1, 5);
|
||||
if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) {
|
||||
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
|
||||
if (OPAL_UNLIKELY(NULL == temp)) {
|
||||
ML_VERBOSE(10, ("Cannot allocate memory for temporary storage"));
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto exit_ERROR;
|
||||
}
|
||||
@ -1053,7 +1066,7 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
|
||||
|
||||
/* no need for this here - use a temporary ptr */
|
||||
temp[knt2]=
|
||||
(int *)malloc(sizeof(int)*size_of_all_selected);
|
||||
(int *)calloc(size_of_all_selected, sizeof(int));
|
||||
if (OPAL_UNLIKELY(NULL == temp[knt2] ) ){
|
||||
ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data."));
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -1102,12 +1115,12 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte
|
||||
}
|
||||
|
||||
/* clean up temporary storage */
|
||||
if(NULL != temp) {
|
||||
exit_ERROR:
|
||||
if (NULL != temp) {
|
||||
free(temp);
|
||||
}
|
||||
|
||||
/* return */
|
||||
exit_ERROR:
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -1716,6 +1729,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
/* number of processes selected with this sbgp on all ranks */
|
||||
int global_n_procs_selected;
|
||||
|
||||
/* silence clang warnings */
|
||||
assert (NULL != bcol_cli && NULL != sbgp_cli);
|
||||
|
||||
/*
|
||||
** obtain the list of ranks in the current level
|
||||
*/
|
||||
@ -1985,6 +2001,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
* set largest power of 2 for this group
|
||||
*/
|
||||
module->n_levels_pow2 = ml_fls(module->group_size);
|
||||
/* silence a clang warning */
|
||||
assert (module->n_levels_pow2 > 0 && module->n_levels_pow2 < 32);
|
||||
module->pow_2 = 1 << module->n_levels_pow2;
|
||||
|
||||
n_hier++;
|
||||
@ -2113,7 +2131,11 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
topo->number_of_all_subgroups = num_total_subgroups;
|
||||
topo->array_of_all_subgroups = array_of_all_subgroup_ranks;
|
||||
|
||||
ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
|
||||
ret = ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
goto exit_ERROR;
|
||||
}
|
||||
|
||||
/* Set the route table if know-root type of algorithms is used */
|
||||
if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) {
|
||||
ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm);
|
||||
@ -2203,15 +2225,12 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
sbgp_cli = (sbgp_base_component_keyval_t *)
|
||||
opal_list_get_first(&mca_sbgp_base_components_in_use);
|
||||
|
||||
for (bcol_cli = (mca_base_component_list_item_t *)
|
||||
opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
(opal_list_item_t *) bcol_cli !=
|
||||
opal_list_get_end(&mca_bcol_base_components_in_use);
|
||||
bcol_cli = (mca_base_component_list_item_t *)
|
||||
opal_list_get_next((opal_list_item_t *) bcol_cli),
|
||||
sbgp_cli = (sbgp_base_component_keyval_t *)
|
||||
opal_list_get_next((opal_list_item_t *) sbgp_cli)) {
|
||||
OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
|
||||
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
|
||||
|
||||
/* silence false-positive clang warning */
|
||||
assert (NULL != sbgp_cli);
|
||||
|
||||
if (NULL != bcol_component->coll_support_all_types &&
|
||||
!bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) {
|
||||
mca_base_component_list_item_t *bcol_cli_next;
|
||||
@ -2256,6 +2275,8 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
&ml_module->topo_list[COLL_ML_HR_ALLREDUCE],
|
||||
n_hierarchies, sbgp_component->sbgp_version.mca_component_name, NULL);
|
||||
}
|
||||
|
||||
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user