coll/ml: improve the buffer size calculation and ensure the bcol_index in a hierarchy actually matches a bcol that is in use.
There was a bug in one of the paths that calculate the ml buffer size. I fixed the bug and squashed all the paths together to avoid further issues (the result was already correct in another path that calculated the same value). Additionally, i_hier was being used as the bcol_index. This is not correct in a couple of cases, so I added a variable to keep track of the real bcol_index. cmr=v1.8:reviewer=pasha This commit was SVN r31189.
Этот коммит содержится в:
родитель
f1dd589092
Коммит
c7d830f4b9
@ -40,6 +40,7 @@
|
||||
#include "opal/datatype/opal_datatype.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
#include "coll_ml.h"
|
||||
#include "coll_ml_inlines.h"
|
||||
@ -449,7 +450,7 @@ static int calculate_buffer_header_size(mca_coll_ml_module_t *ml_module)
|
||||
}
|
||||
}
|
||||
|
||||
offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN;
|
||||
offset = OPAL_ALIGN(offset, BCOL_HEAD_ALIGN, uint32_t);
|
||||
/* select largest offset between multiple topologies */
|
||||
if (data_offset < (int) offset) {
|
||||
data_offset = (int) offset;
|
||||
@ -745,7 +746,6 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra
|
||||
/* now cache this on the bcol module */
|
||||
pair->bcol_modules[0]->list_n_connected = list_n_connected;
|
||||
|
||||
|
||||
/* I should do one more round here and figure out my offset at this level
|
||||
* the calculation is simple: Am I a local leader in this level? If so, then I keep the offset
|
||||
* from the previous level. Else, I find out how "far away" the local leader is from me and set
|
||||
@ -1322,6 +1322,7 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module,
|
||||
int64_t frag_size;
|
||||
const mca_bcol_base_component_2_0_0_t *bcol_component = NULL;
|
||||
mca_base_component_list_item_t *bcol_cli = NULL;
|
||||
int bcol_index;
|
||||
|
||||
/* If this assert fails, it means that you changed initialization
|
||||
* order and the date offset , that is critical for this section of code,
|
||||
@ -1332,9 +1333,9 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module,
|
||||
|
||||
/* need to figure out which bcol's are participating
|
||||
* in the hierarchy across the communicator, so that we can set
|
||||
* appropriate segmantation parameters.
|
||||
* appropriate segmentation parameters.
|
||||
*/
|
||||
bcols_in_use = (int *) malloc(sizeof(int) * 2 * n_hierarchies);
|
||||
bcols_in_use = (int *) calloc(2 * n_hierarchies, sizeof(int));
|
||||
if (OPAL_UNLIKELY(NULL == bcols_in_use)) {
|
||||
ML_VERBOSE(10, ("Cannot allocate memory for bcols_in_use."));
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -1348,10 +1349,6 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module,
|
||||
bcols_in_use_all_ranks = bcols_in_use+n_hierarchies;
|
||||
|
||||
/* get list of bcols that I am using */
|
||||
for(i = 0; i < n_hierarchies; i++) {
|
||||
bcols_in_use[i] = 0;
|
||||
}
|
||||
|
||||
for (j = 0; j < COLL_ML_TOPO_MAX; j++) {
|
||||
mca_coll_ml_topology_t *topo_info = &ml_module->topo_list[j];
|
||||
if (COLL_ML_TOPO_DISABLED == topo_info->status) {
|
||||
@ -1391,116 +1388,63 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module,
|
||||
/*
|
||||
* figure out fragmenation parameters
|
||||
*/
|
||||
/* can user buffers be used */
|
||||
use_user_bufs = true;
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
|
||||
for (i = 0; i < n_hierarchies; i++) {
|
||||
if(!bcols_in_use_all_ranks) {
|
||||
/* this bcol is not being used - do nothing */
|
||||
continue;
|
||||
}
|
||||
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
|
||||
/* check to see if user buffers can be used */
|
||||
if(!bcol_component->can_use_user_buffers) {
|
||||
/* need to use library buffers, so all will do this */
|
||||
use_user_bufs = false;
|
||||
break;
|
||||
}
|
||||
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
||||
}
|
||||
|
||||
/* size of ml buffer */
|
||||
length_ml_payload = mca_coll_ml_component.payload_buffer_size - ml_module->data_offset;
|
||||
|
||||
if (use_user_bufs) {
|
||||
/*
|
||||
* using user buffers
|
||||
*/
|
||||
ml_module->use_user_buffers = 1;
|
||||
|
||||
/* figure out if data will be segmented for pipelining -
|
||||
* for non-contigous data will just use a fragment the size
|
||||
* of the ml payload buffer */
|
||||
|
||||
/* check to see if any bcols impose a limit */
|
||||
limit_size_user_bufs = false;
|
||||
bcol_cli = (mca_base_component_list_item_t *)opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
for (i = 0; i < n_hierarchies; i++) {
|
||||
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
|
||||
if(bcol_component->max_frag_size != FRAG_SIZE_NO_LIMIT ){
|
||||
limit_size_user_bufs = true;
|
||||
break;
|
||||
}
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
||||
}
|
||||
use_user_bufs = true;
|
||||
frag_size = length_ml_payload;
|
||||
bcol_index = 0;
|
||||
|
||||
if (limit_size_user_bufs) {
|
||||
/* figure out fragement size */
|
||||
frag_size = 0;
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
for (i = 0; i < n_hierarchies; i++) {
|
||||
OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
|
||||
/* check to see if this bcol is being used */
|
||||
if(!bcols_in_use_all_ranks[i]) {
|
||||
if(!bcols_in_use_all_ranks[bcol_index]) {
|
||||
/* not in use */
|
||||
continue;
|
||||
}
|
||||
|
||||
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
|
||||
if (FRAG_SIZE_NO_LIMIT == bcol_component->max_frag_size) {
|
||||
/* no limit - will not determine fragement size */
|
||||
continue;
|
||||
|
||||
/* check to see if user buffers can be used */
|
||||
if (!bcol_component->can_use_user_buffers) {
|
||||
/* need to use library buffers, so all will do this */
|
||||
use_user_bufs = false;
|
||||
}
|
||||
if (0 != bcol_component->max_frag_size) {
|
||||
|
||||
/* figure out fragement size */
|
||||
if (bcol_component->max_frag_size != FRAG_SIZE_NO_LIMIT ){
|
||||
/* user buffers need to be limited in size */
|
||||
limit_size_user_bufs = true;
|
||||
|
||||
if (0 == frag_size) {
|
||||
/* nothing set yet */
|
||||
frag_size = bcol_component->max_frag_size;
|
||||
} else {
|
||||
if(frag_size < bcol_component->max_frag_size) {
|
||||
} else if (frag_size < bcol_component->max_frag_size) {
|
||||
/* stricter constraint on fragment size */
|
||||
frag_size = bcol_component->max_frag_size;
|
||||
}
|
||||
}
|
||||
bcol_cli = (mca_base_component_list_item_t *)opal_list_get_next((opal_list_item_t *)bcol_cli);
|
||||
}
|
||||
|
||||
if (!use_user_bufs || limit_size_user_bufs) {
|
||||
/* we need to limit the user buffer size or use library buffers */
|
||||
ml_module->fragment_size = frag_size;
|
||||
} else {
|
||||
/* entire message may be processed in single chunk */
|
||||
ml_module->fragment_size = FRAG_SIZE_NO_LIMIT;
|
||||
}
|
||||
|
||||
/* for non-contigous data - just use the ML buffers */
|
||||
ml_module->ml_fragment_size = length_ml_payload;
|
||||
|
||||
} else {
|
||||
/*
|
||||
* using library buffers
|
||||
*/
|
||||
ml_module->use_user_buffers = 0;
|
||||
|
||||
/* figure out buffer size */
|
||||
ml_module->fragment_size = length_ml_payload;
|
||||
/* see if this is too large */
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
|
||||
for (i = 0; i < n_hierarchies; i++) {
|
||||
/* check to see if this bcol is being used */
|
||||
if(!bcols_in_use_all_ranks[i]) {
|
||||
/* not in use */
|
||||
continue;
|
||||
}
|
||||
bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component;
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
||||
if (FRAG_SIZE_NO_LIMIT == bcol_component->max_frag_size) {
|
||||
/* no limit - will not affect fragement size */
|
||||
continue;
|
||||
}
|
||||
if (bcol_component->max_frag_size < (int)ml_module->fragment_size)
|
||||
{
|
||||
/* frag size set too large */
|
||||
ml_module->fragment_size = bcol_component->max_frag_size;
|
||||
}
|
||||
}
|
||||
/* for non-contigous data - just use the ML buffers */
|
||||
ml_module->ml_fragment_size = ml_module->fragment_size;
|
||||
}
|
||||
/* set whether we can use user buffers */
|
||||
ml_module->use_user_buffers = use_user_bufs;
|
||||
|
||||
ML_VERBOSE(10, ("Seting payload size to %d %d [%d %d]",
|
||||
ml_module->ml_fragment_size, length_ml_payload,
|
||||
@ -1638,7 +1582,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
const mca_sbgp_base_component_2_0_0_t *sbgp_component = NULL;
|
||||
|
||||
|
||||
int i_hier = 0, n_hier = 0, ll_p1,
|
||||
int i_hier = 0, n_hier = 0, ll_p1, bcol_index = 0,
|
||||
n_procs_in = 0, group_index = 0, n_remain = 0,
|
||||
i, j, ret = OMPI_SUCCESS, my_rank_in_list = 0,
|
||||
n_procs_selected = 0, original_group_size = 0, i_am_done = 0,
|
||||
@ -1992,7 +1936,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
pair->bcol_component = (mca_bcol_base_component_t *)
|
||||
((mca_base_component_list_item_t *) bcol_cli)->cli_component;
|
||||
|
||||
pair->bcol_index = i_hier;
|
||||
pair->bcol_index = bcol_index;
|
||||
|
||||
/* create bcol modules */
|
||||
ML_VERBOSE(10, ("Create bcol modules."));
|
||||
@ -2026,7 +1970,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
module->group_size;
|
||||
|
||||
/* set the bcol id */
|
||||
pair->bcol_modules[i]->bcol_id = (int16_t) i_hier;
|
||||
pair->bcol_modules[i]->bcol_id = (int16_t) bcol_index;
|
||||
|
||||
/* Set bcol mode bits */
|
||||
topo->all_bcols_mode &= (( mca_bcol_base_module_t *) pair->bcol_modules[i])->supported_mode;
|
||||
@ -2041,10 +1985,10 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
n_hier++;
|
||||
|
||||
if (-1 == my_lowest_group_index) {
|
||||
my_lowest_group_index = i_hier;
|
||||
my_lowest_group_index = bcol_index;
|
||||
}
|
||||
|
||||
my_highest_group_index = i_hier;
|
||||
my_highest_group_index = bcol_index;
|
||||
}
|
||||
|
||||
/* if n_remain is 1, and the communicator size is not 1, and module
|
||||
@ -2077,6 +2021,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
i_hier++;
|
||||
}
|
||||
|
||||
++bcol_index;
|
||||
|
||||
n_procs_in = n_remain;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user