1
1

coll/ml: further improve the hierarchy discovery to handle the case where a

sbgp module fails to group any processes on any nodes.

cmr=v1.7.5:reviewer=manjugv

This commit was SVN r31131.
Этот коммит содержится в:
Nathan Hjelm 2014-03-18 21:26:24 +00:00
родитель 8b2d723fd4
Коммит e030443d45

Просмотреть файл

@ -1654,7 +1654,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
sub_group_params_t *array_of_all_subgroup_ranks=NULL;
/* this pointer should probably be an int32_t and not an int type */
int32_t *list_of_ranks_in_all_subgroups=NULL;
int cum_number_ranks_in_all_subgroups=0,num_total_subgroups=0;
int num_ranks_in_all_subgroups=0,num_total_subgroups=0;
int size_of_array_of_all_subgroup_ranks=0;
int size_of_list_of_ranks_in_all_subgroups=0;
int32_t in_allgather_value;
@ -1761,6 +1761,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
i_hier = 0;
while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){
/* number of processes selected with this sbgp on all ranks */
int global_n_procs_selected;
/*
** obtain the list of ranks in the current level
*/
@ -1842,7 +1845,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
my_rank_in_subgroup=-1;
ll_p1=-1;
in_allgather_value = 0;
if( n_procs_selected) {
if (n_procs_selected) {
/* I need to contribute to the vector */
for (group_index = 0; group_index < n_procs_selected; group_index++) {
/* set my rank within the group */
@ -1925,12 +1928,13 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
* accumulate data on the new subgroups created
*/
/*XXX*/
global_n_procs_selected = num_ranks_in_all_subgroups;
ret = get_new_subgroup_data(all_selected, n_procs_in,
&array_of_all_subgroup_ranks,
&size_of_array_of_all_subgroup_ranks,
&list_of_ranks_in_all_subgroups,
&size_of_list_of_ranks_in_all_subgroups,
&cum_number_ranks_in_all_subgroups,
&num_ranks_in_all_subgroups,
&num_total_subgroups, map_to_comm_ranks,i_hier);
if( OMPI_SUCCESS != ret ) {
@ -1938,6 +1942,11 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
goto exit_ERROR;
}
/* the global number of processes selected at this level is the difference
* in the number of procs in all subgroups between this level and the
* last */
global_n_procs_selected = num_ranks_in_all_subgroups - global_n_procs_selected;
/* am I done ? */
i_am_done=0;
if ( (all_selected[my_rank_in_list] == ll_p1) &&
@ -1969,8 +1978,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
if ((1 == n_procs_selected) && n_remain > 1) {
OBJ_RELEASE(module);
n_procs_selected = 0;
/* at least one process was sselected at this level. increment the hierarchy */
i_hier++;
}
if( 0 < n_procs_selected ) {
@ -2055,15 +2062,16 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
if (n_remain != n_procs_in) {
/* do not increment the hierarchy if no processes were selected at this level */
i_hier++;
/* if no processes were selected anywhere with this sbgp module don't bother
* incrementing the hierarchy index. this resolves issues where (for example)
* process binding is not enabled or supported. */
if (global_n_procs_selected) {
/* The way initialization is currently written *all* ranks MUST appear
* in the first level (0) of the hierarchy. If any rank is not in the first
* level then the calculation of gather/scatter offsets will be wrong.
* NTH: DO NOT REMOVE this assert until this changes! */
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
assert (i_hier || global_n_procs_selected == n_procs_in);
i_hier++;
}
n_procs_in = n_remain;