1
1

coll/ml : Replace longer error message with opal_show_help; thanks Jeff for identifying those

This commit was SVN r31279.
Этот коммит содержится в:
Manjunath Gorentla Venkata 2014-03-28 19:25:54 +00:00
родитель c0931a5246
Коммит 8c849ee991
3 изменённых файлов: 80 добавлений и 54 удалений

Просмотреть файл

@ -13,7 +13,8 @@ AM_LFLAGS = -Pcoll_ml_config_yy
LEX_OUTPUT_ROOT = lex.coll_ml_config_yy
dist_ompidata_DATA = \
mca-coll-ml.config
mca-coll-ml.config \
help-mpi-coll-ml.txt
not_used_yet =

Просмотреть файл

@ -2046,21 +2046,17 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
/* If I was not done, it means that we skipped all subgroups and no hierarchy was build */
if (0 == i_am_done) {
if (NULL != include_sbgp_name || NULL != exclude_sbgp_name) {
/* User explicitly asked for specific type of topology, which generates empty group */
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("ML topology configuration explicitly requested to %s subgroup %s. "
"Such configuration results in a creation of empty groups. As a result, ML framework can't "
"configure requested collective operations. ML framework will be disabled.",
NULL != include_sbgp_name ? "include only" : "exclude",
NULL != include_sbgp_name ? include_sbgp_name : exclude_sbgp_name
));
opal_show_help("help-mpi-coll-ml.txt",
"empty-sub-group", true,
NULL != include_sbgp_name ? include_sbgp_name : exclude_sbgp_name);
ret = OMPI_ERROR;
goto exit_ERROR;
}
ML_VERBOSE(10, ("Empty hierarchy..."));
ML_VERBOSE(10, ("Constructing empty hierarchy"));
ret = OMPI_SUCCESS;
goto exit_ERROR;
}
@ -2233,44 +2229,26 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
ML_VERBOSE(10, ("Topology build: sbgp %s will be excluded.",
sbgp_component->sbgp_version.mca_component_name));
/* If there isn't additional component supports all types => print warning */
if (1 == opal_list_get_size(&mca_bcol_base_components_in_use) ||
(opal_list_item_t *) bcol_cli_next ==
opal_list_get_end(&mca_bcol_base_components_in_use)) {
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("\n--------------------------------------------------------------------------------"
"The BCOL component %s doesn't support "
"all possible tuples (OPERATION X DATATYPE) for Allreduce "
"and you didn't provide additional one for alternative topology building, "
"as a result ML isn't be run correctly and its behavior is undefined. "
"You should run this bcol with another one supports all possible tuples, "
"\"--mca bcol_base_string %s,ptpcoll --mca sbgp_base_subgroups_string %s,p2p\" for example.",
bcol_component->bcol_version.mca_component_name,
bcol_component->bcol_version.mca_component_name,
sbgp_component->sbgp_version.mca_component_name));
opal_show_help("help-mpi-coll-ml.txt",
"allreduce-not-supported", true,
bcol_component->bcol_version.mca_component_name);
} else {
bcol_component_next = (mca_bcol_base_component_2_0_0_t *)
bcol_cli_next->cli_component;
if (NULL != bcol_component_next->coll_support_all_types &&
!bcol_component_next->coll_support_all_types(BCOL_ALLREDUCE)) {
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("\n--------------------------------------------------------------------------------"
"The BCOL component %s doesn't support "
"all possible tuples for Allreduce. "
"While you did provid an additional %s bcol component for alternative topology building, "
"this component also lacks support for all tuples. "
"As a result, ML Allreduce's behavior is undefined. "
"You must provide a component that supports all possible tuples, e.g. "
"\"--mca bcol_base_string %s,ptpcoll --mca sbgp_base_subgroups_string %s,p2p",
bcol_component->bcol_version.mca_component_name,
bcol_component_next->bcol_version.mca_component_name,
bcol_component->bcol_version.mca_component_name,
sbgp_component->sbgp_version.mca_component_name));
opal_show_help("help-mpi-coll-ml.txt",
"allreduce-alt-nosupport", true,
bcol_component->bcol_version.mca_component_name);
}
}
@ -2739,12 +2717,11 @@ static int setup_bcast_table(mca_coll_ml_module_t *module)
if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) {
module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_KNOWN;
} else if (!has_zero_copy) {
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("ML couldn't be used: because the mca param coll_ml_enable_fragmentation "
"was set to zero and there is a bcol doesn't support zero copy method."));
opal_show_help("help-mpi-coll-ml.txt",
"fragmentation-disabled", true);
return OMPI_ERROR;
} else {
module->bcast_fn_index_table[1] = ML_BCAST_LARGE_DATA_KNOWN;
}
@ -2752,11 +2729,10 @@ static int setup_bcast_table(mca_coll_ml_module_t *module)
module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_UNKNOWN;
if (NULL == module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]) {
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("ML couldn't be used: because the mca param coll_ml_bcast_algorithm was not set "
"to static and no function is available."));
opal_show_help("help-mpi-coll-ml.txt",
"static-bcast-disabled", true);
return OMPI_ERROR;
}
@ -2766,11 +2742,10 @@ static int setup_bcast_table(mca_coll_ml_module_t *module)
if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) {
module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_UNKNOWN;
} else if (!has_zero_copy) {
/* JMS You really should use opal_show_help() here;
showing long error messages is *exactly* what
opal_show_help() is for. */
ML_ERROR(("ML couldn't be used: because the mca param coll_ml_enable_fragmentation "
"was set to zero and there is a bcol doesn't support zero copy method."));
opal_show_help("help-mpi-coll-ml.txt",
"fragmentation-disabled", true);
return OMPI_ERROR;
} else {
/* If the topology support zero level and no fragmentation was requested */

50
ompi/mca/coll/ml/help-mpi-coll-ml.txt Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
# -*- text -*-
#
# Copyright (c) 2009-2014 Oak Ridge National Laboratory. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's Hierarchical Collective
# Component (coll/ml).
#
[empty-sub-group]
The ML topology configuration explicitly included or excluded this subgroup:
%s
Such a configuration results in the creation of empty groups. As a result, the
ML framework cannot configure the requested collective operations and will be
disabled. One configuration that might enable the ML component is:
--mca bcol_base_string basesmuma,ptpcoll --mca sbgp_base_subgroups_string basesmuma,p2p
[allreduce-not-supported]
This BCOL is configured in one of the hierarchies:
%s
This BCOL does not support Allreduce for all operation and datatype
combinations. In addition, you did not suggest an alternate topology
building configuration.
[allreduce-alt-nosupport]
This BCOL is configured in the hierarchy together with an alternate BCOL:
%s
Neither the original topology nor the alternate topology supports Allreduce
for all operation and datatype combinations. You must provide a BCOL
component that supports all of these combinations.
[fragmentation-disabled]
ML could not be used because the MCA parameter coll_ml_enable_fragmentation
was set to zero and there is a BCOL that does not support the
zero-copy method.
[static-bcast-disabled]
ML could not be used because the MCA parameter coll_ml_bcast_algorithm
was not set to static and no other broadcast implementation was available.