Patch summary. Text ahead of the first "diff --git" header is commentary and is
skipped by git apply / patch.

  * bcol/basesmuma: the use_hdl check that selected MCA_BCOL_BASE_ZERO_COPY is
    compiled out with #if 0.
  * coll/ml: enable_fragmentation changes from bool to int (0 - off, 1 - on,
    2 - auto), is registered as an enumerated MCA variable and defaults to 2.
  * coll/ml config parser: save_settings() now returns a status; parse_file()
    only calls it once a collective / section has already been seen and bails
    out with a syntax error when it fails.
  * coll/ml bcast table: bcast_fn_index_table[1] falls back to the small-data
    function when enable_fragmentation is 1, or is 2 (auto) and the
    large-message bcast topology does not advertise zero copy.

Review fixes folded into the coll_ml_mca.c hunks:
  * fragmentation_enable_enum[] now ends with a {-1, NULL} sentinel entry.
  * the result of mca_base_var_enum_create() is tested through tmp, the
    variable it is stored in, instead of ret.

diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c
index c1b33e6569..b0eff310d7 100644
--- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c
+++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c
@@ -480,9 +480,12 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules)
 
     sm_module->super.supported_mode = 0;
 
+    /* NTH: this is not set anywhere on the trunk as of 08/13/13 */
+#if 0
     if (module->use_hdl) {
         sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
     }
+#endif
 
     /* Initializes portals library required for basesmuma large message */
 #ifdef __PORTALS_AVAIL__
diff --git a/ompi/mca/coll/ml/coll_ml.h b/ompi/mca/coll/ml/coll_ml.h
index 5c32117fe6..e129ddc53e 100644
--- a/ompi/mca/coll/ml/coll_ml.h
+++ b/ompi/mca/coll/ml/coll_ml.h
@@ -525,8 +525,8 @@ struct mca_coll_ml_component_t {
     /* use hdl_framework */
     bool use_hdl_bcast;
 
-    /* Enable / Disable fragmentation */
-    bool enable_fragmentation;
+    /* Enable / Disable fragmentation (0 - off, 1 - on, 2 - auto) */
+    int enable_fragmentation;
 
     /* Use sequential bcast algorithm */
     bool use_sequential_bcast;
diff --git a/ompi/mca/coll/ml/coll_ml_config.c b/ompi/mca/coll/ml/coll_ml_config.c
index 041c09b4fd..7d8bba859a 100644
--- a/ompi/mca/coll/ml/coll_ml_config.c
+++ b/ompi/mca/coll/ml/coll_ml_config.c
@@ -276,9 +276,9 @@ static int set_section_name(section_config_t *section_config)
 void mca_coll_ml_reset_config(per_collective_configuration_t *config)
 {
     config->topology_id = ML_UNDEFINED;
-    config->threshold = ML_UNDEFINED;;
-    config->algorithm_id = ML_UNDEFINED;;
-    config->fragmentation_enabled = ML_UNDEFINED;;
+    config->threshold = ML_UNDEFINED;
+    config->algorithm_id = ML_UNDEFINED;
+    config->fragmentation_enabled = ML_UNDEFINED;
 }
 
 static void reset_section(section_config_t *section_cf)
@@ -369,12 +369,12 @@ static int parse_fragmentation_key(section_config_t *section, char *value)
 }
 
 /* Save configuration that have been collected so far */
-static void save_settings(coll_config_t *coll_config)
+static int save_settings(coll_config_t *coll_config)
 {
     per_collective_configuration_t *cf;
 
     if (ML_UNDEFINED == coll_config->coll_id || ML_UNDEFINED == coll_config->section.section_id) {
-        return;
+        return OMPI_ERROR;
     }
 
     cf = &mca_coll_ml_component.coll_config[coll_config->coll_id][coll_config->section.section_id];
@@ -383,6 +383,8 @@ static void save_settings(coll_config_t *coll_config)
     cf->threshold = coll_config->section.config.threshold;
     cf->algorithm_id = coll_config->section.config.algorithm_id;
     cf->fragmentation_enabled = coll_config->section.config.fragmentation_enabled;
+
+    return OMPI_SUCCESS;
 }
 
 /*
@@ -495,7 +497,7 @@ static int parse_file(char *filename)
 {
     int val;
     int ret = OMPI_SUCCESS;
-
+    bool first_section = true, first_coll = true;
     coll_config_t coll_config;
 
     memset (&coll_config, 0, sizeof (coll_config));
@@ -521,11 +523,21 @@ static int parse_file(char *filename)
                 break;
             case COLL_ML_CONFIG_PARSE_COLLECTIVE:
                 /* dump all the information to last section that was defined */
-                save_settings(&coll_config);
+                if (!first_coll) {
+                    ret = save_settings(&coll_config);
+                    if (OMPI_SUCCESS != ret) {
+                        ML_ERROR(("Error in syntax for collective %s", coll_config.coll_name));
+                        goto cleanup;
+                    }
+                }
+
 
                 /* reset collective config */
                 reset_collective(&coll_config);
 
+                first_coll = false;
+                first_section = true;
+
                 ret = set_collective_name(&coll_config);
                 if (OMPI_SUCCESS != ret) {
                     goto cleanup;
@@ -538,8 +550,17 @@ static int parse_file(char *filename)
                     goto cleanup;
                 }
 
-                /* dump all the information to last section that was defined */
-                save_settings(&coll_config);
+                if (!first_section) {
+                    /* dump all the information to last section that was defined */
+                    ret = save_settings(&coll_config);
+                    if (OMPI_SUCCESS != ret) {
+                        ML_ERROR(("Error in syntax for collective %s section %s", coll_config.coll_name,
+                                  coll_config.section.section_name));
+                        goto cleanup;
+                    }
+                }
+
+                first_section = false;
 
                 /* reset all section values */
                 reset_section(&coll_config.section);
diff --git a/ompi/mca/coll/ml/coll_ml_mca.c b/ompi/mca/coll/ml/coll_ml_mca.c
index c523ad876f..c7e2ff2580 100644
--- a/ompi/mca/coll/ml/coll_ml_mca.c
+++ b/ompi/mca/coll/ml/coll_ml_mca.c
@@ -1,6 +1,9 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
  * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2013 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -41,6 +44,17 @@ enum {
     REGSTR_MAX = 0x88
 };
 
+/*
+ * Enumerators
+ */
+mca_base_var_enum_value_t fragmentation_enable_enum[] = {
+    {0, "disable"},
+    {1, "enable"},
+    {2, "auto"},
+    /* sentinel: the enum value list must be terminated by a NULL string */
+    {-1, NULL}
+};
+
 /*
  * utility routine for string parameter registration
  */
@@ -172,6 +186,7 @@ static int mca_coll_ml_verify_params(void)
 
 int mca_coll_ml_register_params(void)
 {
+    mca_base_var_enum_t *new_enum;
     int ret, tmp;
     char *str = NULL;
 
@@ -241,9 +256,21 @@ int mca_coll_ml_register_params(void)
                    "Alltoall disabling",
                    false, &mca_coll_ml_component.disable_alltoall));
 
-    CHECK(reg_bool("enable_fragmentation", NULL,
-                   "Disable/Enable fragmentation for large messages",
-                   false, &mca_coll_ml_component.enable_fragmentation));
+    tmp = mca_base_var_enum_create ("coll_ml_enable_fragmentation_enum", fragmentation_enable_enum, &new_enum);
+    if (OPAL_SUCCESS != tmp) {
+        return tmp;
+    }
+
+    /* default to auto-enable fragmentation */
+    mca_coll_ml_component.enable_fragmentation = 2;
+    tmp = mca_base_component_var_register (&mca_coll_ml_component.super.collm_version, "enable_fragmentation",
+                                           "Disable/Enable fragmentation for large messages", MCA_BASE_VAR_TYPE_INT,
+                                           new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_coll_ml_component.enable_fragmentation);
+    if (0 > tmp) {
+        ret = tmp;
+    }
+    OBJ_RELEASE(new_enum);
 
     CHECK(reg_int("use_brucks_smsg_alltoall", NULL,
                   "Use Bruck's Algo for Small Msg Alltoall"
diff --git a/ompi/mca/coll/ml/coll_ml_module.c b/ompi/mca/coll/ml/coll_ml_module.c
index 0200b51292..d4106c6221 100644
--- a/ompi/mca/coll/ml/coll_ml_module.c
+++ b/ompi/mca/coll/ml/coll_ml_module.c
@@ -3106,14 +3106,18 @@ static int mca_coll_ml_need_multi_topo(int bcol_collective)
 static int setup_bcast_table(mca_coll_ml_module_t *module)
 {
     mca_coll_ml_component_t *cm = &mca_coll_ml_component;
+    bool has_zero_copy;
 
     /* setup bcast index table */
     if (cm->use_static_bcast) {
         module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_KNOWN;
-        if (cm->enable_fragmentation) {
+
+        has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY &
+                           module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_KNOWN]->topo_info->all_bcols_mode);
+
+        if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) {
             module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_KNOWN;
-        } else if (!(MCA_BCOL_BASE_ZERO_COPY &
-                     module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_KNOWN]->topo_info->all_bcols_mode)) {
+        } else if (!has_zero_copy) {
             ML_ERROR(("ML couldn't be used: because the mca param coll_ml_enable_fragmentation "
                       "was set to zero and there is a bcol doesn't support zero copy method."));
             return OMPI_ERROR;
@@ -3122,10 +3126,19 @@ static int setup_bcast_table(mca_coll_ml_module_t *module)
         }
     } else {
         module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_UNKNOWN;
-        if (cm->enable_fragmentation) {
+
+        if (NULL == module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]) {
+            ML_ERROR(("ML couldn't be used: because the mca param coll_ml_use_static_bcast was set "
+                      "to zero and no function is available."));
+            return OMPI_ERROR;
+        }
+
+        has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY &
+                           module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]->topo_info->all_bcols_mode);
+
+        if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) {
             module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_UNKNOWN;
-        } else if (!(MCA_BCOL_BASE_ZERO_COPY &
-                     module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]->topo_info->all_bcols_mode)) {
+        } else if (!has_zero_copy) {
             ML_ERROR(("ML couldn't be used: because the mca param coll_ml_enable_fragmentation "
                       "was set to zero and there is a bcol doesn't support zero copy method."));
             return OMPI_ERROR;