1
1
openmpi/ompi/mca/coll/han/coll_han_dynamic_file.c
George Bosilca 154117515a Fix HAN issues reported by Coverity.
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
2020-10-28 15:44:16 -04:00

611 строки
26 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif
#include "ompi_config.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval)
#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval)
#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval)
static void check_dynamic_rules(void);
/* Current file line for verbose message */
static int fileline = 1;
int
mca_coll_han_init_dynamic_rules(void)
{
/* File management */
const char *fname;
FILE *fptr = NULL;
int nb_entries = 0, rc;
/* Loop counters */
int i, j, k, l;
/* Collective informations */
long nb_coll, coll_id;
char * coll_name = NULL;
collective_rule_t *coll_rules;
/* Topo informations */
long nb_topo, topo_lvl;
topologic_rule_t *topo_rules;
/* Configuration informations */
long nb_rules, conf_size;
configuration_rule_t *conf_rules;
/* Message size informations */
long nb_msg_size;
size_t msg_size;
msg_size_rule_t *msg_size_rules;
/* Component informations */
long component;
/* If the dynamic rules are not used, do not even read the file */
if(!mca_coll_han_component.use_dynamic_file_rules) {
nb_coll = 0;
return OMPI_SUCCESS;
}
if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but "
"coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n");
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS;
}
if( NULL == (fptr = fopen(fname, "r")) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by "
"coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and "
"check file permissions. Rules from MCA parameters will be used instead\n",
fname);
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS;
}
/* The first information of the file is the collective count */
if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid value %ld is given for collective count "
"or the reader encountered an unexpected EOF\n",
fname, fileline, nb_coll);
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
goto file_reading_error;
}
mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll;
/* Allocate collective rules */
coll_rules = malloc(nb_coll * sizeof(collective_rule_t));
mca_coll_han_component.dynamic_rules.collective_rules = coll_rules;
if(NULL == coll_rules) {
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
goto cannot_allocate;
}
/* Iterates on collective rules */
for( i = 0 ; i < nb_coll ; i++ ) {
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
/* Get the collective identifier */
if( getnext_string(fptr, &coll_name) < 0 ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d."
"The rest of the input file will be ignored.\n",
fileline);
goto file_reading_error;
}
coll_id = mca_coll_base_name_to_colltype(coll_name);
if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) {
/* maybe the file was in the old format and we read the collective index instead of the name. */
char* endp;
coll_id = strtol(coll_name, &endp, 10);
if( ('\0' != *endp ) || (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT) ) { /* there is garbage in the input */
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules invalid collective %s "
"at line %d: the collective must be at least %d and less than %d. "
"The rest of the input file will be ignored.\n",
coll_name, fileline, ALLGATHER, COLLCOUNT);
goto file_reading_error;
}
if( NULL != coll_name ) {
free(coll_name);
}
coll_name = strdup(mca_coll_base_colltype_to_str(coll_id));
}
if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"read collective id %ld at line %d but this collective is not implemented yet. "
"This is not an error but this set of rules will not be used\n",
fname, coll_id, fileline);
}
/*
* The first information of a collective rule
* is the number of topologic rules
*/
if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid value %ld is given for topo level count "
"or the reader encountered an unexpected EOF\n",
fname, fileline, nb_topo);
goto file_reading_error;
}
/* Store the collective rule informations */
coll_rules[i].nb_topologic_levels = nb_topo;
coll_rules[i].collective_id = (COLLTYPE_T)coll_id;
if(0 == nb_topo) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"at line %d: an invalid value %ld is given for topo level count\n",
fname, fileline, nb_topo);
continue;
}
/* Allocate topologic rules */
topo_rules = malloc(nb_topo * sizeof(topologic_rule_t));
coll_rules[i].topologic_rules = topo_rules;
if(NULL == topo_rules) {
coll_rules[i].nb_topologic_levels = 0;
goto cannot_allocate;
}
/* Iterates on topologic rules */
for( j = 0 ; j < nb_topo ; j++ ) {
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
/* Get the topologic level identifier */
if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. "
"Topologic level must be at least %d and less than %d\n",
fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL);
goto file_reading_error;
}
/*
* The first information of a topologic rule
* is the number of configurations
*/
nb_rules = -1;
if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid value %ld is given for rules count "
"or the reader encountered an unexpected EOF\n",
fname, fileline, nb_rules);
goto file_reading_error;
}
/* Store the topologic rule informations */
topo_rules[j].collective_id = coll_id;
topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl;
topo_rules[j].nb_rules = nb_rules;
if(0 == nb_rules) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"at line %d: an invalid value %ld is given for configuration rules count\n",
fname, fileline, nb_rules);
continue;
}
/* Allocate configuration rules */
conf_rules = malloc(nb_rules * sizeof(configuration_rule_t));
topo_rules[j].configuration_rules = conf_rules;
if(NULL == conf_rules) {
topo_rules[j].nb_rules = 0;
goto cannot_allocate;
}
/* Iterate on configuration rules */
for( k = 0; k < nb_rules; k++ ) {
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
/* Get the configuration size */
if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d "
"or the reader encountered an unexpected EOF the configuration size must be at least %d "
"and the first configuration size of a topologic level must be %d\n",
conf_size, fileline, 1, 1);
goto file_reading_error;
}
/*
* The first information of a configuration rule
* is the number of message size rules
*/
if( (getnext_long(fptr, &nb_msg_size) < 0) || (nb_msg_size < 0) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid value %ld is given for message size rules count "
"or the reader encountered an unexpected EOF\n",
fname, fileline, nb_msg_size);
goto file_reading_error;
}
/* Store configuration rule information */
conf_rules[k].collective_id = coll_id;
conf_rules[k].topologic_level = topo_lvl;
conf_rules[k].configuration_size = conf_size;
conf_rules[k].nb_msg_size = nb_msg_size;
if(0 == nb_msg_size) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s "
"at line %d: an invalid value %ld is given for message size rules count\n",
fname, fileline, nb_msg_size);
continue;
}
/* Allocate message size rules */
msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t));
conf_rules[k].msg_size_rules = msg_size_rules;
if(NULL == msg_size_rules) {
conf_rules[k].nb_msg_size = 0;
goto cannot_allocate;
}
/* Iterate on message size rules */
for( l = 0; l < nb_msg_size; l++ ) {
char* target_comp_name = NULL;
conf_rules[k].nb_msg_size = l+1;
/* Get the message size */
rc = getnext_size_t(fptr, &msg_size);
if( (rc < 0) ||
(0 == l && msg_size > 1)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid value %" PRIsize_t " is given for message size "
"or the reader encountered an unexpected EOF. "
"The first message size rule of a configuration must be 0\n",
fname, fileline, msg_size);
goto file_reading_error;
}
/* Get the component identifier for this message size rule */
if( getnext_string(fptr, &target_comp_name) < 0 ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: cannot read the name of a collective component\n",
fname, fileline);
goto file_reading_error;
}
component = mca_coll_han_component_name_to_id(target_comp_name);
if( (component < SELF) || (component >= COMPONENTS_COUNT) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s "
"at line %d: an invalid collective component name %s was given or the "
"reader encountered an unexpected EOF. Collective component id must be at "
"least %d and less than %d\n",
fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT);
free(target_comp_name);
goto file_reading_error;
}
/* Store message size rule information */
msg_size_rules[l].collective_id = coll_id;
msg_size_rules[l].topologic_level = topo_lvl;
msg_size_rules[l].configuration_size = conf_size;
msg_size_rules[l].msg_size = msg_size;
msg_size_rules[l].component = (COMPONENT_T)component;
nb_entries++;
/* do we have the optional segment length */
if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n");
long seglength;
if( 0 != topo_lvl ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"file %s line %d found segment lengths for topological collective at level != 0 "
"for collective %s component %s. These values will be ignored.\n",
fname, fileline, coll_name, target_comp_name);
}
while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) {
if( getnext_long(fptr, &seglength) ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"file %s line %d found end of file while reading the optional list "
"of segment lengths for collective %s component %s\n",
fname, fileline, coll_name, target_comp_name);
free(target_comp_name);
goto file_reading_error;
}
}
}
free(target_comp_name);
}
}
}
if( NULL != coll_name ) {
free(coll_name);
coll_name = NULL;
}
}
if( getnext_long(fptr, &nb_coll) > 0 ) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules. Warning on file %s at line %d: "
"rule reading is over but reader does not seem to have reached the end of the file\n",
fname, fileline);
}
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n",
nb_entries, fname);
if(mca_coll_han_component.dump_dynamic_rules) {
mca_coll_han_dump_dynamic_rules();
}
fclose(fptr);
check_dynamic_rules();
return OMPI_SUCCESS;
cannot_allocate:
/* The dynamic rules allocation failed
* Free the already allocated rules and return a failure
*/
opal_output_verbose(0, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"cannot allocate dynamic rules\n");
if( NULL != coll_name ) {
free(coll_name);
}
fclose (fptr);
/* We disable the module, we don't need to keep the rules */
mca_coll_han_free_dynamic_rules();
return OMPI_ERROR;
file_reading_error:
opal_output_verbose(0, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"could not fully read dynamic rules file. "
"Will use mca parameters defined rules. "
"To see error detail, please set "
"collective verbosity level over 5\n");
if( NULL != coll_name ) {
free(coll_name);
}
fclose (fptr);
/* We disable the module, we don't need to keep the rules */
mca_coll_han_free_dynamic_rules();
return OMPI_SUCCESS;
}
/**
 * Release the whole dynamic rule tree referenced by
 * mca_coll_han_component.dynamic_rules and reset its collective count to 0.
 *
 * Each array is released only when the count stored next to it is strictly
 * positive: the count is what tells whether the matching pointer was ever
 * allocated.
 */
void
mca_coll_han_free_dynamic_rules(void)
{
    collective_rule_t *collectives = mca_coll_han_component.dynamic_rules.collective_rules;
    int collective_count = mca_coll_han_component.dynamic_rules.nb_collectives;

    for (int c = 0; c < collective_count; c++) {
        int level_count = collectives[c].nb_topologic_levels;
        topologic_rule_t *levels = collectives[c].topologic_rules;

        if (level_count <= 0) {
            /* No topologic level array attached to this collective */
            continue;
        }

        for (int t = 0; t < level_count; t++) {
            int config_count = levels[t].nb_rules;
            configuration_rule_t *configs = levels[t].configuration_rules;

            if (config_count <= 0) {
                /* No configuration array attached to this level */
                continue;
            }

            /* Leaves first: the message size rules of every configuration */
            for (int r = 0; r < config_count; r++) {
                if (configs[r].nb_msg_size > 0) {
                    free(configs[r].msg_size_rules);
                }
            }
            free(configs);
        }
        free(levels);
    }

    if (collective_count > 0) {
        free(collectives);
    }

    mca_coll_han_component.dynamic_rules.nb_collectives = 0;
}
/*
 * Walk the loaded dynamic rules and report logical inconsistencies.
 *
 * Nothing is modified and nothing is rejected: each finding is only written
 * to the HAN output stream at verbosity 5. The reported findings are:
 *  - configuration sizes of a topologic level that decrease from one rule
 *    to the next,
 *  - message sizes of a configuration that decrease from one rule to the
 *    next,
 *  - the HAN component being selected on a topologic level other than
 *    GLOBAL_COMMUNICATOR.
 */
static void check_dynamic_rules(void)
{
    const int collective_count = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *collectives = mca_coll_han_component.dynamic_rules.collective_rules;

    for (int c = 0; c < collective_count; c++) {
        COLLTYPE_T coll_id = collectives[c].collective_id;
        int level_count = collectives[c].nb_topologic_levels;
        topologic_rule_t *levels = collectives[c].topologic_rules;

        for (int t = 0; t < level_count; t++) {
            TOPO_LVL_T topo_lvl = levels[t].topologic_level;
            int config_count = levels[t].nb_rules;
            configuration_rule_t *configs = levels[t].configuration_rules;

            for (int r = 0; r < config_count; r++) {
                int conf_size = configs[r].configuration_size;
                int rule_count = configs[r].nb_msg_size;
                msg_size_rule_t *rules = configs[r].msg_size_rules;

                /* Compare with the previous configuration, when there is one */
                if (r >= 1 && configs[r-1].configuration_size > conf_size) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                        "for collective %d on topological level %d: "
                                        "configuration sizes %d and %d are not sorted by increasing value\n",
                                        coll_id, topo_lvl, configs[r-1].configuration_size, conf_size);
                }

                for (int m = 0; m < rule_count; m++) {
                    size_t msg_size = rules[m].msg_size;
                    COMPONENT_T component = rules[m].component;

                    /* Compare with the previous message size, when there is one */
                    if (m >= 1 && rules[m-1].msg_size > msg_size) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                            "for collective %d on topological level %d with configuration size %d: "
                                            "message sizes %" PRIsize_t " and %" PRIsize_t " are "
                                            "not sorted by increasing value\n",
                                            coll_id, topo_lvl, conf_size, rules[m-1].msg_size, msg_size);
                    }

                    if ((GLOBAL_COMMUNICATOR != topo_lvl) && (HAN == component)) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules HAN found an issue on dynamic rules "
                                            "for collective %d on topological level %d with configuration size %d "
                                            "for message size %" PRIsize_t ": han collective component %d "
                                            "can only be activated for topology level %d\n",
                                            coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR);
                    }
                }
            }
        }
    }
}
void mca_coll_han_dump_dynamic_rules(void)
{
int nb_entries = 0;
/* Collective informations */
int nb_coll;
COLLTYPE_T coll_id;
collective_rule_t *coll_rules;
/* Topo informations */
int nb_topo;
TOPO_LVL_T topo_lvl;
topologic_rule_t *topo_rules;
/* Configuration informations */
int nb_rules, conf_size;
configuration_rule_t *conf_rules;
/* Message size informations */
int nb_msg_size, msg_size;
msg_size_rule_t *msg_size_rules;
/* Component informations */
COMPONENT_T component;
nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;
for(int i = 0; i < nb_coll; i++ ) {
coll_id = coll_rules[i].collective_id;
nb_topo = coll_rules[i].nb_topologic_levels;
topo_rules = coll_rules[i].topologic_rules;
for(int j = 0; j < nb_topo; j++ ) {
topo_lvl = topo_rules[j].topologic_level;
nb_rules = topo_rules[j].nb_rules;
conf_rules = topo_rules[j].configuration_rules;
for(int k = 0; k < nb_rules; k++ ) {
conf_size = conf_rules[k].configuration_size;
nb_msg_size = conf_rules[k].nb_msg_size;
msg_size_rules = conf_rules[k].msg_size_rules;
for(int l = 0; l < nb_msg_size; l++ ) {
msg_size = msg_size_rules[l].msg_size;
component = msg_size_rules[l].component;
opal_output(mca_coll_han_component.han_output,
"coll:han:dump_dynamic_rules %d collective %d (%s) "
"topology level %d (%s) configuration size %d "
"mesage size %d -> collective component %d (%s)\n",
nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id),
topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size,
msg_size, component, available_components[component].component_name);
nb_entries++;
}
}
}
}
}