1
1
openmpi/ompi/mca/coll/han/coll_han_dynamic_file.c
bsergentm 94c817ceff Coll/han Bull
* first import of Bull specific modifications to HAN

* Cleaning, renaming and compilation fixing Changed all future into han.

* Import BULL specific modifications in coll/tuned and coll/base

* Fixed compilation issues in Han

* Changed han_output to directly point to coll framework output.

* The verbosity MCA parameter was removed as a duplicate of coll verbosity

* Add fallback in han reduce when op cannot commute and ppn are imbalanced

* Added fallback for han bcast when nodes do not have the same number of processes

* Add fallback in han scatter when ppn are imbalanced

+ fixed missing scatter_fn pointer in the module interface

Signed-off-by: Brelle Emmanuel <emmanuel.brelle@atos.net>
Co-authored-by: a700850 <pierre.lemarinier@atos.net>
Co-authored-by: germainf <florent.germain@atos.net>
2020-10-26 21:35:12 -04:00

691 строка
28 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif
#include "ompi_config.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/*
 * Forward declaration: check_dynamic_rules() is defined further down in
 * this file and is called once the rules file has been read successfully.
 */
static void check_dynamic_rules(void);
/*
 * Current line of the rules file, reported in verbose messages
 * ("at line %d"). Its address is handed to ompi_coll_base_file_getnext()
 * on every read; presumably that helper advances it as it consumes
 * newlines -- TODO confirm against coll_base_util.
 */
static int fileline = 1;
/*
 * Read the next value from the rules file through the coll/base reader,
 * sharing the file-scope line counter above. The result is compared
 * against MYEOF by the caller to detect the end of the file.
 */
#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline)
int
mca_coll_han_init_dynamic_rules(void)
{
/* File management */
const char *fname;
FILE *fptr = NULL;
int nb_entries = 0;
/* Loop counters */
int i, j, k, l;
/* Collective informations */
int nb_coll;
COLLTYPE_T coll_id;
collective_rule_t *coll_rules;
/* Topo informations */
int nb_topo;
TOPO_LVL_T topo_lvl;
topologic_rule_t *topo_rules;
/* Configuration informations */
int nb_rules, conf_size;
configuration_rule_t *conf_rules;
/* Message size informations */
int nb_msg_size, msg_size;
msg_size_rule_t *msg_size_rules;
/* Component informations */
COMPONENT_T component;
/* If the dynamic rules are not used, do not even read the file */
if(!mca_coll_han_component.use_dynamic_file_rules) {
nb_coll = 0;
return OMPI_SUCCESS;
}
fname = mca_coll_han_component.dynamic_rules_filename;
if(NULL == fname) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"coll_han_use_dynamic_file_rules is true but "
"coll_han_dynamic_rules_filename is not set: "
"coll han will use dynamic rules from mca "
"parameters and their default value\n");
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS;
}
fptr = fopen(fname, "r");
if(NULL == fptr) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"cannot open dynamic file provided by "
"coll_han_dynamic_rules_filename=%s "
"please provide it with full path and "
"check file permissions. Rules from "
"MCA parameters will be used instead\n",
fname);
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
return OMPI_SUCCESS;
}
/* The first information of the file is the collective count */
nb_coll = getnext(fptr);
if(nb_coll <= 0) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid value %d is given "
"for collective count "
"or the reader encountered an unexpected EOF\n",
fname,
fileline,
nb_coll);
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
goto file_reading_error;
}
mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll;
/* Allocate collective rules */
coll_rules = malloc(nb_coll * sizeof(collective_rule_t));
mca_coll_han_component.dynamic_rules.collective_rules = coll_rules;
if(NULL == coll_rules) {
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
goto cannot_allocate;
}
/* Iterates on collective rules */
for(i=0 ; i<nb_coll ; i++) {
/* Get the collective identifier */
coll_id = getnext(fptr);
if(coll_id < ALLGATHER || coll_id >= COLLCOUNT) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"invalid collective id %d at line %d: the collective "
"must be at least %d and less than %d\n",
coll_id,
fileline,
ALLGATHER,
COLLCOUNT);
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"read collective id %d at line %d "
"but this collective is not implemented yet. "
"This is not an error but this set of rules "
"will not be used\n",
fname,
coll_id,
fileline);
}
/*
* The first information of a collective rule
* is the number of topologic rules
*/
nb_topo = getnext(fptr);
if(nb_topo < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid value %d is given "
"for topo level count "
"or the reader encountered an unexpected EOF\n",
fname,
fileline,
nb_topo);
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/* Store the collective rule informations */
coll_rules[i].collective_id = coll_id;
coll_rules[i].nb_topologic_levels = nb_topo;
if(0 == nb_topo) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"Warning on dynamic rules file %s "
"at line %d: an invalid value %d is given "
"for topo level count\n",
fname,
fileline,
nb_topo);
continue;
}
/* Allocate topologic rules */
topo_rules = malloc(nb_topo * sizeof(topologic_rule_t));
coll_rules[i].topologic_rules = topo_rules;
if(NULL == topo_rules) {
coll_rules[i].nb_topologic_levels = 0;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate;
}
/* Iterates on topologic rules */
for(j=0 ; j<nb_topo ; j++) {
/* Get the topologic level identifier */
topo_lvl = getnext(fptr);
if(topo_lvl < INTRA_NODE || topo_lvl >= NB_TOPO_LVL) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid topo level %d is given "
"or the reader encountered an unexpected EOF. "
"Topologic level must be at least %d and "
"less than %d\n",
fname,
fileline,
topo_lvl,
INTRA_NODE,
NB_TOPO_LVL);
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/*
* The first information of a topologic rule
* is the number of configurations
*/
nb_rules = getnext(fptr);
if(nb_rules < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid value %d "
"is given for rules count "
"or the reader encountered an unexpected EOF\n",
fname,
fileline,
nb_rules);
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/* Store the topologic rule informations */
topo_rules[j].collective_id = coll_id;
topo_rules[j].topologic_level = topo_lvl;
topo_rules[j].nb_rules = nb_rules;
if(0 == nb_rules) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"Warning on dynamic rules file %s "
"at line %d: an invalid value %d is given "
"for configuration rules count\n",
fname,
fileline,
nb_rules);
continue;
}
/* Allocate configuration rules */
conf_rules = malloc(nb_rules * sizeof(configuration_rule_t));
topo_rules[j].configuration_rules = conf_rules;
if(NULL == conf_rules) {
topo_rules[j].nb_rules = 0;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate;
}
/* Iterate on configuration rules */
for(k=0 ; k<nb_rules ; k++) {
/* Get the configuration size */
conf_size = getnext(fptr);
if(conf_size < 1 || (0 == k && conf_size > 1)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"invalid configuration size %d at line %d "
"or the reader encountered an unexpected EOF "
"the configuration size must be at least %d "
"and the first configuration size "
"of a topologic level must be %d\n",
conf_size,
fileline,
1,
1);
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/*
* The first information of a configuration rule
* is the number of message size rules
*/
nb_msg_size = getnext(fptr);
if(nb_msg_size < 0) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid value %d "
"is given for message size rules count "
"or the reader encountered an unexpected EOF\n",
fname,
fileline,
nb_msg_size);
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/* Store configuration rule information */
conf_rules[k].collective_id = coll_id;
conf_rules[k].topologic_level = topo_lvl;
conf_rules[k].configuration_size = conf_size;
conf_rules[k].nb_msg_size = nb_msg_size;
if(0 == nb_msg_size) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"Warning on dynamic rules file %s "
"at line %d: an invalid value %d is given "
"for message size rules count\n",
fname,
fileline,
nb_msg_size);
continue;
}
/* Allocate message size rules */
msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t));
conf_rules[k].msg_size_rules = msg_size_rules;
if(NULL == msg_size_rules) {
conf_rules[k].nb_msg_size = 0;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto cannot_allocate;
}
/* Iterate on message size rules */
for(l=0 ; l<nb_msg_size ; l++) {
/* Get the message size */
msg_size = getnext(fptr);
if(msg_size < 0
|| (0 ==l && msg_size > 1)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid value %d "
"is given for message size "
"or the reader encountered "
"an unexpected EOF. "
"The first message size rule of "
"a configuration must be 0\n",
fname,
fileline,
msg_size);
conf_rules[k].nb_msg_size = l+1;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/* Get the component identifier for this message size rule */
component = getnext(fptr);
if(component < SELF || component >= COMPONENTS_COUNT) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"found an error on dynamic rules file %s "
"at line %d: an invalid collective "
"component id %d is given or the "
"reader encountered an unexpected EOF. "
"Collective component id must be at "
"least %d and less than %d\n",
fname,
fileline,
component,
SELF,
COMPONENTS_COUNT);
conf_rules[k].nb_msg_size = l+1;
topo_rules[j].nb_rules = k+1;
coll_rules[i].nb_topologic_levels = j+1;
mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
goto file_reading_error;
}
/* Store message size rule informations */
msg_size_rules[l].collective_id = coll_id;
msg_size_rules[l].topologic_level = topo_lvl;
msg_size_rules[l].configuration_size = conf_size;
msg_size_rules[l].msg_size = msg_size;
msg_size_rules[l].component = component;
nb_entries++;
}
}
}
}
if(MYEOF != getnext(fptr)) {
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"Warning on file %s at line %d: "
"rule reading is over but reader does not seem "
"to have reached the end of the file\n",
fname,
fileline);
}
opal_output_verbose(5, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"read %d rules from %s\n",
nb_entries,
fname);
if(mca_coll_han_component.dump_dynamic_rules) {
mca_coll_han_dump_dynamic_rules();
}
fclose(fptr);
check_dynamic_rules();
return OMPI_SUCCESS;
cannot_allocate:
/* The dynamic rules allocation failed
* Free the already allocated rules and return a failure
*/
opal_output_verbose(0, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"cannot allocate dynamic rules\n");
/* Do not check free_dynamic_rules
* because we are returning OMPI_ERROR anyway */
mca_coll_han_free_dynamic_rules();
return OMPI_ERROR;
file_reading_error:
opal_output_verbose(0, mca_coll_han_component.han_output,
"coll:han:mca_coll_han_init_dynamic_rules "
"could not fully read dynamic rules file. "
"Will use mca parameters defined rules. "
"To see error detail, please set "
"collective verbosity level over 5\n");
if(fptr) {
fclose (fptr);
}
mca_coll_han_free_dynamic_rules();
return OMPI_SUCCESS;
}
/*
 * Release the whole dynamic rules tree stored in
 * mca_coll_han_component.dynamic_rules and leave it empty
 * (nb_collectives == 0, collective_rules == NULL once freed).
 *
 * The walk is driven by the element counts stored at each level. An array
 * pointer is only trusted when its count is greater than zero: the loader
 * in this file skips the allocation (and the pointer assignment) when a
 * count is zero, so the "> 0" guards below are required and are not
 * redundant NULL checks.
 */
void
mca_coll_han_free_dynamic_rules(void)
{
    /* Loop counters */
    int i, j, k;

    /* Loop ranges */
    int nb_coll, nb_topo, nb_conf;

    /* Aliases */
    collective_rule_t *coll_rules;
    topologic_rule_t *topo_rules;
    configuration_rule_t *conf_rules;

    nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
    coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;

    for (i = 0; i < nb_coll; i++) {
        nb_topo = coll_rules[i].nb_topologic_levels;
        topo_rules = coll_rules[i].topologic_rules;

        for (j = 0; j < nb_topo; j++) {
            nb_conf = topo_rules[j].nb_rules;
            conf_rules = topo_rules[j].configuration_rules;

            /* Innermost arrays first: message size rules */
            for (k = 0; k < nb_conf; k++) {
                if (conf_rules[k].nb_msg_size > 0) {
                    free(conf_rules[k].msg_size_rules);
                }
            }

            if (nb_conf > 0) {
                free(conf_rules);
            }
        }

        if (nb_topo > 0) {
            free(topo_rules);
        }
    }

    if (nb_coll > 0) {
        free(coll_rules);
        /* Do not leave a dangling pointer in the component state */
        mca_coll_han_component.dynamic_rules.collective_rules = NULL;
    }

    mca_coll_han_component.dynamic_rules.nb_collectives = 0;
}
/*
 * Sanity pass over the loaded dynamic rules.
 *
 * Nothing is modified; each finding is only reported at verbosity 5:
 *  - configuration sizes of a topologic level not in increasing order,
 *  - message sizes of a configuration not in increasing order,
 *  - the HAN component selected on a topologic level other than
 *    GLOBAL_COMMUNICATOR.
 */
static void check_dynamic_rules(void)
{
    const int coll_count = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *const colls =
        mca_coll_han_component.dynamic_rules.collective_rules;
    int ci, ti, ri, mi;

    for (ci = 0; ci < coll_count; ci++) {
        const COLLTYPE_T coll = colls[ci].collective_id;
        const int topo_count = colls[ci].nb_topologic_levels;
        topologic_rule_t *const topos = colls[ci].topologic_rules;

        for (ti = 0; ti < topo_count; ti++) {
            const TOPO_LVL_T level = topos[ti].topologic_level;
            const int conf_count = topos[ti].nb_rules;
            configuration_rule_t *const confs = topos[ti].configuration_rules;

            for (ri = 0; ri < conf_count; ri++) {
                const int cur_conf_size = confs[ri].configuration_size;
                const int msg_count = confs[ri].nb_msg_size;
                msg_size_rule_t *const msgs = confs[ri].msg_size_rules;

                /* Configuration sizes must not decrease within a level */
                if (ri > 0 && cur_conf_size < confs[ri - 1].configuration_size) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:check_dynamic_rules "
                                        "Han found an issue on dynamic rules "
                                        "for collective %d "
                                        "on topological level %d: "
                                        "configuration sizes %d and %d are "
                                        "not sorted by increasing value\n",
                                        coll,
                                        level,
                                        confs[ri - 1].configuration_size,
                                        cur_conf_size);
                }

                for (mi = 0; mi < msg_count; mi++) {
                    const int cur_msg_size = msgs[mi].msg_size;
                    const COMPONENT_T chosen = msgs[mi].component;

                    /* Message sizes must not decrease within a configuration */
                    if (mi > 0 && cur_msg_size < msgs[mi - 1].msg_size) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules "
                                            "Han found an issue on dynamic rules "
                                            "for collective %d "
                                            "on topological level %d "
                                            "with configuration size %d: "
                                            "message sizes %d and %d are "
                                            "not sorted by increasing value\n",
                                            coll,
                                            level,
                                            cur_conf_size,
                                            msgs[mi - 1].msg_size,
                                            cur_msg_size);
                    }

                    /* HAN itself is only reported as valid on the global level */
                    if (GLOBAL_COMMUNICATOR != level && HAN == chosen) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules "
                                            "Han found an issue on dynamic rules "
                                            "for collective %d "
                                            "on topological level %d "
                                            "with configuration size %d "
                                            "for message size %d: "
                                            "han collective component %d "
                                            "can only be activated for "
                                            "topology level %d\n",
                                            coll,
                                            level,
                                            cur_conf_size,
                                            cur_msg_size,
                                            HAN,
                                            GLOBAL_COMMUNICATOR);
                    }
                }
            }
        }
    }
}
/*
 * Print every message size rule currently stored in
 * mca_coll_han_component.dynamic_rules, one opal_output() line per rule.
 *
 * Rules are visited collective by collective, then by topologic level,
 * configuration and message size; the "Entry" number in each line is a
 * running counter starting at 0.
 */
void mca_coll_han_dump_dynamic_rules(void)
{
    const int coll_count = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *const colls =
        mca_coll_han_component.dynamic_rules.collective_rules;
    int entry = 0;
    int ci, ti, ri, mi;

    for (ci = 0; ci < coll_count; ci++) {
        const COLLTYPE_T coll = colls[ci].collective_id;
        const int topo_count = colls[ci].nb_topologic_levels;
        topologic_rule_t *const topos = colls[ci].topologic_rules;

        for (ti = 0; ti < topo_count; ti++) {
            const TOPO_LVL_T level = topos[ti].topologic_level;
            const int conf_count = topos[ti].nb_rules;
            configuration_rule_t *const confs = topos[ti].configuration_rules;

            for (ri = 0; ri < conf_count; ri++) {
                const int cur_conf_size = confs[ri].configuration_size;
                const int msg_count = confs[ri].nb_msg_size;
                msg_size_rule_t *const msgs = confs[ri].msg_size_rules;

                for (mi = 0; mi < msg_count; mi++, entry++) {
                    const int cur_msg_size = msgs[mi].msg_size;
                    const COMPONENT_T chosen = msgs[mi].component;

                    opal_output(mca_coll_han_component.han_output,
                                "coll:han:dump_dynamic_rules "
                                "Entry %d "
                                "collective %d (%s) "
                                "topology level %d (%s) "
                                "configuration size %d "
                                "mesage size %d "
                                "-> collective component %d (%s)\n",
                                entry,
                                coll,
                                mca_coll_han_colltype_to_str(coll),
                                level,
                                mca_coll_han_topo_lvl_to_str(level),
                                cur_conf_size,
                                cur_msg_size,
                                chosen,
                                components_name[chosen]);
                }
            }
        }
    }
}