1
1
* first import of Bull specific modifications to HAN

* Cleaning, renaming and compilation fixing. Changed all future into han.

* Import BULL specific modifications in coll/tuned and coll/base

* Fixed compilation issues in Han

* Changed han_output to directly point to coll framework output.

* The verbosity MCA parameter was removed as a duplicate of the coll verbosity

* Add fallback in han reduce when op cannot commute and ppn are imbalanced

* Added fallback for han bcast when nodes do not have the same number of processes

* Add fallback in han scatter when ppn are imbalanced

+ fixed missing scatter_fn pointer in the module interface

Signed-off-by: Brelle Emmanuel <emmanuel.brelle@atos.net>
Co-authored-by: a700850 <pierre.lemarinier@atos.net>
Co-authored-by: germainf <florent.germain@atos.net>
Этот коммит содержится в:
bsergentm 2020-05-09 18:16:32 +02:00 коммит произвёл George Bosilca
родитель 182c333b21
Коммит 220b997a58
22 изменённых файлов: 4775 добавлений и 509 удалений

Просмотреть файл

@ -21,6 +21,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -44,20 +45,12 @@
#include "opal/mca/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/*
* Local types
* Stuff for the OBJ interface
*/
struct avail_coll_t {
opal_list_item_t super;
int ac_priority;
mca_coll_base_module_2_3_0_t *ac_module;
const char * ac_component_name;
};
typedef struct avail_coll_t avail_coll_t;
OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL);
/*
* Local functions
@ -77,12 +70,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t *
int *priority,
mca_coll_base_module_2_3_0_t ** module);
/*
* Stuff for the OBJ interface
*/
static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL);
#define COPY(module, comm, func) \
do { \
if (NULL != module->coll_ ## func) { \
@ -138,11 +125,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
/* FIX ME - Do some kind of collective operation to find a module
that everyone has available */
/* List to store every valid module */
comm->c_coll->module_list = OBJ_NEW(opal_list_t);
/* do the selection loop */
for (item = opal_list_remove_first(selectable);
NULL != item; item = opal_list_remove_first(selectable)) {
avail_coll_t *avail = (avail_coll_t *) item;
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
/* initialize the module */
ret = avail->ac_module->coll_module_enable(avail->ac_module, comm);
@ -153,6 +143,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
(OMPI_SUCCESS == ret ? "Enabled": "Disabled") );
if (OMPI_SUCCESS == ret) {
/* Save every component that is initialized,
* queried and enabled successfully */
opal_list_append(comm->c_coll->module_list, &avail->super);
/* copy over any of the pointers */
COPY(avail->ac_module, comm, allgather);
@ -230,10 +223,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
COPY(avail->ac_module, comm, neighbor_alltoallw_init);
COPY(avail->ac_module, comm, reduce_local);
} else {
/* release the original module reference and the list item */
OBJ_RELEASE(avail->ac_module);
OBJ_RELEASE(avail);
}
/* release the original module reference and the list item */
OBJ_RELEASE(avail->ac_module);
OBJ_RELEASE(avail);
}
/* Done with the list from the check_components() call so release it. */
@ -306,8 +300,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
static int avail_coll_compare (opal_list_item_t **a,
opal_list_item_t **b) {
avail_coll_t *acoll = (avail_coll_t *) *a;
avail_coll_t *bcoll = (avail_coll_t *) *b;
mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a;
mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b;
if (acoll->ac_priority > bcoll->ac_priority) {
return 1;
@ -332,7 +326,7 @@ static opal_list_t *check_components(opal_list_t * components,
mca_base_component_list_item_t *cli;
mca_coll_base_module_2_3_0_t *module;
opal_list_t *selectable;
avail_coll_t *avail;
mca_coll_base_avail_coll_t *avail;
/* Make a list of the components that query successfully */
selectable = OBJ_NEW(opal_list_t);
@ -345,7 +339,7 @@ static opal_list_t *check_components(opal_list_t * components,
if (priority >= 0) {
/* We have a component that indicated that it wants to run
by giving us a module */
avail = OBJ_NEW(avail_coll_t);
avail = OBJ_NEW(mca_coll_base_avail_coll_t);
avail->ac_priority = priority;
avail->ac_module = module;
// Point to the string so we don't have to free later

Просмотреть файл

@ -16,6 +16,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -34,6 +35,7 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#define CLOSE(comm, func) \
do { \
@ -50,6 +52,8 @@
int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
{
opal_list_item_t *item;
CLOSE(comm, allgather);
CLOSE(comm, allgatherv);
CLOSE(comm, allreduce);
@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
CLOSE(comm, reduce_local);
for (item = opal_list_remove_first(comm->c_coll->module_list);
NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) {
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
if(avail->ac_module) {
OBJ_RELEASE(avail->ac_module);
}
OBJ_RELEASE(avail);
}
OBJ_RELEASE(comm->c_coll->module_list);
free(comm->c_coll);
comm->c_coll = NULL;

Просмотреть файл

@ -305,3 +305,39 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) {
}
OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL);
/* File reading functions */
/* Consume characters until a newline is read or EOF/error is reached.
 * Increments *fileline when the newline is found, so callers can keep
 * an accurate line count for error reporting. */
static void skiptonewline (FILE *fptr, int *fileline)
{
    char ch;

    while (1 == fread(&ch, sizeof(char), 1, fptr)) {
        if ('\n' == ch) {
            (*fileline)++;
            return;
        }
    }
    /* fread returned 0: EOF or read error, nothing left to skip */
}
/*
 * Read the next long integer from an open rules file.
 *
 * Numbers may be separated by arbitrary text; a '#' starts a comment
 * that runs to the end of the line.  *fileline is incremented for every
 * newline consumed, so callers can report accurate error locations.
 *
 * Returns the parsed value, or MYEOF once the end of file is reached.
 */
long ompi_coll_base_file_getnext (FILE *fptr, int *fileline)
{
    do {
        long val;
        int rc;
        char trash;

        rc = fscanf(fptr, "%li", &val);
        if (rc == EOF) return MYEOF;
        if (1 == rc) return val;
        /* in all other cases, skip one character and try again.
         * NOTE: fread() returns the number of items read (0 on EOF or
         * error) and never EOF, so the end-of-file test must compare
         * against 0, not EOF. */
        rc = fread(&trash, sizeof(char), 1, fptr);
        if (0 == rc) return MYEOF;
        if ('\n' == trash) (*fileline)++;
        if ('#' == trash) {
            skiptonewline (fptr, fileline);
        }
    } while (1);
}

Просмотреть файл

@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve)
typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t;
/*
* Structure to store an available module
*/
/* List item describing one collective module available on a communicator. */
struct mca_coll_base_avail_coll_t {
    /* Base class: makes instances storable in an opal_list_t */
    opal_list_item_t super;

    /* Priority returned by the component's query function; used to sort
     * the selection list */
    int ac_priority;
    /* The module handed back by the component's query function */
    mca_coll_base_module_t *ac_module;
    /* Component name (borrowed pointer, not owned/freed by this struct) */
    const char * ac_component_name;
};
typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t);
/**
* A MPI_like function doing a send and a receive simultaneously.
* If one of the communications results in a zero-byte message the
@ -164,5 +177,9 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
ompi_datatype_t * const stypes[],
ompi_datatype_t * const rtypes[]);
/* File reading function */
#define MYEOF -999
long ompi_coll_base_file_getnext(FILE *fptr, int *fileline);
END_C_DECLS
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

Просмотреть файл

@ -19,6 +19,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t {
mca_coll_base_module_reduce_local_fn_t coll_reduce_local;
mca_coll_base_module_2_3_0_t *coll_reduce_local_module;
/* List of modules initialized, queried and enabled */
opal_list_t *module_list;
};
typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t;

Просмотреть файл

@ -12,6 +12,8 @@
sources = \
coll_han.h \
coll_han_trigger.h \
coll_han_dynamic.h \
coll_han_dynamic_file.h \
coll_han_bcast.c \
coll_han_reduce.c \
coll_han_scatter.c \
@ -21,6 +23,10 @@ coll_han_allgather.c \
coll_han_component.c \
coll_han_module.c \
coll_han_trigger.c \
coll_han_dynamic.c \
coll_han_dynamic_file.c \
coll_han_topo.c \
coll_han_subcomms.c \
coll_han_utils.c
# Make the output library in this directory, and name it either

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -16,19 +17,23 @@
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/communicator/communicator.h"
#include "ompi/include/mpi.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "opal/util/info.h"
#include "ompi/op/op.h"
#include "opal/runtime/opal_progress.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "opal/util/output.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_han_trigger.h"
#include "ompi/mca/coll/han/coll_han_dynamic.h"
BEGIN_C_DECLS typedef struct {
BEGIN_C_DECLS
/*
* Today;
* . only 2 modules available for intranode (low) level
* . only 2 modules available for internode (up) level
*/
#define COLL_HAN_LOW_MODULES 2
#define COLL_HAN_UP_MODULES 2
typedef struct {
uint32_t umod;
uint32_t lmod;
uint32_t fs;
@ -200,6 +205,10 @@ typedef struct mca_coll_han_component_t {
uint32_t han_scatter_low_module;
/* whether enable auto tune */
uint32_t han_auto_tune;
/* whether we need reproducible results
* (but disables topological optimisations)
*/
uint32_t han_reproducible;
/* create a 3D array
* num_processes (n): 2 4 8 16 32 64 (6)
* num_core (c): 2 4 8 12 (4)
@ -209,8 +218,42 @@ typedef struct mca_coll_han_component_t {
uint32_t han_auto_tune_c;
uint32_t han_auto_tune_m;
selection *han_auto_tuned;
bool use_simple_algorithm[COLLCOUNT];
/* Dynamic configuration rules */
bool use_dynamic_file_rules;
bool dump_dynamic_rules;
char* dynamic_rules_filename;
/* Dynamic rules from file */
mca_coll_han_dynamic_rules_t dynamic_rules;
/* Dynamic rules from mca parameter */
COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL];
int topo_level;
/* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */
int max_dynamic_errors;
} mca_coll_han_component_t;
/* Generic function-pointer type used to access the union members below
 * without knowing which collective they belong to */
typedef void (*previous_dummy_fn_t) (void);

/*
 * Stores the previously selected routine and module for one collective,
 * so han can fall back to them when it cannot handle a call itself.
 * Only the union member matching the collective is meaningful.
 */
typedef struct collective_fallback_t {
    union {
        mca_coll_base_module_allgather_fn_t allgather;
        mca_coll_base_module_allgatherv_fn_t allgatherv;
        mca_coll_base_module_allreduce_fn_t allreduce;
        mca_coll_base_module_bcast_fn_t bcast;
        mca_coll_base_module_gather_fn_t gather;
        mca_coll_base_module_reduce_fn_t reduce;
        mca_coll_base_module_scatter_fn_t scatter;
        previous_dummy_fn_t dummy;
    } previous_routine;
    /* Module that owns the routine above; passed back on fallback calls */
    mca_coll_base_module_t *previous_module;
} collective_fallback_t;
/** Coll han module */
typedef struct mca_coll_han_module_t {
/** Base module */
@ -225,9 +268,56 @@ typedef struct mca_coll_han_module_t {
int *cached_vranks;
int *cached_topo;
bool is_mapbycore;
bool are_ppn_imbalanced;
/* To be able to fallback when the cases are not supported */
struct collective_fallback_t previous_routines[COLLCOUNT];
/* To be able to fallback on reproducible algorithm */
mca_coll_base_module_reduce_fn_t reproducible_reduce;
mca_coll_base_module_t *reproducible_reduce_module;
mca_coll_base_module_allreduce_fn_t reproducible_allreduce;
mca_coll_base_module_t *reproducible_allreduce_module;
/* Topological level of this communicator */
int topologic_level;
/* Collective module storage for module choice */
mca_coll_han_collective_modules_storage_t modules_storage;
bool storage_initialized;
/*
* Number of dynamic errors encountered
* The first mca_coll_han_component.max_dynamic_errors
* of rank 0 are printed with verbosity = 0
*/
int dynamic_errors;
/* Sub-communicator */
struct ompi_communicator_t *sub_comm[NB_TOPO_LVL];
} mca_coll_han_module_t;
OBJ_CLASS_DECLARATION(mca_coll_han_module_t);
/*
* Some defines to stick to the naming used in the other components in terms of
* fallback routines
*/
#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather
#define previous_allgatherv previous_routines[ALLGATHERV].previous_routine.allgatherv
#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce
#define previous_bcast previous_routines[BCAST].previous_routine.bcast
#define previous_gather previous_routines[GATHER].previous_routine.gather
#define previous_reduce previous_routines[REDUCE].previous_routine.reduce
#define previous_scatter previous_routines[SCATTER].previous_routine.scatter
#define previous_allgather_module previous_routines[ALLGATHER].previous_module
#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module
#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module
#define previous_bcast_module previous_routines[BCAST].previous_module
#define previous_gather_module previous_routines[GATHER].previous_module
#define previous_reduce_module previous_routines[REDUCE].previous_module
#define previous_scatter_module previous_routines[SCATTER].previous_module
/**
* Global component instance
*/
@ -244,17 +334,10 @@ int han_request_free(ompi_request_t ** request);
/* Subcommunicator creation */
void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module);
void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module);
/* Gather topology information */
int mca_coll_han_pow10_int(int pow_value);
int mca_coll_han_hostname_to_number(char *hostname, int size);
void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level);
void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level);
bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t *comm,
int num_topo_level);
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module,
int num_topo_level);
void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level);
/* Utils */
void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank,
@ -263,8 +346,47 @@ uint32_t han_auto_tuned_get_n(uint32_t n);
uint32_t han_auto_tuned_get_c(uint32_t c);
uint32_t han_auto_tuned_get_m(uint32_t m);
const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll);
const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl);
/** Dynamic component choice */
/*
* Get all the collective modules initialized on this communicator
* This function must be call at the start of every selector implementation
*/
int
mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm,
mca_coll_han_module_t *han_module);
int
mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_allgatherv_intra_dynamic(ALLGATHERV_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_intra_dynamic(ALLREDUCE_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_bcast_intra_dynamic(BCAST_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_gather_intra_dynamic(GATHER_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_intra_dynamic(REDUCE_BASE_ARGS,
mca_coll_base_module_t *module);
int
mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS,
mca_coll_base_module_t *module);
/* Bcast */
int mca_coll_han_bcast_intra_simple(void *buff,
int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff,
int seg_count, struct ompi_datatype_t *dtype,
int root_up_rank, int root_low_rank,
@ -278,6 +400,30 @@ int mca_coll_han_bcast_t0_task(void *task_argu);
int mca_coll_han_bcast_t1_task(void *task_argu);
/* Reduce */
int
mca_coll_han_reduce_intra_simple(const void *sbuf,
void* rbuf,
int count,
struct ompi_datatype_t *dtype,
ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_reduce_reproducible(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task,
void *sbuf,
void *rbuf, int seg_count, struct ompi_datatype_t *dtype,
@ -301,6 +447,26 @@ int mca_coll_han_reduce_t0_task(void *task_argu);
int mca_coll_han_reduce_t1_task(void *task_argu);
/* Allreduce */
int
mca_coll_han_allreduce_intra_simple(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int
mca_coll_han_allreduce_reproducible(const void *sbuf,
void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu,
mca_coll_task_t * cur_task,
void *sbuf,
@ -355,7 +521,7 @@ void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu,
struct ompi_communicator_t *low_comm,
int w_rank, bool noop, ompi_request_t * req);
/* Gatter */
/* Gather */
int
mca_coll_han_gather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
@ -380,6 +546,23 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int w_rank, bool noop, ompi_request_t * req);
int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
/* reordering after gather, for unordered ranks */
void
ompi_coll_han_reorder_gather(const void *sbuf,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
int * topo);
/* Allgather */
int
@ -405,6 +588,13 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu,
struct ompi_communicator_t *low_comm,
int w_rank,
bool noop, bool is_mapbycore, int *topo, ompi_request_t * req);
int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
END_C_DECLS
#endif /* MCA_COLL_HAN_EXPORT_H */

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -28,7 +29,10 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu,
struct ompi_communicator_t *up_comm,
struct ompi_communicator_t *low_comm,
int w_rank,
bool noop, bool is_mapbycore, int *topo, ompi_request_t * req)
bool noop,
bool is_mapbycore,
int *topo,
ompi_request_t * req)
{
argu->cur_task = cur_task;
argu->sbuf = sbuf;
@ -53,18 +57,17 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
struct ompi_communicator_t *comm,
mca_coll_base_module_t * module)
{
int w_rank;
w_rank = ompi_comm_rank(comm);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_allgather_low_module];
ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_allgather_up_module];
mca_coll_han_comm_create_new(comm, han_module);
ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
int low_rank = ompi_comm_rank(low_comm);
ompi_request_t *temp_request = NULL;
@ -160,7 +163,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu)
} else {
ptrdiff_t rsize, rgap = 0;
rsize =
opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size * up_size,
opal_datatype_span(&t->rdtype->super,
(int64_t) t->rcount * low_size * up_size,
&rgap);
reorder_buf = (char *) malloc(rsize);
reorder_rbuf = reorder_buf - rgap;
@ -190,7 +194,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu)
"[%d]: HAN Allgather copy from %d %d\n", t->w_rank,
(i * low_size + j) * 2 + 1,
t->topo[(i * low_size + j) * 2 + 1]));
ompi_datatype_copy_content_same_ddt(t->rdtype, (ptrdiff_t) t->rcount,
ompi_datatype_copy_content_same_ddt(t->rdtype,
(ptrdiff_t) t->rcount,
(char *) t->rbuf +
rextent *
(ptrdiff_t) t->topo[(i * low_size + j) * 2 +
@ -238,3 +243,108 @@ int mca_coll_han_allgather_lb_task(void *task_argu)
return OMPI_SUCCESS;
}
/*
 * Simple hierarchical allgather:
 *   1. gather inside each node onto the node leader (low_comm),
 *   2. allgather among the node leaders (up_comm), reordering the
 *      result if ranks are not mapped by core,
 *   3. broadcast the full result inside each node.
 * Falls back to the previously selected component when the number of
 * processes per node is imbalanced.
 */
int
mca_coll_han_allgather_intra_simple(const void *sbuf, int scount,
                                    struct ompi_datatype_t *sdtype,
                                    void* rbuf, int rcount,
                                    struct ompi_datatype_t *rdtype,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module){
    /* create the subcommunicators */
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    mca_coll_han_comm_create_new(comm, han_module);
    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
    /* discover the topology: needed both for the imbalance check and
     * for the reordering step below */
    int *topo = mca_coll_han_topo_init(comm, han_module, 2);

    /* unbalanced case needs algo adaptation */
    if (han_module->are_ppn_imbalanced){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle allgather with this communicator. It need to fall back on another component\n"));
        return han_module->previous_allgather(sbuf, scount, sdtype, rbuf,
                                              rcount, rdtype,
                                              comm, han_module->previous_allgather_module);
    }

    /* setup up/low coordinates */
    int w_rank = ompi_comm_rank(comm); /* fix: was used below without being declared */
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int up_rank = ompi_comm_rank(up_comm);
    int up_size = ompi_comm_size(up_comm);
    int root_low_rank = 0; /* node leader will be rank 0 on each node */

    /* allocate the intermediary buffer
     * to gather on leaders on the low sub communicator */
    char *tmp_buf = NULL;
    char *tmp_buf_start = NULL;
    if (low_rank == root_low_rank) {
        ptrdiff_t rsize, rgap = 0;
        /* Compute the size to receive all the local data, including datatype gaps */
        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap);
        /* intermediary buffer on node leaders to gather on low comm */
        tmp_buf = (char *) malloc(rsize);
        tmp_buf_start = tmp_buf - rgap;
    }

    /* 1. low gather on node leaders into tmp_buf */
    low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype,
                                  tmp_buf_start, rcount, rdtype, root_low_rank,
                                  low_comm, low_comm->c_coll->coll_gather_module);

    /* 2. allgather between node leaders, from tmp_buf to reorder_buf */
    if (low_rank == root_low_rank) {
        /* allocate a buffer to store the unordered result on node leaders.
         * If the processes are mapped-by core, no need to reorder:
         * ranks are distributed core first and node next,
         * in an increasing order for both patterns */
        char *reorder_buf = NULL;
        char *reorder_buf_start = NULL;
        if (han_module->is_mapbycore) {
            reorder_buf_start = rbuf;
        } else {
            if (0 == low_rank && 0 == up_rank) { /* first rank displays message */
                OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                     "[%d]: Han Allgather needs reordering: ", w_rank));
            }
            ptrdiff_t rsize, rgap = 0;
            rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap);
            reorder_buf = (char *) malloc(rsize);
            reorder_buf_start = reorder_buf - rgap;
        }

        /* 2a. inter node allgather */
        up_comm->c_coll->coll_allgather(tmp_buf_start, scount*low_size, sdtype,
                                        reorder_buf_start, rcount*low_size, rdtype,
                                        up_comm, up_comm->c_coll->coll_allgather_module);

        if (tmp_buf != NULL) {
            free(tmp_buf);
            tmp_buf = NULL;
            tmp_buf_start = NULL;
        }

        /* 2b. reorder the node leaders' data into rbuf.
         * If ranks are not mapped in topological order, data needs to be
         * reordered (see ompi_coll_han_reorder_gather). */
        if (!han_module->is_mapbycore) {
            ompi_coll_han_reorder_gather(reorder_buf_start,
                                         rbuf, rcount, rdtype,
                                         comm, topo);
            free(reorder_buf);
            reorder_buf = NULL;
        }
    }

    /* 3. low broadcast: leaders broadcast the full result on their nodes */
    low_comm->c_coll->coll_bcast(rbuf, rcount*low_size*up_size, rdtype,
                                 root_low_rank, low_comm,
                                 low_comm->c_coll->coll_bcast_module);

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -2,6 +2,8 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -78,6 +80,17 @@ mca_coll_han_allreduce_intra(const void *sbuf,
struct ompi_op_t *op,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
{
// Fallback to another component if the op cannot commute
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
if (! ompi_op_is_commute(op)) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle allreduce with this communicator."
"It need to fall back on another component\n"));
return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op,
comm, han_module->previous_allreduce_module);
}
ptrdiff_t extent, lb;
ompi_datatype_get_extent(dtype, &lb, &extent);
int w_rank;
@ -87,7 +100,6 @@ mca_coll_han_allreduce_intra(const void *sbuf,
ompi_datatype_type_size(dtype, &typelng);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm;
@ -393,3 +405,145 @@ int mca_coll_han_allreduce_t3_task(void *task_argu)
return OMPI_SUCCESS;
}
/*
 * Simple hierarchical allreduce:
 *   1. reduce inside each node onto the node leader (low_comm),
 *   2. allreduce in place among the node leaders (up_comm),
 *   3. broadcast the result inside each node.
 * Falls back to the previously selected component for non-commutative
 * operations or when an intra-node step fails.
 */
int
mca_coll_han_allreduce_intra_simple(const void *sbuf,
                                    void *rbuf,
                                    int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    ompi_communicator_t *low_comm;
    ompi_communicator_t *up_comm;
    int root_low_rank = 0;  /* node leader is rank 0 of each low_comm */
    int low_rank;
    int ret;
    mca_coll_han_component_t *cs = &mca_coll_han_component;
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    /* fix: the message previously named mca_coll_han_reduce_intra_simple */
    OPAL_OUTPUT_VERBOSE((10, cs->han_output,
                         "[OMPI][han] in mca_coll_han_allreduce_intra_simple\n"));

    /* Fallback to another component if the op cannot commute */
    if (! ompi_op_is_commute(op)) {
        OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                             "han cannot handle allreduce with this operation."
                             "It need to fall back on another component\n"));
        goto prev_allreduce;
    }

    mca_coll_han_comm_create_new(comm, han_module);
    low_comm = han_module->sub_comm[INTRA_NODE];
    up_comm = han_module->sub_comm[INTER_NODE];
    low_rank = ompi_comm_rank(low_comm);

    /* 1. Low_comm reduce onto the node leader */
    ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf,
                                        count, dtype, op, root_low_rank,
                                        low_comm, low_comm->c_coll->coll_reduce_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                             "HAN/ALLREDUCE: low comm reduce failed. "
                             "Falling back to another component\n"));
        goto prev_allreduce;
    }

    /* 2. Local roots perform a allreduce on the upper comm */
    if (low_rank == root_low_rank) {
        ret = up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, rbuf, count, dtype, op,
                                              up_comm, up_comm->c_coll->coll_allreduce_module);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                                 "HAN/ALLREDUCE: up comm allreduce failed. \n"));
            /*
             * Do not fallback in such a case: only root_low_ranks follow this
             * path, the other ranks are in another collective.
             * ==> Falling back would potentially lead to a hang.
             * Simply return the error
             */
            return ret;
        }
    }

    /* 3. Low_comm bcast of the result */
    ret = low_comm->c_coll->coll_bcast(rbuf, count, dtype,
                                       root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((30, cs->han_output,
                             "HAN/ALLREDUCE: low comm bcast failed. "
                             "Falling back to another component\n"));
        goto prev_allreduce;
    }
    return OMPI_SUCCESS;

 prev_allreduce:
    return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm,
                                          han_module->previous_allreduce_module);
}
/* Find a fallback on reproducible algorithm
* use tuned, or if impossible whatever available
*/
/*
 * Select a reproducible allreduce fallback for this communicator.
 * Prefers tuned, then basic; if neither provides an allreduce, keeps the
 * previously selected routine.  Always returns OMPI_SUCCESS.
 */
int
mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    const int w_rank = ompi_comm_rank(comm);
    const int candidates[] = {TUNED, BASIC};
    const size_t n_candidates = sizeof(candidates) / sizeof(candidates[0]);
    size_t i;

    /* populate the module storage with every module enabled on comm */
    mca_coll_han_get_all_coll_modules(comm, han_module);

    /* pick the first candidate that actually provides an allreduce */
    for (i = 0; i < n_candidates; i++) {
        mca_coll_base_module_t *candidate = han_module->modules_storage
                                                       .modules[candidates[i]]
                                                       .module_handler;
        if (NULL == candidate || NULL == candidate->coll_allreduce) {
            continue;
        }
        if (0 == w_rank) {
            opal_output_verbose(30, mca_coll_han_component.han_output,
                                "coll:han:allreduce_reproducible: "
                                "fallback on %s\n",
                                components_name[candidates[i]]);
        }
        han_module->reproducible_allreduce_module = candidate;
        han_module->reproducible_allreduce = candidate->coll_allreduce;
        return OMPI_SUCCESS;
    }

    /* fallback of the fallback: keep whatever was previously selected */
    if (0 == w_rank) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:allreduce_reproducible_decision: "
                            "no reproducible fallback\n");
    }
    han_module->reproducible_allreduce_module =
        han_module->previous_allreduce_module;
    han_module->reproducible_allreduce = han_module->previous_allreduce;
    return OMPI_SUCCESS;
}
/* Fallback on reproducible algorithm */
/* Invoke the reproducible allreduce previously chosen by
 * mca_coll_han_allreduce_reproducible_decision(). */
int
mca_coll_han_allreduce_reproducible(const void *sbuf,
                                    void *rbuf,
                                    int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    mca_coll_base_module_t *repro_module =
        han_module->reproducible_allreduce_module;

    return han_module->reproducible_allreduce(sbuf, rbuf, count, dtype,
                                              op, comm, repro_module);
}

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -63,10 +64,22 @@ mca_coll_han_bcast_intra(void *buff,
w_rank = ompi_comm_rank(comm);
int seg_count = count;
size_t typelng;
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle bcast with this communicator. It need to fall back on another component\n"));
return han_module->previous_bcast(buff, count, dtype, root,
comm, han_module->previous_bcast_module);
}
ompi_datatype_type_size(dtype, &typelng);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm;
@ -220,3 +233,60 @@ int mca_coll_han_bcast_t1_task(void *task_argu)
return OMPI_SUCCESS;
}
/*
 * Simple (non-segmented) hierarchical broadcast.
 *
 * The root first broadcasts on the inter-node communicator, then each node
 * leader broadcasts on its intra-node communicator. Falls back on the
 * previously selected bcast module when the processes-per-node distribution
 * is imbalanced, since the hierarchical scheme assumes a balanced layout.
 *
 * FIX: the topology/imbalance check is now performed *before* creating the
 * sub-communicators (as mca_coll_han_bcast_intra already does), so the
 * fallback path no longer pays for sub-communicator creation and never
 * touches han_module->sub_comm.
 */
int
mca_coll_han_bcast_intra_simple(void *buff,
                                int count,
                                struct ompi_datatype_t *dtype,
                                int root,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    int w_rank = ompi_comm_rank(comm);

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle bcast with this communicator. It need to fall back on another component\n"));
        return han_module->previous_bcast(buff, count, dtype, root,
                                          comm, han_module->previous_bcast_module);
    }
    OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output,
                         "[OMPI][han] in mca_coll_han_bcast_intra_simple\n"));

    /* create the subcommunicators */
    mca_coll_han_comm_create_new(comm, han_module);
    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];
    int *vranks = han_module->cached_vranks;
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    int root_low_rank;      /* root's rank inside its node */
    int root_up_rank;       /* root's rank among node leaders */

    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d]: root_low_rank %d root_up_rank %d\n",
                         w_rank, root_low_rank, root_up_rank));

    /* Node leaders (same intra-node rank as the root) receive inter-node */
    if (low_rank == root_low_rank) {
        up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm,
                                    up_comm->c_coll->coll_bcast_module);
        /* To remove when han has better sub-module selection.
           For now switching to ibcast enables to make runs with libnbc. */
        //ompi_request_t req;
        //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module);
        //ompi_request_wait(&req, MPI_STATUS_IGNORE);
    }
    /* Everyone participates in the intra-node broadcast */
    low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm,
                                 low_comm->c_coll->coll_bcast_module);
    return OMPI_SUCCESS;
}

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,6 +23,8 @@
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
/*
* Public string showing the coll ompi_han component version number
@ -84,6 +87,7 @@ mca_coll_han_component_t mca_coll_han_component = {
*/
static int han_open(void)
{
int param;
mca_coll_han_component_t *cs = &mca_coll_han_component;
if (cs->han_auto_tune) {
cs->han_auto_tuned =
@ -95,7 +99,16 @@ static int han_open(void)
2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file);
fclose(file);
}
return OMPI_SUCCESS;
/*
* Get the global coll verbosity: it will be ours
*/
cs->han_output = ompi_coll_base_framework.framework_output;
opal_output_verbose(1, cs->han_output,
"coll:han:component_open: done!");
cs->topo_level = GLOBAL_COMMUNICATOR;
return mca_coll_han_init_dynamic_rules();
}
@ -109,9 +122,89 @@ static int han_close(void)
free(cs->han_auto_tuned);
cs->han_auto_tuned = NULL;
}
mca_coll_han_free_dynamic_rules();
return OMPI_SUCCESS;
}
/*
 * Report whether han provides a "simple" variant for the given collective.
 * Only these five collectives currently ship a *_intra_simple implementation.
 */
static bool is_simple_implemented(COLLTYPE_T coll)
{
    return (ALLGATHER == coll) || (ALLREDUCE == coll) || (BCAST == coll)
        || (GATHER == coll) || (REDUCE == coll);
}
/*
 * Human-readable name of a topologic level. Used to build MCA parameter
 * names and verbose messages.
 */
const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl)
{
    if (INTRA_NODE == topo_lvl) {
        return "intra_node";
    }
    if (INTER_NODE == topo_lvl) {
        return "inter_node";
    }
    if (GLOBAL_COMMUNICATOR == topo_lvl) {
        return "global_communicator";
    }
    /* NB_TOPO_LVL or any out-of-range value */
    return "invalid topologic level";
}
/*
 * Human-readable name of a collective identifier. Used to build MCA
 * parameter names and verbose messages. An unknown identifier maps to the
 * empty string.
 */
const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll)
{
    switch (coll) {
        case ALLGATHER:           return "allgather";
        case ALLGATHERV:          return "allgatherv";
        case ALLREDUCE:           return "allreduce";
        case ALLTOALL:            return "alltoall";
        case ALLTOALLV:           return "alltoallv";
        case ALLTOALLW:           return "alltoallw";
        case BARRIER:             return "barrier";
        case BCAST:               return "bcast";
        case EXSCAN:              return "exscan";
        case GATHER:              return "gather";
        case GATHERV:             return "gatherv";
        case REDUCE:              return "reduce";
        case REDUCESCATTER:       return "reduce_scatter";
        case REDUCESCATTERBLOCK:  return "reduce_scatter_block";
        case SCAN:                return "scan";
        case SCATTER:             return "scatter";
        case SCATTERV:            return "scatterv";
        case NEIGHBOR_ALLGATHER:  return "neighbor_allgather";
        case NEIGHBOR_ALLGATHERV: return "neighbor_allgatherv";
        case NEIGHBOR_ALLTOALL:   return "neighbor_alltoall";
        case NEIGHBOR_ALLTOALLV:  return "neighbor_alltoallv";
        case NEIGHBOR_ALLTOALLW:  return "neighbor_alltoallw";
        default:                  return "";
    }
}
/*
* Register MCA params
@ -121,21 +214,20 @@ static int han_register(void)
mca_base_component_t *c = &mca_coll_han_component.super.collm_version;
mca_coll_han_component_t *cs = &mca_coll_han_component;
cs->han_priority = 50;
/* Generated parameters name and description */
char param_name[100] = "";
char param_desc[300] = "";
int param_desc_size;
COLLTYPE_T coll;
TOPO_LVL_T topo_lvl;
COMPONENT_T component;
cs->han_priority = 0;
(void) mca_base_component_var_register(c, "priority", "Priority of the han coll component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority);
int coll_han_verbose = 0;
(void) mca_base_component_var_register(c, "verbose",
"Verbose level",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &coll_han_verbose);
cs->han_output = opal_output_open(NULL);
opal_output_set_verbosity(cs->han_output, coll_han_verbose);
cs->han_bcast_segsize = 65536;
(void) mca_base_component_var_register(c, "bcast_segsize",
"segment size for bcast",
@ -254,6 +346,93 @@ static int han_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune);
cs->han_reproducible = 0;
(void) mca_base_component_var_register(c, "reproducible",
"whether we need reproducible results "
"(enabling this disables optimisations using topology)"
"0 disable 1 enable, default 0",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY,
&cs->han_reproducible);
/* Simple algorithms MCA parameters */
for(coll = 0 ; coll < COLLCOUNT ; coll++) {
cs->use_simple_algorithm[coll] = false;
if(is_simple_implemented(coll)) {
snprintf(param_name, 100, "use_simple_%s",
mca_coll_han_colltype_to_str(coll));
snprintf(param_desc, 300, "whether to enable simple algo for %s",
mca_coll_han_colltype_to_str(coll));
mca_base_component_var_register(c, param_name,
param_desc,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->use_simple_algorithm[coll]));
}
}
/* Dynamic rules MCA parameters */
/* TODO: Find a way to avoid unused entried */
memset(cs->mca_rules, 0,
COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T));
for(coll = 0 ; coll < COLLCOUNT ; coll++) {
if(!mca_coll_han_is_coll_dynamic_implemented(coll)) {
continue;
}
/*
* Default values
     * Make sure to set correct default parameters
*/
cs->mca_rules[coll][INTRA_NODE] = TUNED;
cs->mca_rules[coll][INTER_NODE] = BASIC;
cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN;
for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) {
snprintf(param_name, 100, "%s_dynamic_%s_module",
mca_coll_han_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl));
param_desc_size = snprintf(param_desc, 300,
"Collective module to use for "
"collective %s on %s topological level: ",
mca_coll_han_colltype_to_str(coll),
mca_coll_han_topo_lvl_to_str(topo_lvl));
/*
* Exhaustive description:
* 0 = self; 1 = basic; 2 = libnbc; ...
* FIXME: Do not print component not providing this collective
*/
for(component = 0 ; component < COMPONENTS_COUNT ; component++) {
if(HAN == component && GLOBAL_COMMUNICATOR != topo_lvl) {
/* Han can only be used on the global communicator */
continue;
}
param_desc_size += snprintf(param_desc+param_desc_size, 300,
"%d = %s; ",
component,
components_name[component]);
}
mca_base_component_var_register(c, param_name, param_desc,
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->mca_rules[coll][topo_lvl]));
}
}
/*
* TODO: remove the following lines when auto-tune is added back to the code
*/
cs->han_auto_tune = 0;
cs->han_auto_tune_n = 5;
cs->han_auto_tune_c = 3;
cs->han_auto_tune_m = 21;
#if 0
cs->han_auto_tune_n = 5;
(void) mca_base_component_var_register(c, "auto_tune_n",
"auto tune n",
@ -273,7 +452,65 @@ static int han_register(void)
"auto tune n",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_m);
MCA_BASE_VAR_SCOPE_READONLY,
&cs->han_auto_tune_m);
#endif
/* Dynamic rules */
cs->use_dynamic_file_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"use_dynamic_file_rules",
"Switch used to decide if we use "
"dynamic module choice rules "
"defines by file",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->use_dynamic_file_rules));
cs->dynamic_rules_filename = NULL;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dynamic_rules_filename",
"Filename of configuration file that "
"contains the dynamic module choice rules",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->dynamic_rules_filename));
cs->dump_dynamic_rules = false;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"dump_dynamic_rules",
"Switch used to decide if we dump "
"dynamic rules provided by "
"configuration file",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->dump_dynamic_rules));
if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename)
&& !cs->use_dynamic_file_rules) {
opal_output_verbose(0, cs->han_output,
"coll:han:han_register "
"you asked for dynamic rules "
"but they are not activated. "
"Check coll_han_use_dynamic_file_rules "
"MCA parameter");
}
cs->max_dynamic_errors = 10;
(void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version,
"max_dynamic_errors",
"Number of dynamic rules module/function "
"errors printed on rank 0 "
"with a 0 verbosity."
"Useless if coll_base_verbose is 30 or more.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&(cs->max_dynamic_errors));
return OMPI_SUCCESS;
}

1338
ompi/mca/coll/han/coll_han_dynamic.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

214
ompi/mca/coll/han/coll_han_dynamic.h Обычный файл
Просмотреть файл

@ -0,0 +1,214 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_DYNAMIC_H
#define MCA_COLL_HAN_DYNAMIC_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/han/coll_han.h"
/*
* #################################################
* # Dynamic rules global architecture description #
* #################################################
*
* Han dynamic rules allow the user to define the collective
* module to call depending the topological configuration of the
* sub-communicators and the collective parameters. This mechanism
* can also be used to fallback the main collective on another module.
* The interface is described in coll_han_dynamic_file.h.
*
* #############################
* # Collective module storage #
* #############################
* To be able to switch between multiple collective modules, han
* directly accesses the module on the communicator. This information is
* stored in the collective structure of the communicator during the collective
* module choice at the communicator initialization. When han needs this
 * information for the first time, it identifies the modules by their name and
* stores them in its module structure.
* Then, the modules are identified by their identifier.
*
* #########################
* # Dynamic rules storage #
* #########################
* There are two types of dynamic rules:
* - MCA parameter defined rules
* - File defined rules
*
* MCA parameter defined rules are stored in mca_coll_han_component.mca_rules.
 * This is a double indexed table. The first index is the corresponding collective
* communication and the second index is the topological level aimed by the rule.
* These parameters define the collective component to use for a specific
* collective communication on a specific topologic level.
*
* File defined rules are stored in mca_coll_han_component.dynamic_rules.
 * These structures are defined below. The rule storage is directly deduced
* from the rule file format.
*
* File defined rules precede MCA parameter defined rules.
*
* #######################
* # Dynamic rules usage #
* #######################
* To choose which collective module to use on a specific configuration, han
* adds an indirection on the collective call: dynamic choice functions. These
* functions do not implement any collective. First, they try to find a dynamic
* rule from file for the given collective. If there is not any rule for the
 * given configuration, MCA parameter defined rules are used. Once the module
* to use is found, the correct collective implementation is called.
*
* This indirection is also used on the global communicator. This allows han
* to provide a fallback mechanism considering the collective parameters.
*
* ##############################
* # Dynamic rules choice logic #
* ##############################
* Dynamic rules choice is made with a stack logic. Each new rule precedes
* already defined rules. MCA parameters rules are the stack base. When
* a rule is needed, rules are read as a stack and the first corresponding
* encountered is chosen.
*
* Consequences:
* - If a collective identifier appears multiple times, only the last
* will be considered
* - If a topological level appears multiple times for a collective,
* only the last will be considered
* - If configuration rules or message size rules are not stored
* by increasing value, some of them will not be considered
*/
BEGIN_C_DECLS
/* Dynamic rules support */
typedef enum COMPONENTS {
SELF=0,
BASIC,
LIBNBC,
TUNED,
SM,
SHARED,
ADAPT,
HAN,
COMPONENTS_COUNT
} COMPONENT_T;
static const char *components_name[]={"self",
"basic",
"libnbc",
"tuned",
"sm",
"shared",
"adapt",
"han"};
/* Topologic levels */
typedef enum TOPO_LVL {
INTRA_NODE=0,
INTER_NODE,
/* Identifies the global communicator as a topologic level */
GLOBAL_COMMUNICATOR,
NB_TOPO_LVL
} TOPO_LVL_T;
/* Rule for a specific msg size
* in a specific configuration
* for a specific collective
* in a specific topologic level */
typedef struct msg_size_rule_s {
COLLTYPE_T collective_id;
TOPO_LVL_T topologic_level;
int configuration_size;
/* Message size of the rule */
int msg_size;
/* Component to use on this specific configuration
* and message size */
COMPONENT_T component;
} msg_size_rule_t;
/* Rule for a specific configuration
* considering a specific collective
* in a specific topologic level */
typedef struct configuration_rule_s {
COLLTYPE_T collective_id;
TOPO_LVL_T topologic_level;
/* Number of elements of the actual topologic level
* per element of the upper topologic level */
int configuration_size;
/* Number of message size rules for this configuration */
int nb_msg_size;
/* Table of message size rules for this configuration */
msg_size_rule_t *msg_size_rules;
} configuration_rule_t;
/* Set of dynamic rules for a specific collective
* in a specific topologic level */
typedef struct topologic_rule_s {
/* Collective identifier */
COLLTYPE_T collective_id;
/* Topologic level of the rule */
TOPO_LVL_T topologic_level;
/* Rule number */
int nb_rules;
/* Table of configuration rules
* for this collective on this topologic level */
configuration_rule_t *configuration_rules;
} topologic_rule_t;
/* Set of dynamic rules for a collective */
typedef struct collective_rule_s {
COLLTYPE_T collective_id;
/* Number of topologic level for this collective */
int nb_topologic_levels;
/* Table of topologic level rules
* for this collective */
topologic_rule_t *topologic_rules;
} collective_rule_t;
/* Global dynamic rules structure */
typedef struct mca_coll_han_dynamic_rule_s {
int nb_collectives;
collective_rule_t *collective_rules;
} mca_coll_han_dynamic_rules_t;
/* Module storage */
typedef struct collective_module_storage_s {
/* Module */
mca_coll_base_module_t *module_handler;
} collective_module_storage_t;
/* Table of module storage */
typedef struct mca_coll_han_collective_modules_storage_s {
/* */
collective_module_storage_t modules[COMPONENTS_COUNT];
} mca_coll_han_collective_modules_storage_t;
/* Tests if a dynamic collective is implemented */
bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id);
END_C_DECLS
#endif

690
ompi/mca/coll/han/coll_han_dynamic_file.c Обычный файл
Просмотреть файл

@ -0,0 +1,690 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STDIO_H
#include <stdio.h>
#endif
#include "ompi_config.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
#include "coll_han_dynamic_file.h"
#include "ompi/mca/coll/base/coll_base_util.h"
static void check_dynamic_rules(void);
/* Current file line for verbose message */
static int fileline = 1;
#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline)
/*
 * Parse the dynamic rule file named by the coll_han_dynamic_rules_filename
 * MCA parameter and populate mca_coll_han_component.dynamic_rules.
 *
 * Expected file layout (integers read in sequence via getnext):
 *   <collective count>
 *     <collective id> <topologic level count>
 *       <topologic level> <configuration rule count>
 *         <configuration size> <message size rule count>
 *           <message size> <component id>
 *
 * Returns OMPI_SUCCESS when the rules were read, and also when the file is
 * absent or malformed (the MCA-parameter defined rules are used instead).
 * Returns OMPI_ERROR only when a rule table allocation fails.
 *
 * FIX: the rule file is now closed on the allocation-failure path
 * (cannot_allocate), which previously leaked the FILE handle; a dead store
 * to nb_coll in the early-return path was also removed.
 */
int
mca_coll_han_init_dynamic_rules(void)
{
    /* File management */
    const char *fname;
    FILE *fptr = NULL;
    int nb_entries = 0;          /* number of leaf (message size) rules read */
    /* Loop counters */
    int i, j, k, l;
    /* Collective informations */
    int nb_coll;
    COLLTYPE_T coll_id;
    collective_rule_t *coll_rules;
    /* Topo informations */
    int nb_topo;
    TOPO_LVL_T topo_lvl;
    topologic_rule_t *topo_rules;
    /* Configuration informations */
    int nb_rules, conf_size;
    configuration_rule_t *conf_rules;
    /* Message size informations */
    int nb_msg_size, msg_size;
    msg_size_rule_t *msg_size_rules;
    /* Component informations */
    COMPONENT_T component;

    /* If the dynamic rules are not used, do not even read the file */
    if(!mca_coll_han_component.use_dynamic_file_rules) {
        return OMPI_SUCCESS;
    }

    fname = mca_coll_han_component.dynamic_rules_filename;
    if(NULL == fname) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules "
                            "coll_han_use_dynamic_file_rules is true but "
                            "coll_han_dynamic_rules_filename is not set: "
                            "coll han will use dynamic rules from mca "
                            "parameters and their default value\n");
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        return OMPI_SUCCESS;
    }

    fptr = fopen(fname, "r");
    if(NULL == fptr) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules "
                            "cannot open dynamic file provided by "
                            "coll_han_dynamic_rules_filename=%s "
                            "please provide it with full path and "
                            "check file permissions. Rules from "
                            "MCA parameters will be used instead\n",
                            fname);
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        return OMPI_SUCCESS;
    }

    /* The first information of the file is the collective count */
    nb_coll = getnext(fptr);
    if(nb_coll <= 0) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules "
                            "found an error on dynamic rules file %s "
                            "at line %d: an invalid value %d is given "
                            "for collective count "
                            "or the reader encountered an unexpected EOF\n",
                            fname, fileline, nb_coll);
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        goto file_reading_error;
    }
    mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll;

    /* Allocate collective rules */
    coll_rules = malloc(nb_coll * sizeof(collective_rule_t));
    mca_coll_han_component.dynamic_rules.collective_rules = coll_rules;
    if(NULL == coll_rules) {
        mca_coll_han_component.dynamic_rules.nb_collectives = 0;
        goto cannot_allocate;
    }

    /* Iterates on collective rules */
    for(i=0 ; i<nb_coll ; i++) {
        /* Get the collective identifier */
        coll_id = getnext(fptr);
        if(coll_id < ALLGATHER || coll_id >= COLLCOUNT) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules "
                                "invalid collective id %d at line %d: the collective "
                                "must be at least %d and less than %d\n",
                                coll_id, fileline, ALLGATHER, COLLCOUNT);
            coll_rules[i].nb_topologic_levels = 0;
            mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
            goto file_reading_error;
        }
        if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {
            /* Not an error: the rule set is read but will never match */
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules "
                                "found an error on dynamic rules file %s "
                                "read collective id %d at line %d "
                                "but this collective is not implemented yet. "
                                "This is not an error but this set of rules "
                                "will not be used\n",
                                fname, coll_id, fileline);
        }

        /*
         * The first information of a collective rule
         * is the number of topologic rules
         */
        nb_topo = getnext(fptr);
        if(nb_topo < 0) {
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules "
                                "found an error on dynamic rules file %s "
                                "at line %d: an invalid value %d is given "
                                "for topo level count "
                                "or the reader encountered an unexpected EOF\n",
                                fname, fileline, nb_topo);
            coll_rules[i].nb_topologic_levels = 0;
            mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
            goto file_reading_error;
        }

        /* Store the collective rule informations */
        coll_rules[i].collective_id = coll_id;
        coll_rules[i].nb_topologic_levels = nb_topo;
        if(0 == nb_topo) {
            /* Empty rule set: warn and move to the next collective */
            opal_output_verbose(5, mca_coll_han_component.han_output,
                                "coll:han:mca_coll_han_init_dynamic_rules "
                                "Warning on dynamic rules file %s "
                                "at line %d: an invalid value %d is given "
                                "for topo level count\n",
                                fname, fileline, nb_topo);
            continue;
        }

        /* Allocate topologic rules */
        topo_rules = malloc(nb_topo * sizeof(topologic_rule_t));
        coll_rules[i].topologic_rules = topo_rules;
        if(NULL == topo_rules) {
            coll_rules[i].nb_topologic_levels = 0;
            mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
            goto cannot_allocate;
        }

        /* Iterates on topologic rules */
        for(j=0 ; j<nb_topo ; j++) {
            /* Get the topologic level identifier */
            topo_lvl = getnext(fptr);
            if(topo_lvl < INTRA_NODE || topo_lvl >= NB_TOPO_LVL) {
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules "
                                    "found an error on dynamic rules file %s "
                                    "at line %d: an invalid topo level %d is given "
                                    "or the reader encountered an unexpected EOF. "
                                    "Topologic level must be at least %d and "
                                    "less than %d\n",
                                    fname, fileline, topo_lvl,
                                    INTRA_NODE, NB_TOPO_LVL);
                topo_rules[j].nb_rules = 0;
                coll_rules[i].nb_topologic_levels = j+1;
                mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                goto file_reading_error;
            }

            /*
             * The first information of a topologic rule
             * is the number of configurations
             */
            nb_rules = getnext(fptr);
            if(nb_rules < 0) {
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules "
                                    "found an error on dynamic rules file %s "
                                    "at line %d: an invalid value %d "
                                    "is given for rules count "
                                    "or the reader encountered an unexpected EOF\n",
                                    fname, fileline, nb_rules);
                topo_rules[j].nb_rules = 0;
                coll_rules[i].nb_topologic_levels = j+1;
                mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                goto file_reading_error;
            }

            /* Store the topologic rule informations */
            topo_rules[j].collective_id = coll_id;
            topo_rules[j].topologic_level = topo_lvl;
            topo_rules[j].nb_rules = nb_rules;
            if(0 == nb_rules) {
                /* Empty rule set: warn and move to the next topo level */
                opal_output_verbose(5, mca_coll_han_component.han_output,
                                    "coll:han:mca_coll_han_init_dynamic_rules "
                                    "Warning on dynamic rules file %s "
                                    "at line %d: an invalid value %d is given "
                                    "for configuration rules count\n",
                                    fname, fileline, nb_rules);
                continue;
            }

            /* Allocate configuration rules */
            conf_rules = malloc(nb_rules * sizeof(configuration_rule_t));
            topo_rules[j].configuration_rules = conf_rules;
            if(NULL == conf_rules) {
                topo_rules[j].nb_rules = 0;
                coll_rules[i].nb_topologic_levels = j+1;
                mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                goto cannot_allocate;
            }

            /* Iterate on configuration rules */
            for(k=0 ; k<nb_rules ; k++) {
                /* Get the configuration size
                 * (the first one of a topo level must be 1) */
                conf_size = getnext(fptr);
                if(conf_size < 1 || (0 == k && conf_size > 1)) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules "
                                        "invalid configuration size %d at line %d "
                                        "or the reader encountered an unexpected EOF "
                                        "the configuration size must be at least %d "
                                        "and the first configuration size "
                                        "of a topologic level must be %d\n",
                                        conf_size, fileline, 1, 1);
                    conf_rules[k].nb_msg_size = 0;
                    topo_rules[j].nb_rules = k+1;
                    coll_rules[i].nb_topologic_levels = j+1;
                    mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                    goto file_reading_error;
                }

                /*
                 * The first information of a configuration rule
                 * is the number of message size rules
                 */
                nb_msg_size = getnext(fptr);
                if(nb_msg_size < 0) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules "
                                        "found an error on dynamic rules file %s "
                                        "at line %d: an invalid value %d "
                                        "is given for message size rules count "
                                        "or the reader encountered an unexpected EOF\n",
                                        fname, fileline, nb_msg_size);
                    conf_rules[k].nb_msg_size = 0;
                    topo_rules[j].nb_rules = k+1;
                    coll_rules[i].nb_topologic_levels = j+1;
                    mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                    goto file_reading_error;
                }

                /* Store configuration rule information */
                conf_rules[k].collective_id = coll_id;
                conf_rules[k].topologic_level = topo_lvl;
                conf_rules[k].configuration_size = conf_size;
                conf_rules[k].nb_msg_size = nb_msg_size;
                if(0 == nb_msg_size) {
                    /* Empty rule set: warn and move to the next configuration */
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:mca_coll_han_init_dynamic_rules "
                                        "Warning on dynamic rules file %s "
                                        "at line %d: an invalid value %d is given "
                                        "for message size rules count\n",
                                        fname, fileline, nb_msg_size);
                    continue;
                }

                /* Allocate message size rules */
                msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t));
                conf_rules[k].msg_size_rules = msg_size_rules;
                if(NULL == msg_size_rules) {
                    conf_rules[k].nb_msg_size = 0;
                    topo_rules[j].nb_rules = k+1;
                    coll_rules[i].nb_topologic_levels = j+1;
                    mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                    goto cannot_allocate;
                }

                /* Iterate on message size rules */
                for(l=0 ; l<nb_msg_size ; l++) {
                    /* Get the message size
                     * (the first one of a configuration must be 0) */
                    msg_size = getnext(fptr);
                    if(msg_size < 0
                       || (0 ==l && msg_size > 1)) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules "
                                            "found an error on dynamic rules file %s "
                                            "at line %d: an invalid value %d "
                                            "is given for message size "
                                            "or the reader encountered "
                                            "an unexpected EOF. "
                                            "The first message size rule of "
                                            "a configuration must be 0\n",
                                            fname, fileline, msg_size);
                        conf_rules[k].nb_msg_size = l+1;
                        topo_rules[j].nb_rules = k+1;
                        coll_rules[i].nb_topologic_levels = j+1;
                        mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                        goto file_reading_error;
                    }

                    /* Get the component identifier for this message size rule */
                    component = getnext(fptr);
                    if(component < SELF || component >= COMPONENTS_COUNT) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:mca_coll_han_init_dynamic_rules "
                                            "found an error on dynamic rules file %s "
                                            "at line %d: an invalid collective "
                                            "component id %d is given or the "
                                            "reader encountered an unexpected EOF. "
                                            "Collective component id must be at "
                                            "least %d and less than %d\n",
                                            fname, fileline, component,
                                            SELF, COMPONENTS_COUNT);
                        conf_rules[k].nb_msg_size = l+1;
                        topo_rules[j].nb_rules = k+1;
                        coll_rules[i].nb_topologic_levels = j+1;
                        mca_coll_han_component.dynamic_rules.nb_collectives = i+1;
                        goto file_reading_error;
                    }

                    /* Store message size rule informations */
                    msg_size_rules[l].collective_id = coll_id;
                    msg_size_rules[l].topologic_level = topo_lvl;
                    msg_size_rules[l].configuration_size = conf_size;
                    msg_size_rules[l].msg_size = msg_size;
                    msg_size_rules[l].component = component;
                    nb_entries++;
                }
            }
        }
    }

    /* Every rule has been read; anything left in the file is suspicious */
    if(MYEOF != getnext(fptr)) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:mca_coll_han_init_dynamic_rules "
                            "Warning on file %s at line %d: "
                            "rule reading is over but reader does not seem "
                            "to have reached the end of the file\n",
                            fname, fileline);
    }
    opal_output_verbose(5, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules "
                        "read %d rules from %s\n",
                        nb_entries, fname);
    if(mca_coll_han_component.dump_dynamic_rules) {
        mca_coll_han_dump_dynamic_rules();
    }
    fclose(fptr);
    check_dynamic_rules();
    return OMPI_SUCCESS;

cannot_allocate:
    /* The dynamic rules allocation failed
     * Free the already allocated rules and return a failure
     */
    opal_output_verbose(0, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules "
                        "cannot allocate dynamic rules\n");
    /* BUGFIX: close the rule file; this path previously leaked fptr.
     * All jumps here happen after a successful fopen, so fptr is valid. */
    fclose(fptr);
    /* Do not check free_dynamic_rules
     * because we are returning OMPI_ERROR anyway */
    mca_coll_han_free_dynamic_rules();
    return OMPI_ERROR;

file_reading_error:
    opal_output_verbose(0, mca_coll_han_component.han_output,
                        "coll:han:mca_coll_han_init_dynamic_rules "
                        "could not fully read dynamic rules file. "
                        "Will use mca parameters defined rules. "
                        "To see error detail, please set "
                        "collective verbosity level over 5\n");
    if(fptr) {
        fclose (fptr);
    }
    mca_coll_han_free_dynamic_rules();
    return OMPI_SUCCESS;
}
void
mca_coll_han_free_dynamic_rules(void)
{
/* Loop counters */
int i, j, k;
/* Loop ranges */
int nb_coll, nb_topo, nb_conf;
/* Aliases */
collective_rule_t *coll_rules;
topologic_rule_t *topo_rules;
configuration_rule_t *conf_rules;
nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;
for(i=0 ; i<nb_coll ; i++) {
nb_topo = coll_rules[i].nb_topologic_levels;
topo_rules = coll_rules[i].topologic_rules;
for(j=0 ; j<nb_topo ; j++) {
nb_conf = topo_rules[j].nb_rules;
conf_rules = topo_rules[j].configuration_rules;
for(k=0 ; k<nb_conf ; k++) {
if(conf_rules[k].nb_msg_size > 0) {
free(conf_rules[k].msg_size_rules);
}
}
if(nb_conf > 0) {
free(conf_rules);
}
}
if(nb_topo > 0) {
free(topo_rules);
}
}
if(nb_coll > 0) {
free(coll_rules);
}
mca_coll_han_component.dynamic_rules.nb_collectives = 0;
}
/*
 * Sanity-check the parsed dynamic rule set and report any logical
 * inconsistency at verbosity level 5.  Two properties are verified:
 *   - configuration-size rules and message-size rules must be sorted by
 *     increasing value, otherwise some rules can never match;
 *   - the HAN component itself may only be selected on the
 *     GLOBAL_COMMUNICATOR topological level.
 * The rules themselves are left untouched: a reported issue is a
 * warning, not an error.
 */
static void check_dynamic_rules(void)
{
    int c, t, r, m;

    const int nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
    collective_rule_t *coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;

    for (c = 0; c < nb_coll; c++) {
        COLLTYPE_T coll_id = coll_rules[c].collective_id;
        const int nb_topo = coll_rules[c].nb_topologic_levels;
        topologic_rule_t *topo_rules = coll_rules[c].topologic_rules;

        for (t = 0; t < nb_topo; t++) {
            TOPO_LVL_T topo_lvl = topo_rules[t].topologic_level;
            const int nb_rules = topo_rules[t].nb_rules;
            configuration_rule_t *conf_rules = topo_rules[t].configuration_rules;

            for (r = 0; r < nb_rules; r++) {
                int conf_size = conf_rules[r].configuration_size;
                const int nb_msg_size = conf_rules[r].nb_msg_size;
                msg_size_rule_t *msg_size_rules = conf_rules[r].msg_size_rules;

                /* Configuration sizes must appear in increasing order */
                if (r >= 1 && conf_rules[r-1].configuration_size > conf_size) {
                    opal_output_verbose(5, mca_coll_han_component.han_output,
                                        "coll:han:check_dynamic_rules "
                                        "Han found an issue on dynamic rules "
                                        "for collective %d "
                                        "on topological level %d: "
                                        "configuration sizes %d and %d are "
                                        "not sorted by increasing value\n",
                                        coll_id,
                                        topo_lvl,
                                        conf_rules[r-1].configuration_size,
                                        conf_size);
                }

                for (m = 0; m < nb_msg_size; m++) {
                    int msg_size = msg_size_rules[m].msg_size;
                    COMPONENT_T component = msg_size_rules[m].component;

                    /* Message sizes must appear in increasing order */
                    if (m >= 1 && msg_size_rules[m-1].msg_size > msg_size) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules "
                                            "Han found an issue on dynamic rules "
                                            "for collective %d "
                                            "on topological level %d "
                                            "with configuration size %d: "
                                            "message sizes %d and %d are "
                                            "not sorted by increasing value\n",
                                            coll_id,
                                            topo_lvl,
                                            conf_size,
                                            msg_size_rules[m-1].msg_size,
                                            msg_size);
                    }

                    /* HAN may only re-select itself on the user communicator */
                    if (HAN == component
                        && GLOBAL_COMMUNICATOR != topo_lvl) {
                        opal_output_verbose(5, mca_coll_han_component.han_output,
                                            "coll:han:check_dynamic_rules "
                                            "Han found an issue on dynamic rules "
                                            "for collective %d "
                                            "on topological level %d "
                                            "with configuration size %d "
                                            "for message size %d: "
                                            "han collective component %d "
                                            "can only be activated for "
                                            "topology level %d\n",
                                            coll_id,
                                            topo_lvl,
                                            conf_size,
                                            msg_size,
                                            HAN,
                                            GLOBAL_COMMUNICATOR);
                    }
                }
            }
        }
    }
}
/*
 * Dump the whole dynamic rule set on the han output stream, one line per
 * message-size rule, tagged with a running entry index.  Useful for
 * verifying what the rule-file reader actually parsed.
 */
void mca_coll_han_dump_dynamic_rules(void)
{
    int nb_entries = 0;   /* running index of the printed rule */
    /* Loop counters */
    int i, j, k, l;
    /* Collective informations */
    int nb_coll;
    COLLTYPE_T coll_id;
    collective_rule_t *coll_rules;
    /* Topo informations */
    int nb_topo;
    TOPO_LVL_T topo_lvl;
    topologic_rule_t *topo_rules;
    /* Configuration informations */
    int nb_rules, conf_size;
    configuration_rule_t *conf_rules;
    /* Message size informations */
    int nb_msg_size, msg_size;
    msg_size_rule_t *msg_size_rules;
    /* Component informations */
    COMPONENT_T component;

    nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives;
    coll_rules = mca_coll_han_component.dynamic_rules.collective_rules;

    for(i=0 ; i<nb_coll ; i++) {
        coll_id = coll_rules[i].collective_id;
        nb_topo = coll_rules[i].nb_topologic_levels;
        topo_rules = coll_rules[i].topologic_rules;
        for(j=0 ; j<nb_topo ; j++) {
            topo_lvl = topo_rules[j].topologic_level;
            nb_rules = topo_rules[j].nb_rules;
            conf_rules = topo_rules[j].configuration_rules;
            for(k=0 ; k<nb_rules ; k++) {
                conf_size = conf_rules[k].configuration_size;
                nb_msg_size = conf_rules[k].nb_msg_size;
                msg_size_rules = conf_rules[k].msg_size_rules;
                for(l=0 ; l<nb_msg_size ; l++) {
                    msg_size = msg_size_rules[l].msg_size;
                    component = msg_size_rules[l].component;
                    /* BUGFIX: "mesage size" typo corrected in the output */
                    opal_output(mca_coll_han_component.han_output,
                                "coll:han:dump_dynamic_rules "
                                "Entry %d "
                                "collective %d (%s) "
                                "topology level %d (%s) "
                                "configuration size %d "
                                "message size %d "
                                "-> collective component %d (%s)\n",
                                nb_entries,
                                coll_id,
                                mca_coll_han_colltype_to_str(coll_id),
                                topo_lvl,
                                mca_coll_han_topo_lvl_to_str(topo_lvl),
                                conf_size,
                                msg_size,
                                component,
                                components_name[component]);
                    nb_entries++;
                }
            }
        }
    }
}

111
ompi/mca/coll/han/coll_han_dynamic_file.h Обычный файл
Просмотреть файл

@ -0,0 +1,111 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_HAN_DYNAMIC_FILE_H
#define MCA_COLL_HAN_DYNAMIC_FILE_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/util/output.h"
/*
* ############################
* # MCA parameters interface #
* ############################
* An MCA parameter defined rule allows the user to choose which collective
* module will be used for a specific collective communication on a specific
* topological level. The standard name for these parameters is:
* [collective]_dynamic_[topologic_level]_module
*
* #######################
* # Dynamic file format #
* #######################
* File defined rules precede MCA parameter defined rule.
* To activate file reader, the MCA parameter use_dynamic_file_rules must
* be set to true. The path to the dynamic file is given by the MCA
* parameter dynamic_rules_filename. If there is any issue reading the file,
* the file is considered as invalid and only MCA parameter defined rules are
* used. If a potential logical issue is identified in the file, a
* warning is printed but the file is not considered as invalid.
*
* The file is built recursively.
* A set of rules of a type is built as follows:
* Number of rules of the set
* Rule1
* Rule2
* ...
*
* A rule of the level i is built as follows (excluding message size rule):
* Rule property
* Set of rules of level i+1
*
* A message size rule is built as follows:
* Message_size Component
*
* Rule properties are (by increasing level):
* - Collective identifier:
* Defined in ompi/mca/coll/base/coll_base_functions.h.
* - Topologic level:
* Defined in coll_han_dynamic.h. It defines the communicator
* topology level. This is GLOBAL_COMMUNICATOR for the user
* communicator and the corresponding level for sub-communicators
* created by han.
* - Configuration size:
* The configuration size is the number of elements of the actual
* topology level in the upper topology level. For example, if
* topology levels are intra-node and inter-node, it can be the
* number of MPI ranks per node or the number of nodes in the global
* communicator. For the GLOBAL_COMMUNICATOR topologic level,
* the configuration size is the communicator size.
* - Message_size Component:
* This is the message size, in bytes, of the message. Component is
* the component identifier to use for this collective on this
* communicator with this message size. Components identifier are
* defined in coll_han_dynamic.h
*
* Here is an example of a dynamic rules file:
* 2 # Collective count
* 7 # Collective identifier 1 (defined in ompi/mca/coll/base/coll_base_functions.h)
* 2 # Topologic level count
* 0 # Topologic level identifier 1
* 1 # Configuration count
* 1 # Configuration size 1
* 2 # Message size rules count
* 0 3 # Message size 1 and component identifier
* 128 1 # Message size 2 and component identifier
* 1 # Topologic level identifier 2
* 1 # Configuration count
* 1 # Configuration size 1
* 1 # Message size rules count
* 0 1 # Message size 1 and component identifier
* 3 # Collective identifier 2
* # Set of topological rules
*
 * Note that configuration size and message size rules define minimal
 * values and each new rule takes precedence over the previous ones. This
 * property implies that these types of rules must be sorted by increasing
 * value. If they are not, some rules won't be used.
*
* The counts define a stack. If the count is set to x, the reader will
* attempt to read x rules of the corresponding type. If a set of rules
* has an invalid count, this is an error and it might not be detected by
* the reader.
*/
BEGIN_C_DECLS
int mca_coll_han_init_dynamic_rules(void);
void mca_coll_han_free_dynamic_rules(void);
void mca_coll_han_dump_dynamic_rules(void);
END_C_DECLS
#endif

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,29 +54,39 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu,
int
mca_coll_han_gather_intra(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm, mca_coll_base_module_t * module)
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t * module)
{
int i, j;
int w_rank, w_size;
int i;
int w_rank, w_size; /* information about the global communicator */
int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
char *reorder_buf = NULL, *reorder_rbuf = NULL;
ptrdiff_t rsize, rgap = 0, rextent;
int *vranks, low_rank, low_size;
int * topo;
ompi_request_t *temp_request = NULL;
w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module];
ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module];
int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm);
int up_size = ompi_comm_size(up_comm);
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
topo = mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle gather with this communicator. It need to fall back on another component\n"));
return han_module->previous_gather(sbuf, scount, sdtype, rbuf,
rcount, rdtype, root,
comm, han_module->previous_gather_module);
}
ompi_request_t *temp_request = NULL;
/* Set up request */
temp_request = OBJ_NEW(ompi_request_t);
OMPI_REQUEST_INIT(temp_request, false);
@ -88,27 +99,44 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
temp_request->req_status._cancelled = 0;
temp_request->req_status._ucount = 0;
int root_low_rank;
int root_up_rank;
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future Gather root %d root_low_rank %d root_up_rank %d\n", w_rank,
root, root_low_rank, root_up_rank));
/* create the subcommunicators */
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module];
ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module];
/* Get the 'virtual ranks' mapping corresponding to the communicators */
vranks = han_module->cached_vranks;
/* information about sub-communicators */
low_rank = ompi_comm_rank(low_comm);
low_size = ompi_comm_size(low_comm);
/* Get root ranks for low and up comms */
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n",
w_rank, root, root_low_rank, root_up_rank));
char *reorder_buf = NULL;
char *reorder_rbuf = NULL;
ptrdiff_t rsize, rgap = 0, rextent;
ompi_datatype_type_extent(rdtype, &rextent);
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
/* Allocate reorder buffers */
if (w_rank == root) {
/* If the processes are mapped-by core, no need to reorder */
/* if the processes are mapped-by core, no need to reorder:
* distribution of ranks on core first and node next,
* in a increasing order for both patterns */
if (han_module->is_mapbycore) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future Gather is_bycore: ", w_rank));
reorder_rbuf = (char *) rbuf;
"[%d]: Han Gather is_bycore: ", w_rank));
reorder_rbuf = (char *)rbuf;
} else {
rsize = opal_datatype_span(&rdtype->super, (int64_t) rcount * w_size, &rgap);
reorder_buf = (char *) malloc(rsize); //TODO:free
/* Need a buffer to store unordered final result */
rsize = opal_datatype_span(&rdtype->super,
(int64_t)rcount * w_size,
&rgap);
reorder_buf = (char *)malloc(rsize); //TODO:free
/* rgap is the size of unused space at the start of the datatype */
reorder_rbuf = reorder_buf - rgap;
}
}
@ -128,27 +156,29 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
/* Reorder rbuf based on rank.
* Suppose, message is 0 1 2 3 4 5 6 7,
* and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7),
* so the message needs to be reordered to 0 2 4 6 1 3 5 7
/* Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are
* mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from
* low gather will be 0 2 4 6 and 1 3 5 7.
* So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered.
* The 3rd element (4) must be recopied at the 4th place. In general, the
* i-th element must be recopied at the place given by the i-th entry of the
* topology, which is topo[i*topolevel +1]
*/
/* reorder rbuf based on rank */
if (w_rank == root && !han_module->is_mapbycore) {
for (i = 0; i < up_size; i++) {
for (j = 0; j < low_size; j++) {
for (i=0; i<w_size; i++) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Future Gather copy from %d %d\n", w_rank,
(i * low_size + j) * 2 + 1, topo[(i * low_size + j) * 2 + 1]));
ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t) rcount,
(char *) rbuf +
rextent * (ptrdiff_t) topo[(i * low_size + j) *
2 +
1] *
(ptrdiff_t) rcount,
reorder_rbuf + rextent * (i * low_size +
j) *
(ptrdiff_t) rcount);
}
"[%d]: Han Gather copy from %d to %d\n",
w_rank,
i * 2 + 1,
topo[i * 2 + 1]));
ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;
ptrdiff_t src_shift = block_size * i;
ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * 2 + 1];
ompi_datatype_copy_content_same_ddt(rdtype,
(ptrdiff_t)rcount,
(char *)rbuf + dest_shift,
reorder_rbuf + src_shift);
}
free(reorder_buf);
}
@ -156,11 +186,11 @@ mca_coll_han_gather_intra(const void *sbuf, int scount,
return OMPI_SUCCESS;
}
/* lg: lower level (shared memory) gather task */
/* Perform a intra node gather and when it ends launch the inter node gather */
int mca_coll_han_gather_lg_task(void *task_argu)
{
mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu;
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Future Gather: lg\n",
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: lg\n",
t->w_rank));
OBJ_RELEASE(t->cur_task);
@ -168,16 +198,29 @@ int mca_coll_han_gather_lg_task(void *task_argu)
char *tmp_buf = NULL;
char *tmp_rbuf = NULL;
if (!t->noop) {
/* if the process is one of the node leader, allocate the intermediary
* buffer to gather on the low sub communicator */
int low_size = ompi_comm_size(t->low_comm);
ptrdiff_t rsize, rgap = 0;
rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap);
rsize = opal_datatype_span(&t->rdtype->super,
(int64_t)t->rcount * low_size,
&rgap);
tmp_buf = (char *) malloc(rsize);
tmp_rbuf = tmp_buf - rgap;
}
/* Shared memory gather */
t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount,
t->rdtype, t->root_low_rank, t->low_comm,
t->low_comm->c_coll->coll_gather_module);
/* shared memory node gather */
t->low_comm->c_coll->coll_gather((char *)t->sbuf,
t->scount,
t->sdtype,
tmp_rbuf,
t->rcount,
t->rdtype,
t->root_low_rank,
t->low_comm,
t->low_comm->c_coll->coll_gather_module);
/* Prepare up comm gather */
t->sbuf = tmp_rbuf;
t->sbuf_inter_free = tmp_buf;
@ -201,24 +244,192 @@ int mca_coll_han_gather_ug_task(void *task_argu)
if (t->noop) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] Future Gather: ug noop\n", t->w_rank));
"[%d] Han Gather: ug noop\n", t->w_rank));
} else {
int low_size = ompi_comm_size(t->low_comm);
/* Inter node gather */
t->up_comm->c_coll->coll_gather((char *) t->sbuf, t->scount * low_size, t->sdtype,
(char *) t->rbuf, t->rcount * low_size, t->rdtype,
t->root_up_rank, t->up_comm,
t->up_comm->c_coll->coll_gather_module);
/* inter node gather */
t->up_comm->c_coll->coll_gather((char *)t->sbuf,
t->scount*low_size,
t->sdtype,
(char *)t->rbuf,
t->rcount*low_size,
t->rdtype,
t->root_up_rank,
t->up_comm,
t->up_comm->c_coll->coll_gather_module);
if (t->sbuf_inter_free != NULL) {
free(t->sbuf_inter_free);
t->sbuf_inter_free = NULL;
}
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d] Future Gather: ug gather finish\n", t->w_rank));
"[%d] Han Gather: ug gather finish\n", t->w_rank));
}
ompi_request_t *temp_req = t->req;
free(t);
ompi_request_complete(temp_req, 1);
return OMPI_SUCCESS;
}
/* Gather for regular situations only: each node must host the same number
 * of processes.  On imbalanced distributions the call falls back on the
 * previously selected gather module.
 *
 * Algorithm: (1) intra-node gather on node leaders, (2) inter-node gather
 * between leaders, (3) on root, reorder the result if the ranks were not
 * mapped by core. */
int
mca_coll_han_gather_intra_simple(const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void *rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    int w_rank = ompi_comm_rank(comm);
    int w_size = ompi_comm_size(comm);
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    int *topo = mca_coll_han_topo_init(comm, han_module, 2);

    /* Here root needs to reach all nodes on up_comm.
     * But in case of unbalance some up_comms are smaller,
     * as the comm_split is made on the base of low_rank */
    if (han_module->are_ppn_imbalanced){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle gather with this communicator. It need to fall back on another component\n"));
        return han_module->previous_gather(sbuf, scount, sdtype, rbuf,
                                           rcount, rdtype, root,
                                           comm, han_module->previous_gather_module);
    }

    /* create the subcommunicators */
    mca_coll_han_comm_create_new(comm, han_module);
    ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE];
    ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE];

    /* Get the 'virtual ranks' mapping corresponding to the communicators */
    int *vranks = han_module->cached_vranks;
    /* information about sub-communicators */
    int low_rank = ompi_comm_rank(low_comm);
    int low_size = ompi_comm_size(low_comm);
    /* Get root ranks for low and up comms */
    int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);

    /* allocate buffer to store unordered result on root
     * if the processes are mapped-by core, no need to reorder:
     * distribution of ranks on core first and node next,
     * in a increasing order for both patterns */
    char *reorder_buf = NULL;        // allocated memory
    char *reorder_buf_start = NULL;  // start of the data
    if (w_rank == root) {
        if (han_module->is_mapbycore) {
            reorder_buf_start = (char *)rbuf;
        } else {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "[%d]: Future Gather needs reordering: ", w_rank));
            ptrdiff_t rgap = 0;
            ptrdiff_t rsize = opal_datatype_span(&rdtype->super,
                                                 (int64_t)rcount * w_size,
                                                 &rgap);
            reorder_buf = (char *)malloc(rsize);
            /* rgap is the size of unused space at the start of the datatype */
            reorder_buf_start = reorder_buf - rgap;
        }
    }

    /* allocate the intermediary buffer
     * to gather on leaders on the low sub communicator */
    char *tmp_buf = NULL;        // allocated memory
    char *tmp_buf_start = NULL;  // start of the data
    if (low_rank == root_low_rank) {
        ptrdiff_t rsize, rgap = 0;
        rsize = opal_datatype_span(&rdtype->super,
                                   (int64_t)rcount * low_size,
                                   &rgap);
        tmp_buf = (char *) malloc(rsize);
        tmp_buf_start = tmp_buf - rgap;
    }

    /* 1. low gather on nodes leaders */
    low_comm->c_coll->coll_gather((char *)sbuf,
                                  scount,
                                  sdtype,
                                  tmp_buf_start,
                                  rcount,
                                  rdtype,
                                  root_low_rank,
                                  low_comm,
                                  low_comm->c_coll->coll_gather_module);

    /* 2. upper gather (inter-node) between node leaders */
    if (low_rank == root_low_rank) {
        up_comm->c_coll->coll_gather((char *)tmp_buf_start,
                                     scount*low_size,
                                     sdtype,
                                     (char *)reorder_buf_start,
                                     rcount*low_size,
                                     rdtype,
                                     root_up_rank,
                                     up_comm,
                                     up_comm->c_coll->coll_gather_module);

        /* the node-local gather result is no longer needed
         * (free(NULL) is a no-op, so no guard is required) */
        free(tmp_buf);
        tmp_buf = NULL;
        tmp_buf_start = NULL;

        /* BUGFIX: the original message referenced t->w_rank although no
         * task structure 't' exists in this function (copy-paste from the
         * task-based variant); use the local w_rank instead */
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] Future Gather: ug gather finish\n", w_rank));
    }

    /* 3. reorder data on root into rbuf
     * if ranks are not mapped in topological order, data needs to be reordered
     * (see reorder_gather)
     */
    if (w_rank == root && !han_module->is_mapbycore) {
        ompi_coll_han_reorder_gather(reorder_buf_start,
                                     rbuf, rcount, rdtype,
                                     comm, topo);
        free(reorder_buf);
    }

    return OMPI_SUCCESS;
}
/* Reorder a gathered buffer when ranks are not mapped in topological order.
 *
 * Example: the expected message is 0 1 2 3 4 5 6 7 but the processes are
 * mapped on 2 nodes as |0 2 4 6| |1 3 5 7|.  The low gathers produce
 * 0 2 4 6 and 1 3 5 7, so the upper gather yields 0 2 4 6 1 3 5 7, which
 * must be reordered: the i-th gathered block is copied to the slot given
 * by the i-th entry of the topology array, i.e. topo[i*topolevel + 1].
 */
void
ompi_coll_han_reorder_gather(const void *sbuf,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             int * topo) {
    const int topolevel = 2;   /* topo always describes 2 levels */
    int rank_idx;
    int w_rank = ompi_comm_rank(comm);
    int w_size = ompi_comm_size(comm);
    ptrdiff_t rextent;

    ompi_datatype_type_extent(rdtype, &rextent);
    /* byte size of one rank's contribution (loop invariant) */
    ptrdiff_t block_size = rextent * (ptrdiff_t)rcount;

    for (rank_idx = 0; rank_idx < w_size; rank_idx++) {
        int dest_rank = topo[rank_idx * topolevel + 1];

        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d]: Future reorder from %d to %d\n",
                             w_rank,
                             rank_idx * topolevel + 1,
                             dest_rank));

        ptrdiff_t src_shift = block_size * rank_idx;
        ptrdiff_t dest_shift = block_size * (ptrdiff_t)dest_rank;
        ompi_datatype_copy_content_same_ddt(rdtype,
                                            (ptrdiff_t)rcount,
                                            (char *)rbuf + dest_shift,
                                            (char *)sbuf + src_shift);
    }
}

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -11,49 +12,44 @@
#include "ompi_config.h"
#include <stdio.h>
#include <string.h>
#ifdef HAVE_SCHED_H
#include <sched.h>
#endif
#include <sys/types.h>
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "mpi.h"
#include "opal_stdint.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/os_path.h"
#include "ompi/communicator/communicator.h"
#include "ompi/group/group.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/proc/proc.h"
#include "coll_han.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include <math.h>
#include <limits.h>
#include "coll_han_dynamic.h"
/*
* Local functions
*/
static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm);
static int han_module_enable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm);
static int mca_coll_han_module_disable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm);
/*
* Module constructor
*/
/*
 * Reset every saved "previous collective" entry and the reproducible
 * fallback pointers of a han module to NULL.  Called from the module
 * constructor and from module disable.
 */
static void han_module_clear(mca_coll_han_module_t *han_module)
{
    int coll;

    for (coll = 0; coll < COLLCOUNT; coll++) {
        /* The previous-routine pointers are stored in a union, so
         * clearing the dummy member resets the whole entry. */
        han_module->previous_routines[coll].previous_routine.dummy = NULL;
        han_module->previous_routines[coll].previous_module = NULL;
    }

    han_module->reproducible_reduce = NULL;
    han_module->reproducible_reduce_module = NULL;
    han_module->reproducible_allreduce = NULL;
    han_module->reproducible_allreduce_module = NULL;
}
static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
{
int i;
module->enabled = false;
module->super.coll_module_disable = mca_coll_han_module_disable;
module->cached_comm = NULL;
@ -62,27 +58,47 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
module->cached_vranks = NULL;
module->cached_topo = NULL;
module->is_mapbycore = false;
module->storage_initialized = false;
for (i = 0 ; i < NB_TOPO_LVL ; i++) {
module->sub_comm[i] = NULL;
}
for (i=SELF ; i<COMPONENTS_COUNT ; i++) {
module->modules_storage.modules[i].module_handler = NULL;
}
module->dynamic_errors = 0;
han_module_clear(module);
}
/* Release an OPAL object reference only when the pointer is non-NULL.
 * Wrapped in do { } while (0) so it behaves as a single statement when
 * used in unbraced if/else bodies.  Note: the pointer itself is NOT
 * reset to NULL by this macro. */
#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \
if (NULL != (obj)) { \
OBJ_RELEASE(obj); \
} \
} while (0)
/*
* Module destructor
*/
static void mca_coll_han_module_destruct(mca_coll_han_module_t * module)
{
int i;
module->enabled = false;
if (module->cached_low_comms != NULL) {
ompi_comm_free(&(module->cached_low_comms[0]));
ompi_comm_free(&(module->cached_low_comms[1]));
module->cached_low_comms[0] = NULL;
module->cached_low_comms[1] = NULL;
for (i = 0; i < COLL_HAN_LOW_MODULES; i++) {
ompi_comm_free(&(module->cached_low_comms[i]));
module->cached_low_comms[i] = NULL;
}
free(module->cached_low_comms);
module->cached_low_comms = NULL;
}
if (module->cached_up_comms != NULL) {
ompi_comm_free(&(module->cached_up_comms[0]));
ompi_comm_free(&(module->cached_up_comms[1]));
module->cached_up_comms[0] = NULL;
module->cached_up_comms[1] = NULL;
for (i = 0; i < COLL_HAN_UP_MODULES; i++) {
ompi_comm_free(&(module->cached_up_comms[i]));
module->cached_up_comms[i] = NULL;
}
free(module->cached_up_comms);
module->cached_up_comms = NULL;
}
@ -94,21 +110,27 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module)
free(module->cached_topo);
module->cached_topo = NULL;
}
}
for(i=0 ; i<NB_TOPO_LVL ; i++) {
if(NULL != module->sub_comm[i]) {
ompi_comm_free(&(module->sub_comm[i]));
}
}
/*
* Module disable
*/
static int mca_coll_han_module_disable(mca_coll_base_module_t * module,
struct ompi_communicator_t *comm)
{
return OMPI_SUCCESS;
OBJ_RELEASE_IF_NOT_NULL(module->previous_allgather_module);
OBJ_RELEASE_IF_NOT_NULL(module->previous_allreduce_module);
OBJ_RELEASE_IF_NOT_NULL(module->previous_bcast_module);
OBJ_RELEASE_IF_NOT_NULL(module->previous_gather_module);
OBJ_RELEASE_IF_NOT_NULL(module->previous_reduce_module);
OBJ_RELEASE_IF_NOT_NULL(module->previous_scatter_module);
han_module_clear(module);
}
OBJ_CLASS_INSTANCE(mca_coll_han_module_t,
mca_coll_base_module_t,
mca_coll_han_module_construct, mca_coll_han_module_destruct);
mca_coll_han_module_construct,
mca_coll_han_module_destruct);
/*
* Initial query function that is invoked during MPI_INIT, allowing
@ -116,7 +138,8 @@ OBJ_CLASS_INSTANCE(mca_coll_han_module_t,
* required level of thread support. This function is invoked exactly
* once.
*/
int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads)
int mca_coll_han_init_query(bool enable_progress_threads,
bool enable_mpi_threads)
{
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:han:init_query: pick me! pick me!");
@ -129,16 +152,23 @@ int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_thread
* Look at the communicator and decide which set of functions and
* priority we want to return.
*/
mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
mca_coll_base_module_t *
mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
{
mca_coll_han_module_t *han_module;
/* If we're intercomm, or if there's only one process in the
communicator */
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm)
|| !ompi_group_have_remote_peers(comm->c_local_group)) {
/*
* If we're intercomm, or if there's only one process in the communicator
*/
if (OMPI_COMM_IS_INTER(comm)) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:han:comm_query (%d/%s): intercomm, comm is too small, only on one node; disqualifying myself",
"coll:han:comm_query (%d/%s): intercomm; disqualifying myself",
comm->c_contextid, comm->c_name);
return NULL;
}
if (1 == ompi_comm_size(comm)) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:han:comm_query (%d/%s): comm is too small; disqualifying myself",
comm->c_contextid, comm->c_name);
return NULL;
}
@ -159,24 +189,53 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com
}
/* All is good -- return a module */
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_allgather = NULL; //mca_coll_han_allgather_intra;
han_module->super.coll_allgatherv = NULL;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_bcast = mca_coll_han_bcast_intra;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gather = mca_coll_han_gather_intra;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce = mca_coll_han_reduce_intra;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra;
han_module->super.coll_scatterv = NULL;
han_module->topologic_level = mca_coll_han_component.topo_level;
/*
* TODO: When the selector is fully implemented,
* this if will be meaningless
*/
if (GLOBAL_COMMUNICATOR == han_module->topologic_level) {
/* We are on the global communicator, return topological algorithms */
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;
han_module->super.coll_allgatherv = NULL;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
han_module->super.coll_scatterv = NULL;
} else {
/* We are on a topologic sub-communicator, return only the selector */
han_module->super.coll_module_enable = han_module_enable;
han_module->super.ft_event = NULL;
han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic;
han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic;
han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic;
han_module->super.coll_alltoall = NULL;
han_module->super.coll_alltoallv = NULL;
han_module->super.coll_alltoallw = NULL;
han_module->super.coll_barrier = NULL;
han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic;
han_module->super.coll_exscan = NULL;
han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic;
han_module->super.coll_gatherv = NULL;
han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic;
han_module->super.coll_reduce_scatter = NULL;
han_module->super.coll_scan = NULL;
han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic;
han_module->super.coll_scatterv = NULL;
}
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:han:comm_query (%d/%s): pick me! pick me!",
@ -185,14 +244,71 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com
}
/*
 * Save the collective function pointer currently installed on the
 * communicator (and its owning module) into the han module, so han can
 * later fall back on it.
 *
 * In this macro, the following variables are supposed to have been declared
 * in the caller:
 * . ompi_communicator_t *comm
 * . mca_coll_han_module_t *han_module
 *
 * NOTE(review): the macro contains a hidden "return OMPI_ERROR" when the
 * underlying collective is missing, so it must only be used inside
 * functions returning an int error code.  It also OBJ_RETAINs the saved
 * module; the matching release happens at module disable.
 */
#define HAN_SAVE_PREV_COLL_API(__api) do { \
han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \
han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\
if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \
opal_output_verbose(1, ompi_coll_base_framework.framework_output, \
"(%d/%s): no underlying " # __api"; disqualifying myself", \
comm->c_contextid, comm->c_name); \
return OMPI_ERROR; \
} \
/* TODO add a OBJ_RELEASE at module disabling */ \
/* + FIXME find why releasing generates memory corruption */ \
OBJ_RETAIN(han_module->previous_ ## __api ## _module); \
} while(0)
/*
* Init module on the communicator
*/
static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm)
/*
 * Enable the HAN module on a communicator: snapshot the currently-selected
 * implementation of every collective HAN may need to fall back to, then
 * pre-compute the reproducible-reduction fallbacks.
 *
 * @param module (IN) this HAN module (a mca_coll_han_module_t in disguise)
 * @param comm   (IN) communicator the module is being enabled on
 * @return OMPI_SUCCESS, or OMPI_ERROR (via HAN_SAVE_PREV_COLL_API) when a
 *         required underlying collective is missing.
 */
static int han_module_enable(mca_coll_base_module_t * module,
                             struct ompi_communicator_t *comm)
{
    mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module;

    /* Each macro call OBJ_RETAINs the previous module and may `return
     * OMPI_ERROR` if that collective has no underlying implementation. */
    HAN_SAVE_PREV_COLL_API(allgather);
    HAN_SAVE_PREV_COLL_API(allgatherv);
    HAN_SAVE_PREV_COLL_API(allreduce);
    HAN_SAVE_PREV_COLL_API(bcast);
    HAN_SAVE_PREV_COLL_API(gather);
    HAN_SAVE_PREV_COLL_API(reduce);
    HAN_SAVE_PREV_COLL_API(scatter);

    /* set reproducible algos */
    mca_coll_han_reduce_reproducible_decision(comm, module);
    mca_coll_han_allreduce_reproducible_decision(comm, module);

    return OMPI_SUCCESS;
}
/*
* Module disable
*/
/*
 * Disable the HAN module on a communicator: release the references taken by
 * HAN_SAVE_PREV_COLL_API in han_module_enable() (one OBJ_RETAIN per saved
 * module) and reset the module bookkeeping.
 *
 * @param module (IN) this HAN module
 * @param comm   (IN) communicator (unused here beyond the interface contract)
 * @return OMPI_SUCCESS always
 */
static int mca_coll_han_module_disable(mca_coll_base_module_t * module,
                                       struct ompi_communicator_t *comm)
{
    mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module;

    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module);
    OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module);

    /* Forget cached sub-communicators / topology so a re-enable starts clean */
    han_module_clear(han_module);

    return OMPI_SUCCESS;
}
/*
* Free the han request
*/
@ -203,266 +319,3 @@ int han_request_free(ompi_request_t ** request)
*request = MPI_REQUEST_NULL;
return OMPI_SUCCESS;
}
/* Create the communicators used in the HAN module */
/*
 * Build (or reuse from cache) the two pairs of hierarchical sub-communicators
 * used by HAN:
 *   low_comms[0] (sm) / low_comms[1] (solo)   — ranks sharing a node
 *   up_comms[0] (libnbc) / up_comms[1] (adapt) — one rank per node
 * plus the `vranks` table mapping world ranks to "virtual ranks".
 *
 * The targeted component of each sub-communicator is selected by temporarily
 * raising that component's MCA priority around the comm_split call, and
 * HAN's own priority is temporarily dropped to 0 so HAN does not select
 * itself on the sub-communicators. The save/restore ordering of these MCA
 * variables is therefore significant — do not reorder these statements.
 */
void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module)
{
    /* Use cached communicators if possible */
    if (han_module->cached_comm == comm && han_module->cached_low_comms != NULL
        && han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) {
        return;
    }
    /* Create communicators if there is no cached communicator */
    else {
        int low_rank, low_size;
        int up_rank;
        int w_rank = ompi_comm_rank(comm);
        int w_size = ompi_comm_size(comm);
        /* NOTE(review): malloc results are not checked here — an OOM would
         * crash on first dereference; confirm whether that is acceptable. */
        ompi_communicator_t **low_comms =
            (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2);
        ompi_communicator_t **up_comms =
            (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2);

        /* Create low_comms which contain all the process on a node */
        const int *origin_priority = NULL;
        /* Lower the priority of HAN module */
        int han_var_id;
        int tmp_han_priority = 0;
        int tmp_han_origin = 0;
        mca_base_var_find_by_name("coll_han_priority", &han_var_id);
        mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL);
        tmp_han_origin = *origin_priority;
        mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
        mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET,
                               NULL);

        /* Use plain base algorithms while the HAN sub-communicators do not
         * exist yet (comm_split internally needs allreduce/allgather) */
        comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling;
        comm->c_coll->coll_allgather = ompi_coll_base_allgather_intra_bruck;

        int var_id;
        int tmp_priority = 100;
        int tmp_origin = 0;

        /* Set up low_comms[0] with sm module */
        mca_base_var_find_by_name("coll_sm_priority", &var_id);
        mca_base_var_get_value(var_id, &origin_priority, NULL, NULL);
        tmp_origin = *origin_priority;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] sm_priority origin %d %d\n", w_rank, *origin_priority,
                             tmp_origin));
        mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
        mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null),
                             &(low_comms[0]));
        mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        low_size = ompi_comm_size(low_comms[0]);
        low_rank = ompi_comm_rank(low_comms[0]);

        /* Set up low_comms[1] with solo module */
        mca_base_var_find_by_name("coll_solo_priority", &var_id);
        mca_base_var_get_value(var_id, &origin_priority, NULL, NULL);
        tmp_origin = *origin_priority;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] solo_priority origin %d %d\n", w_rank, *origin_priority,
                             tmp_origin));
        mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
        mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null),
                             &(low_comms[1]));
        mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);

        /* Create up_comms[0] with libnbc which contain one process per node (across nodes) */
        mca_base_var_find_by_name("coll_libnbc_priority", &var_id);
        mca_base_var_get_value(var_id, &origin_priority, NULL, NULL);
        tmp_origin = *origin_priority;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] libnbc_priority origin %d %d\n", w_rank, *origin_priority,
                             tmp_origin));
        mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
        mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        ompi_comm_split(comm, low_rank, w_rank, &(up_comms[0]), false);
        mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        up_rank = ompi_comm_rank(up_comms[0]);

        /* Create up_comms[1] with adapt which contain one process per node (across nodes) */
        mca_base_var_find_by_name("coll_adapt_priority", &var_id);
        mca_base_var_get_value(var_id, &origin_priority, NULL, NULL);
        tmp_origin = *origin_priority;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "[%d] adapt_priority origin %d %d\n", w_rank, *origin_priority,
                             tmp_origin));
        mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
        mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);
        ompi_comm_split(comm, low_rank, w_rank, &(up_comms[1]), false);
        mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL);

        int *vranks = malloc(sizeof(int) * w_size);
        /* Do allgather to gather vrank from each process so every process knows other processes' vrank */
        /* NOTE(review): vrank = low_size * up_rank + low_rank is only correct
         * when every node hosts the same number of ranks — confirm callers
         * guard against imbalanced ppn before relying on vranks. */
        int vrank = low_size * up_rank + low_rank;
        ompi_coll_base_allgather_intra_bruck(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm,
                                             comm->c_coll->coll_allgather_module);

        han_module->cached_comm = comm;
        han_module->cached_low_comms = low_comms;
        han_module->cached_up_comms = up_comms;
        han_module->cached_vranks = vranks;

        /* Restore HAN's original priority, then install HAN's own
         * allreduce/allgather on the parent communicator */
        mca_base_var_set_value(han_var_id, &tmp_han_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET,
                               NULL);
        comm->c_coll->coll_allreduce = mca_coll_han_allreduce_intra;
        comm->c_coll->coll_allgather = mca_coll_han_allgather_intra;
    }
}
/*
 * Integer power of ten: returns 10^pow_value.
 * No overflow protection — caller must keep pow_value small (int range).
 */
int mca_coll_han_pow10_int(int pow_value)
{
    int result = 1;
    int remaining = pow_value;

    while (remaining > 0) {
        result *= 10;
        remaining--;
    }
    return result;
}
/*
 * Collapse all decimal digits appearing in a hostname into a single integer,
 * e.g. "node123" -> 123, "n1-rack2" -> 12, "abc" -> 0.
 *
 * Improvements over the previous version:
 *  - the scan is bounded by `size`, so a buffer that is not NUL-terminated
 *    within `size` bytes can no longer be overrun;
 *  - the digits are accumulated directly (number = number * 10 + digit),
 *    removing the unchecked malloc of a temporary array and the per-digit
 *    pow10 recomputation while producing the same value.
 *
 * @param hostname host name buffer (scanned up to `size` chars or NUL)
 * @param size     capacity of the hostname buffer
 * @return the concatenated digits as a decimal int (0 if none; may overflow
 *         for pathologically digit-rich names, as before)
 */
int mca_coll_han_hostname_to_number(char *hostname, int size)
{
    int i;
    int number = 0;

    for (i = 0; i < size && hostname[i] != '\0'; i++) {
        if (hostname[i] >= '0' && hostname[i] <= '9') {
            number = number * 10 + (hostname[i] - '0');
        }
    }
    return number;
}
/*
 * Gather one (node id, rank) pair per process into the `topo` array.
 * Level 0 is an integer derived from the hostname digits (stands in for the
 * daemon vpid); level 1 is the process rank in `comm`.
 */
void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level)
{
    char hostname[1024];
    int *my_topo = (int *) malloc(sizeof(int) * num_topo_level);

    /* Level 0: node identifier built from the digits of the hostname */
    gethostname(hostname, 1024);
    my_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024);
    /* Level 1: this process' rank */
    my_topo[1] = ompi_comm_rank(comm);

    /* Exchange everybody's pair */
    ompi_coll_base_allgather_intra_bruck(my_topo, num_topo_level, MPI_INT,
                                         topo, num_topo_level, MPI_INT,
                                         comm, comm->c_coll->coll_allgather_module);
    free(my_topo);
}
/*
 * Hierarchically sort the topology table in-place so that every topological
 * group becomes contiguous.
 *
 * `topo` holds rows of `num_topo_level` ints; rows [start..end] are first
 * ordered by column `level` (selection sort on whole rows), then each
 * maximal run of rows sharing the same value at `level` is recursively
 * ordered by the next level — i.e. a lexicographic row sort.
 *
 * Bug fix over the previous version: when the last row opened a new group
 * at the current level, the old `i == end` branch recursed over
 * [new_start, end] and thus sorted two *different* groups together at
 * level+1, which could destroy the ordering just established at `level`
 * (e.g. rows {1,9},{2,1} were swapped by the level-1 recursion). Groups are
 * now delimited first, and each one is recursed on separately.
 *
 * @param topo           row-major table of size * num_topo_level ints
 * @param start, end     inclusive row range to sort
 * @param size           total number of rows (kept for interface parity)
 * @param level          column to sort on at this recursion depth
 * @param num_topo_level number of columns per row
 */
void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level)
{
    int i, j;
    int group_start;

    if (level > num_topo_level - 1 || start >= end) {
        return;
    }

    /* Selection sort of rows [start..end] on column `level` */
    for (i = start; i <= end; i++) {
        int min_loc = i;
        for (j = i + 1; j <= end; j++) {
            if (topo[j * num_topo_level + level] < topo[min_loc * num_topo_level + level]) {
                min_loc = j;
            }
        }
        if (min_loc != i) {
            /* Swap entire rows i and min_loc */
            for (j = 0; j < num_topo_level; j++) {
                int temp = topo[i * num_topo_level + j];
                topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j];
                topo[min_loc * num_topo_level + j] = temp;
            }
        }
    }

    /* Recurse separately on each maximal run of equal values at `level`.
     * A single-row run terminates immediately (start >= end above). */
    group_start = start;
    for (i = start + 1; i <= end; i++) {
        if (topo[i * num_topo_level + level] != topo[(i - 1) * num_topo_level + level]) {
            mca_coll_han_topo_sort(topo, group_start, i - 1, size, level + 1, num_topo_level);
            group_start = i;
        }
    }
    mca_coll_han_topo_sort(topo, group_start, end, size, level + 1, num_topo_level);
}
/* Check if the current processes are mapped by core */
/*
 * Return true when consecutive ranks have non-decreasing (node id, rank)
 * pairs in the topology table, i.e. the job was launched map-by-core and no
 * reordering of the table is needed.
 */
bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t * comm,
                                    int num_topo_level)
{
    int rank;
    int size = ompi_comm_size(comm);

    for (rank = 1; rank < size; rank++) {
        const int *prev = topo + (rank - 1) * num_topo_level;
        const int *cur  = topo + rank * num_topo_level;
        if (prev[0] > cur[0] || prev[1] > cur[1]) {
            return false;
        }
    }
    return true;
}
/*
 * Return the topology table for `comm`, building and caching it on first
 * use. When the ranks are not mapped by core, the table is sorted so each
 * node's rows are contiguous and `han_module->is_mapbycore` is set to false.
 *
 * Ownership: the returned pointer is cached in the module; callers must NOT
 * free it.
 */
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module,
                            int num_topo_level)
{
    int size;
    size = ompi_comm_size(comm);
    int *topo;
    if ((han_module->cached_topo != NULL) && (han_module->cached_comm == comm)) {
        /* Cache hit: reuse the table built earlier for this communicator */
        topo = han_module->cached_topo;
    }
    else {
        /* Cache miss (or cache belongs to another communicator): rebuild */
        if (han_module->cached_topo != NULL) {
            free(han_module->cached_topo);
            han_module->cached_topo = NULL;
        }
        /* NOTE(review): malloc result is unchecked — OOM would crash in
         * mca_coll_han_topo_get; confirm that is acceptable here. */
        topo = (int *) malloc(sizeof(int) * size * num_topo_level);
        /* Get topo information */
        mca_coll_han_topo_get(topo, comm, num_topo_level);
        mca_coll_han_topo_print(topo, comm, num_topo_level);

        /* Check if the processes are mapped by core */
        han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level);

        /* Sort the topo such that each group is contiguous */
        if (!han_module->is_mapbycore) {
            mca_coll_han_topo_sort(topo, 0, size - 1, size, 0, num_topo_level);
        }
        han_module->cached_topo = topo;
        han_module->cached_comm = comm;
    }

    /* NOTE(review): on the rebuild path the table was already printed above
     * (pre-sort); this second print shows the possibly-sorted table. */
    mca_coll_han_topo_print(topo, comm, num_topo_level);
    return topo;
}
/* Print out the topology info, for debugging purpose */
/* Dump the topology table (rank 0 only), for debugging purposes. Output is
 * gated at verbosity level 30 on the HAN output stream. */
void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level)
{
    int i;
    int rank = ompi_comm_rank(comm);
    int size = ompi_comm_size(comm);

    if (rank != 0) {
        return;
    }
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: HAN topo: ", rank));
    for (i = 0; i < size * num_topo_level; i++) {
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i]));
    }
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n"));
}

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -69,8 +70,24 @@ mca_coll_han_reduce_intra(const void *sbuf,
size_t typelng;
ompi_datatype_type_size(dtype, &typelng);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
/* Do not initialize topology if the operation cannot commute */
if(!ompi_op_is_commute(op)){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this operation. It needs to fall back on another component\n"));
goto prev_reduce_intra;
}
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle reduce with this communicator. It needs to fall back on another component\n"));
goto prev_reduce_intra;
}
/* Create the subcommunicators */
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm;
ompi_communicator_t *up_comm;
@ -133,6 +150,11 @@ mca_coll_han_reduce_intra(const void *sbuf,
free(t);
return OMPI_SUCCESS;
prev_reduce_intra:
return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
comm,
han_module->previous_reduce_module);
}
/* t0 task: issue and wait for the low level reduce of segment 0 */
@ -189,4 +211,178 @@ int mca_coll_han_reduce_t1_task(void *task_argu) {
}
return OMPI_SUCCESS;
}
}
/* Two-level hierarchical reduce: a reduce inside each node (low_comm) whose
 * result is then reduced across nodes (up_comm) toward the global root.
 * In case of non regular situation (imbalanced number of processes per node)
 * or a non-commutative operation, a fallback is made on the next component
 * that provides a reduce in priority order. */
int
mca_coll_han_reduce_intra_simple(const void *sbuf,
                                 void* rbuf,
                                 int count,
                                 struct ompi_datatype_t *dtype,
                                 ompi_op_t *op,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    int w_rank; /* information about the global communicator */
    int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */
    int ret;
    int *vranks, low_rank, low_size;
    ptrdiff_t rsize, rgap = 0;
    void * tmp_buf;
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

    /* Do not initialize topology if the operation cannot commute */
    if(!ompi_op_is_commute(op)){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this operation. It needs to fall back on another component\n"));
        goto prev_reduce_intra_simple;
    }

    /* Topo must be initialized to know rank distribution which then is used to
     * determine if han can be used */
    mca_coll_han_topo_init(comm, han_module, 2);
    if (han_module->are_ppn_imbalanced){
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "han cannot handle reduce with this communicator. It needs to fall back on another component\n"));
        goto prev_reduce_intra_simple;
    }

    /* Make sure the hierarchical sub-communicators exist */
    mca_coll_han_comm_create(comm, han_module);
    ompi_communicator_t *low_comm =
        han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module];
    ompi_communicator_t *up_comm =
        han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module];

    /* Get the 'virtual ranks' mapping corresponding to the communicators */
    vranks = han_module->cached_vranks;
    w_rank = ompi_comm_rank(comm);
    low_rank = ompi_comm_rank(low_comm);
    low_size = ompi_comm_size(low_comm);
    /* Get root ranks for low and up comms */
    mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);

    /* Node leaders that are not the global root need a scratch buffer to hold
     * the intra-node result before forwarding it on up_comm. */
    if (root_low_rank == low_rank && w_rank != root) {
        rsize = opal_datatype_span(&dtype->super, (int64_t)count, &rgap);
        /* NOTE(review): rgap is computed but not applied to the allocation
         * base pointer — confirm this is correct for datatypes with a
         * non-zero lower bound. */
        tmp_buf = malloc(rsize);
        if (NULL == tmp_buf) {
            return OMPI_ERROR;
        }
    } else {
        /* global root rbuf is valid, local non-root do not need buffers */
        tmp_buf = rbuf;
    }
    /* No need to handle MPI_IN_PLACE: only the global root may ask for it and
     * it is ok to use it for intermediary reduces since it is also a local root*/

    /* Low_comm reduce */
    ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)tmp_buf,
                                        count, dtype, op, root_low_rank,
                                        low_comm, low_comm->c_coll->coll_reduce_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){
        /* Do not leak the scratch buffer on the fallback path */
        if (root_low_rank == low_rank && w_rank != root){
            free(tmp_buf);
        }
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                             "HAN/REDUCE: low comm reduce failed. "
                             "Falling back to another component\n"));
        goto prev_reduce_intra_simple;
    }

    /* Up_comm reduce: only the node leaders participate */
    if (root_low_rank == low_rank ){
        if(w_rank != root){
            /* Non-root leader: contribute the intra-node result, discard output */
            ret = up_comm->c_coll->coll_reduce((char *)tmp_buf, NULL,
                                               count, dtype, op, root_up_rank,
                                               up_comm, up_comm->c_coll->coll_reduce_module);
            free(tmp_buf);
        } else {
            /* Take advantage of any optimisation made for IN_PLACE
             * communications */
            ret = up_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)tmp_buf,
                                               count, dtype, op, root_up_rank,
                                               up_comm, up_comm->c_coll->coll_reduce_module);
        }
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                                 "HAN/REDUCE: low comm reduce failed.\n"));
            return ret;
        }
    }

    return OMPI_SUCCESS;

 prev_reduce_intra_simple:
    /* Delegate to whatever reduce was selected before HAN took over */
    return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root,
                                       comm,
                                       han_module->previous_reduce_module);
}
/* Find a fallback on reproducible algorithm
* use tuned or basic or if impossible whatever available
*/
/*
 * Pick the reduce implementation HAN will use when reproducibility is
 * requested: prefer tuned, then basic; when neither provides a reduce for
 * this communicator, fall back to whatever reduce was previously selected.
 * The choice is cached in han_module->reproducible_reduce{,_module}.
 */
int
mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int idx;
    int w_rank = ompi_comm_rank(comm);
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    /* Candidate components known to give reproducible reductions,
     * in preference order */
    int candidates[] = {TUNED, BASIC};
    int n_candidates = (int)(sizeof(candidates) / sizeof(*candidates));

    /* populate previous modules_storage*/
    mca_coll_han_get_all_coll_modules(comm, han_module);

    for (idx = 0; idx < n_candidates; idx++) {
        int comp = candidates[idx];
        mca_coll_base_module_t *mod =
            han_module->modules_storage.modules[comp].module_handler;
        if (NULL == mod || NULL == mod->coll_reduce) {
            continue;
        }
        if (0 == w_rank) {
            opal_output_verbose(30, mca_coll_han_component.han_output,
                                "coll:han:reduce_reproducible: "
                                "fallback on %s\n",
                                components_name[comp]);
        }
        han_module->reproducible_reduce_module = mod;
        han_module->reproducible_reduce = mod->coll_reduce;
        return OMPI_SUCCESS;
    }

    /* No reproducible candidate available: keep the previous reduce */
    if (0 == w_rank) {
        opal_output_verbose(5, mca_coll_han_component.han_output,
                            "coll:han:reduce_reproducible_decision: "
                            "no reproducible fallback\n");
    }
    han_module->reproducible_reduce_module = han_module->previous_reduce_module;
    han_module->reproducible_reduce = han_module->previous_reduce;
    return OMPI_SUCCESS;
}
/* Fallback on reproducible algorithm */
/* Thin trampoline: forward the call to the reduce implementation selected by
 * mca_coll_han_reduce_reproducible_decision(). */
int
mca_coll_han_reduce_reproducible(const void *sbuf,
                                 void *rbuf,
                                 int count,
                                 struct ompi_datatype_t *dtype,
                                 struct ompi_op_t *op,
                                 int root,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
    mca_coll_base_module_t *target = han_module->reproducible_reduce_module;

    return han_module->reproducible_reduce(sbuf, rbuf, count, dtype, op,
                                           root, comm, target);
}

Просмотреть файл

@ -66,13 +66,23 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount,
w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm);
/* Create the subcommunicators */
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module;
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
/* Topo must be initialized to know rank distribution which then is used to
* determine if han can be used */
mca_coll_han_topo_init(comm, han_module, 2);
if (han_module->are_ppn_imbalanced){
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle scatter with this communicator. It needs to fall back on another component\n"));
goto prev_scatter_intra;
}
/* Create the subcommunicators */
mca_coll_han_comm_create(comm, han_module);
ompi_communicator_t *low_comm =
han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module];
han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module];
ompi_communicator_t *up_comm =
han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module];
han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module];
int *vranks = han_module->cached_vranks;
int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm);
@ -93,6 +103,8 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount,
int root_low_rank;
int root_up_rank;
mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank);
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank,
@ -105,7 +117,6 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount,
*/
char *reorder_buf = NULL;
char *reorder_sbuf = NULL;
int *topo = mca_coll_han_topo_init(comm, han_module, 2);
if (w_rank == root) {
/* If the processes are mapped-by core, no need to reorder */
@ -154,6 +165,11 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount,
ompi_request_wait(&temp_request, MPI_STATUS_IGNORE);
return OMPI_SUCCESS;
prev_scatter_intra:
return han_module->previous_scatter(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm,
han_module->previous_scatter_module);
}
/* us: upper level (intra-node) scatter task */

490
ompi/mca/coll/han/coll_han_subcomms.c Обычный файл
Просмотреть файл

@ -0,0 +1,490 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remember that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/
#include "ompi_config.h"
#include "mpi.h"
#include "coll_han.h"
#include "coll_han_dynamic.h"
/*
* Local functions
*/
static void create_intranode_comm_new(ompi_communicator_t *,
ompi_communicator_t **);
static void create_internode_comm_new(ompi_communicator_t *,
int, int,
ompi_communicator_t **);
static void create_intranode_comm(ompi_communicator_t *,
const char *,
int,
ompi_communicator_t **);
static void create_internode_comm(ompi_communicator_t *,
const char *,
int, int,
ompi_communicator_t **);
/**
* Create a sub-communicator containing the ranks that share my node.
*
* @param comm (IN) original communicator for the collective
* target module priority
* @param sub_comm (OUT) created sub-communicator
*/
/* Split `comm` by shared-memory domain so that the ranks sharing this node
 * end up together in *sub_comm. */
static void create_intranode_comm_new(ompi_communicator_t *comm,
                                      ompi_communicator_t **sub_comm)
{
    ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                         (opal_info_t *)(&ompi_mpi_info_null), sub_comm);
}
/**
* Create a sub-communicator containing one rank per node.
*
* @param comm (IN) original communicator for the collective
* @param my_rank (IN) my rank in comm
* @param intra_rank (IN) local rank in the intra-node sub-communicator
* @param sub_comm (OUT) created sub-communicator
*/
/* Build the inter-node sub-communicator: ranks sharing the same intra-node
 * rank (used as the split color) are grouped together, keyed by the global
 * rank so the original ordering is preserved. */
static void create_internode_comm_new(ompi_communicator_t *comm,
                                      int my_rank,
                                      int intra_rank,
                                      ompi_communicator_t **sub_comm)
{
    ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false);
}
/*
* Routine that creates the local hierarchical sub-communicators
* Called each time a collective is called.
* comm: input communicator of the collective
*/
/*
 * Create (once) the INTRA_NODE / INTER_NODE sub-communicators and the vranks
 * table for `comm`. HAN's own collectives cannot run before those exist, so
 * the parent communicator's allreduce/allgather (and the reduce/bcast/gather
 * they may internally use) are temporarily swapped for the previously
 * selected implementations and restored afterwards — the save/swap/restore
 * ordering below is significant.
 */
void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
                                  mca_coll_han_module_t *han_module)
{
    int low_rank, low_size;
    int up_rank;
    int w_rank;
    int w_size;
    ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]);
    ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]);
    const int *origin_priority;
    int han_var_id;
    int tmp_han_priority;
    int vrank, *vranks;

    /* Saved entry points while we temporarily swap in the previous modules */
    mca_coll_base_module_allreduce_fn_t old_allreduce;
    mca_coll_base_module_t *old_allreduce_module;
    mca_coll_base_module_allgather_fn_t old_allgather;
    mca_coll_base_module_t *old_allgather_module;
    mca_coll_base_module_bcast_fn_t old_bcast;
    mca_coll_base_module_t *old_bcast_module;
    mca_coll_base_module_gather_fn_t old_gather;
    mca_coll_base_module_t *old_gather_module;
    mca_coll_base_module_reduce_fn_t old_reduce;
    mca_coll_base_module_t *old_reduce_module;

    /* The sub communicators have already been created */
    if (NULL != han_module->sub_comm[INTRA_NODE]
        && NULL != han_module->sub_comm[INTER_NODE]
        && NULL != han_module->cached_vranks) {
        return;
    }

    /*
     * We cannot use han allreduce and allgather without sub-communicators
     * Temporary set previous ones
     *
     * Allgather is used to compute vranks
     * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new
     * Reduce + Bcast may be called by the allreduce implementation
     * Gather + Bcast may be called by the allgather implementation
     */
    old_allreduce = comm->c_coll->coll_allreduce;
    old_allreduce_module = comm->c_coll->coll_allreduce_module;
    old_allgather = comm->c_coll->coll_allgather;
    old_allgather_module = comm->c_coll->coll_allgather_module;
    old_reduce = comm->c_coll->coll_reduce;
    old_reduce_module = comm->c_coll->coll_reduce_module;
    old_bcast = comm->c_coll->coll_bcast;
    old_bcast_module = comm->c_coll->coll_bcast_module;
    old_gather = comm->c_coll->coll_gather;
    old_gather_module = comm->c_coll->coll_gather_module;

    comm->c_coll->coll_allreduce = han_module->previous_allreduce;
    comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module;
    comm->c_coll->coll_allgather = han_module->previous_allgather;
    comm->c_coll->coll_allgather_module = han_module->previous_allgather_module;
    comm->c_coll->coll_reduce = han_module->previous_reduce;
    comm->c_coll->coll_reduce_module = han_module->previous_reduce_module;
    comm->c_coll->coll_bcast = han_module->previous_bcast;
    comm->c_coll->coll_bcast_module = han_module->previous_bcast_module;
    comm->c_coll->coll_gather = han_module->previous_gather;
    comm->c_coll->coll_gather_module = han_module->previous_gather_module;

    /* Create topological sub-communicators */
    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);

    origin_priority = NULL;
    mca_base_var_find_by_name("coll_han_priority", &han_var_id);
    mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL);

    /*
     * Maximum priority for selector on sub-communicators
     */
    tmp_han_priority = 100;
    mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
    mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);

    /*
     * This sub-communicator contains the ranks that share my node.
     */
    mca_coll_han_component.topo_level = INTRA_NODE;
    create_intranode_comm_new(comm, low_comm);

    /*
     * Get my local rank and the local size
     */
    low_size = ompi_comm_size(*low_comm);
    low_rank = ompi_comm_rank(*low_comm);

    /*
     * This sub-communicator contains one process per node: processes with the
     * same intra-node rank id share such a sub-communicator
     */
    mca_coll_han_component.topo_level = INTER_NODE;
    create_internode_comm_new(comm, w_rank, low_rank, up_comm);

    up_rank = ompi_comm_rank(*up_comm);

    /*
     * Set my virtual rank number.
     * my rank # = <intra-node comm size> * <inter-node rank number>
     *             + <intra-node rank number>
     * WARNING: this formula works only if the ranks are perfectly spread over
     * the nodes
     * TODO: find a better way of doing
     */
    vrank = low_size * up_rank + low_rank;
    /* NOTE(review): malloc result unchecked — the allgather below would
     * write through NULL on OOM; confirm this is acceptable. */
    vranks = (int *)malloc(sizeof(int) * w_size);

    /*
     * gather vrank from each process so every process will know other processes
     * vrank
     */
    comm->c_coll->coll_allgather(&vrank,
                                 1,
                                 MPI_INT,
                                 vranks,
                                 1,
                                 MPI_INT,
                                 comm,
                                 comm->c_coll->coll_allgather_module);

    /*
     * Set the cached info
     */
    han_module->cached_vranks = vranks;

    /*
     * Come back to the original han module priority
     */
    mca_base_var_set_value(han_var_id, origin_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);

    /* Put allreduce, allgather, reduce and bcast back */
    comm->c_coll->coll_allreduce = old_allreduce;
    comm->c_coll->coll_allreduce_module = old_allreduce_module;
    comm->c_coll->coll_allgather = old_allgather;
    comm->c_coll->coll_allgather_module = old_allgather_module;
    comm->c_coll->coll_reduce = old_reduce;
    comm->c_coll->coll_reduce_module = old_reduce_module;
    comm->c_coll->coll_bcast = old_bcast;
    comm->c_coll->coll_bcast_module = old_bcast_module;
    comm->c_coll->coll_gather = old_gather;
    comm->c_coll->coll_gather_module = old_gather_module;

    mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR;
}
/**
* Create a sub-communicator containing the ranks that share my node.
* Associate this sub-communicator a given collective module.
* module can be one of:
* . sm
* . shared
*
* @param comm (IN) original communicator for the collective
* @param prio_string (IN) string containing the mca variable associated to
* target module priority
* @param my_rank (IN) my rank in comm
* @param sub_comm (OUT) created sub-communicator
*/
/*
 * Split `comm` into per-node sub-communicators while forcing the component
 * named by `prio_string` (e.g. "coll_sm_priority") to win module selection
 * on the resulting sub-communicator, by temporarily raising that component's
 * MCA priority around the split. The prior priority is restored afterwards;
 * this mutates global MCA state, so the set/split/restore order matters.
 */
static void create_intranode_comm(ompi_communicator_t *comm,
                                  const char *prio_string,
                                  int my_rank,
                                  ompi_communicator_t **sub_comm)
{
    int var_id;
    const int *sav_priority;
    int tmp_priority = 100;

    /*
     * Upgrade the target module priority to make the resulting sub-communicator
     * use that collective module
     */
    mca_base_var_find_by_name(prio_string, &var_id);
    mca_base_var_get_value(var_id, &sav_priority, NULL, NULL);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] %s origin %d\n",
                         my_rank, prio_string, *sav_priority));
    mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
    mca_base_var_set_value(var_id, &tmp_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);

    /*
     * Create the sub-communicator
     * Since the target priority has been set to the highest value, this
     * sub-communicator will inherit it as a collective module.
     */
    ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
                         (opal_info_t *)(&ompi_mpi_info_null), sub_comm);

    /*
     * Come back to the target module's original priority
     * NOTE(review): sav_priority points into the MCA variable storage;
     * presumably still valid after the temporary set — confirm.
     */
    mca_base_var_set_value(var_id, sav_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);
    return;
}
/**
* Create a sub-communicator containing one rank per node.
* Associate this sub-communicator a given collective module.
* module can be one of:
* . libnbc
* . adapt
*
* @param comm (IN) original communicator for the collective
* @param prio_string (IN) string containing the mca variable associated to
* target module priority
* @param my_rank (IN) my rank in comm
* @param intra_rank (IN) local rank in the intra-node sub-communicator
* @param sub_comm (OUT) created sub-communicator
*/
/*
 * Split `comm` into inter-node sub-communicators (one rank per node, colored
 * by `intra_rank`, keyed by `my_rank`) while forcing the component named by
 * `prio_string` (e.g. "coll_libnbc_priority") to win module selection, by
 * temporarily raising that component's MCA priority around the split. The
 * prior priority is restored afterwards; this mutates global MCA state, so
 * the set/split/restore order matters.
 */
static void create_internode_comm(ompi_communicator_t *comm,
                                  const char *prio_string,
                                  int my_rank,
                                  int intra_rank,
                                  ompi_communicator_t **sub_comm)
{
    int var_id;
    const int *sav_priority;
    int tmp_priority = 100;

    /*
     * Upgrade the target module priority to make the resulting sub-communicator
     * use that collective module
     */
    mca_base_var_find_by_name(prio_string, &var_id);
    mca_base_var_get_value(var_id, &sav_priority, NULL, NULL);
    OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
                         "[%d] %s origin %d\n", my_rank, prio_string,
                         *sav_priority));
    mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
    mca_base_var_set_value(var_id, &tmp_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);

    /*
     * Create the sub-communicator
     * Since the target priority has been set to the highest value, this
     * sub-communicator will inherit it as a collective module.
     */
    ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false);

    /* Come back to the target module's original priority */
    mca_base_var_set_value(var_id, sav_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);
    return;
}
/*
 * Routine that creates the local hierarchical sub-communicators
 * Called each time a collective is called.
 * comm: input communicator of the collective
 *
 * Creates (and caches on han_module):
 *  . low_comms[0]: intra-node comm backed by the sm module
 *  . low_comms[1]: intra-node comm backed by the shared module
 *  . up_comms[0]:  inter-node comm backed by the libnbc module
 *  . up_comms[1]:  inter-node comm backed by the adapt module
 *  . vranks:       virtual rank of every process in comm
 * The creation path temporarily lowers han's own priority so the
 * sub-communicators do not select han recursively, and temporarily swaps
 * allreduce/allgather back to the pre-han implementations because han's
 * own versions need the very sub-communicators being built here.
 * NOTE: the statement order below is load-bearing (save / override /
 * split / restore); do not reorder.
 */
void mca_coll_han_comm_create(struct ompi_communicator_t *comm,
                              mca_coll_han_module_t *han_module)
{
    int low_rank, low_size;
    int up_rank;
    int w_rank;
    int w_size;
    ompi_communicator_t **low_comms;
    ompi_communicator_t **up_comms;
    const int *origin_priority;
    int han_var_id;
    int tmp_han_priority;
    int vrank, *vranks;
    mca_coll_base_module_allreduce_fn_t old_allreduce;
    mca_coll_base_module_t *old_allreduce_module;
    mca_coll_base_module_allgather_fn_t old_allgather;
    mca_coll_base_module_t *old_allgather_module;

    /* use cached communicators if possible */
    if (han_module->cached_comm == comm &&
        han_module->cached_low_comms != NULL &&
        han_module->cached_up_comms != NULL &&
        han_module->cached_vranks != NULL) {
        return;
    }

    /* We cannot use han allreduce and allgather without sub-communicators
     * Temporary set previous ones
     * (saved here, restored at the end of this function) */
    old_allreduce = comm->c_coll->coll_allreduce;
    old_allreduce_module = comm->c_coll->coll_allreduce_module;
    old_allgather = comm->c_coll->coll_allgather;
    old_allgather_module = comm->c_coll->coll_allgather_module;
    comm->c_coll->coll_allreduce = han_module->previous_allreduce;
    comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module;
    comm->c_coll->coll_allgather = han_module->previous_allgather;
    comm->c_coll->coll_allgather_module = han_module->previous_allgather_module;

    /* create communicators if there is no cached communicator */
    w_rank = ompi_comm_rank(comm);
    w_size = ompi_comm_size(comm);
    low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES *
                                                      sizeof(struct ompi_communicator_t *));
    up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES *
                                                     sizeof(struct ompi_communicator_t *));

    origin_priority = NULL;
    mca_base_var_find_by_name("coll_han_priority", &han_var_id);
    mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL);

    /*
     * Lower down our current priority
     * (so the sub-communicators built below do not select han again)
     */
    tmp_han_priority = 0;
    mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true);
    mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);

    /*
     * Upgrade sm module priority to set up low_comms[0] with sm module
     * This sub-communicator contains the ranks that share my node.
     */
    create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0]));
    /*
     * Get my local rank and the local size
     */
    low_size = ompi_comm_size(low_comms[0]);
    low_rank = ompi_comm_rank(low_comms[0]);
    /*
     * Upgrade shared module priority to set up low_comms[1] with shared module
     * This sub-communicator contains the ranks that share my node.
     */
    create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1]));
    /*
     * Upgrade libnbc module priority to set up up_comms[0] with libnbc module
     * This sub-communicator contains one process per node: processes with the
     * same intra-node rank id share such a sub-communicator
     */
    create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank,
                          &(up_comms[0]));
    up_rank = ompi_comm_rank(up_comms[0]);
    /*
     * Upgrade adapt module priority to set up up_comms[1] with adapt module
     * This sub-communicator contains one process per node.
     */
    create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank,
                          &(up_comms[1]));
    /*
     * Set my virtual rank number.
     * my rank # = <intra-node comm size> * <inter-node rank number>
     *             + <intra-node rank number>
     * WARNING: this formula works only if the ranks are perfectly spread over
     * the nodes
     * TODO: find a better way of doing
     */
    vrank = low_size * up_rank + low_rank;
    vranks = (int *)malloc(sizeof(int) * w_size);
    /*
     * gather vrank from each process so every process will know other processes
     * vrank
     * (this uses the pre-han allgather installed above)
     */
    comm->c_coll->coll_allgather(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm,
                                 comm->c_coll->coll_allgather_module);
    /*
     * Set the cached info
     */
    han_module->cached_comm = comm;
    han_module->cached_low_comms = low_comms;
    han_module->cached_up_comms = up_comms;
    han_module->cached_vranks = vranks;
    /*
     * Come back to the original han module priority
     */
    mca_base_var_set_value(han_var_id, origin_priority, sizeof(int),
                           MCA_BASE_VAR_SOURCE_SET, NULL);
    /* Put allreduce and allgather back */
    comm->c_coll->coll_allreduce = old_allreduce;
    comm->c_coll->coll_allreduce_module = old_allreduce_module;
    comm->c_coll->coll_allgather = old_allgather;
    comm->c_coll->coll_allgather_module = old_allgather_module;
}

347
ompi/mca/coll/han/coll_han_topo.c Обычный файл
Просмотреть файл

@ -0,0 +1,347 @@
/*
* Copyright (c) 2018-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remember that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/
#include "ompi_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include "mpi.h"
#include "coll_han.h"
/*
* Local functions
*/
static int mca_coll_han_hostname_to_number(char* hostname, int size);
static void mca_coll_han_topo_get(int *topo,
struct ompi_communicator_t* comm,
int num_topo_level);
static void mca_coll_han_topo_sort(int *topo, int start, int end,
int level, int num_topo_level);
static bool mca_coll_han_topo_is_mapbycore(int *topo,
struct ompi_communicator_t *comm,
int num_topo_level);
static void mca_coll_han_topo_print(int *topo,
struct ompi_communicator_t *comm,
int num_topo_level);
/*
 * Extract the numeric part of a host name: hhh2031 --> 2031
 * All digit characters are concatenated, wherever they appear
 * (e.g. "node12-3" --> 123); a name with no digit yields 0.
 *
 * @param hostname (IN) host name buffer
 * @param size     (IN) capacity of that buffer; the scan never reads past
 *                      it, even if the name is not NUL-terminated
 * @return the number built from the digits of hostname
 *
 * Rewritten to fold the digits directly into an int: the original
 * malloc'ed a scratch buffer without checking the result and relied on
 * the name being NUL-terminated within `size` bytes.
 */
static int mca_coll_han_hostname_to_number(char* hostname, int size)
{
    int i;
    int number = 0;

    for (i = 0; i < size && hostname[i] != '\0'; i++) {
        if ('0' <= hostname[i] && '9' >= hostname[i]) {
            number = number * 10 + (hostname[i] - '0');
        }
    }
    return number;
}
/*
* Set the virtual topo id. It is made of num_topo_level ints (2 today):
* . the integer part of the host id
* . the rank in the main communicator
* Gather the virtual topoid from each process so every process will know other
* processes virtual topids
*/
static void mca_coll_han_topo_get(int *topo,
struct ompi_communicator_t* comm,
int num_topo_level)
{
int *self_topo = (int *)malloc(sizeof(int) * num_topo_level);
char hostname[1024];
gethostname(hostname, 1024);
self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024);
self_topo[1] = ompi_comm_rank(comm);
ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT,
topo, num_topo_level, MPI_INT, comm,
comm->c_coll->coll_allgather_module);
free(self_topo);
return;
}
/*
 * Sort the topology array in order to have ranks sharing the same node
 * contiguous in the topology array.
 * Called from topo_init whenever the processes are not mapped by core.
 * ex: 4 ranks executing on 2 nodes, mapped by node
 *     ranks 0 and 2 on hid0
 *     ranks 1 and 3 on hid1
 * On entry the topo array looks like
 *     hid0 0 hid1 1 hid0 2 hid1 3
 * After the sort:
 *     hid0 0 hid0 2 hid1 1 hid1 3
 * This is to have the gather result in the right order
 *
 * @param topo  (IN/OUT) topology description array (sorted in out)
 * @param start (IN)     index (in ranks) of the first entry to process
 * @param end   (IN)     index (in ranks) of the last entry to process
 *                       topo holds num_topo_level ids per rank, so entry i
 *                       lives at topo[i * num_topo_level + level]
 * @param level (IN)     level number we are currently processing
 * @param num_topo_level (IN) number of topological levels
 *
 * BUG FIX vs the original: the old recursion step merged the last entry of
 * [start, end] into the preceding group even when its id differed, so the
 * level+1 sort could interleave two groups and break the level-ordering
 * just established (e.g. {h0,0 h0,2 h1,1} became {h0,0 h1,1 h0,2}). Group
 * boundaries are now detected uniformly, including at `end`.
 */
static void mca_coll_han_topo_sort(int *topo, int start, int end,
                                   int level, int num_topo_level)
{
    int i, j;
    int min, min_loc;
    int last, new_start;

    if (level > num_topo_level - 1 || start >= end) {
        return;
    }

    /* Selection sort of [start, end] on the id at the current level.
     * Entire num_topo_level-wide tuples are swapped so the per-rank ids
     * stay together. */
    for (i = start; i <= end; i++) {
        int temp;
        min = INT_MAX;
        min_loc = -1;
        /* locate the minimum id of the remaining entries */
        for (j = i; j <= end; j++) {
            if (topo[j * num_topo_level + level] < min) {
                min = topo[j * num_topo_level + level];
                min_loc = j;
            }
        }
        /* swap tuples i and min_loc */
        for (j = 0; j < num_topo_level; j++) {
            temp = topo[i * num_topo_level + j];
            topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j];
            topo[min_loc * num_topo_level + j] = temp;
        }
    }

    /* Recurse: sort each run of equal ids at this level by the next level. */
    last = topo[start * num_topo_level + level];
    new_start = start;
    for (i = start + 1; i <= end; i++) {
        if (topo[i * num_topo_level + level] != last) {
            /* group [new_start, i-1] ended; sort it by the next level */
            mca_coll_han_topo_sort(topo, new_start, i - 1, level + 1,
                                   num_topo_level);
            new_start = i;
            last = topo[i * num_topo_level + level];
        }
    }
    /* final group [new_start, end] (no-op when it is a single entry) */
    mca_coll_han_topo_sort(topo, new_start, end, level + 1, num_topo_level);
    return;
}
/*
 * Check whether the ranks in the communicator given as input are mapped by core
 * Mapped by core: each node is first filled with as many ranks as needed before
 * moving to the next one
 * This is checked as follows:
 *  . 2 contiguous ranks should be either on the same node or on node ids in
 *    ascending order
 * The topology is actually an array of ints:
 * +----------+-------+----------+-------+------+----------+-------+-----+
 * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... |
 * +----------+-------+----------+-------+------+----------+-------+-----+
 */
static bool mca_coll_han_topo_is_mapbycore(int *topo,
                                           struct ompi_communicator_t *comm,
                                           int num_topo_level)
{
    int idx;
    const int comm_size = ompi_comm_size(comm);

    for (idx = 1; idx < comm_size; idx++) {
        const int *prev = &topo[(idx - 1) * num_topo_level];
        const int *cur  = &topo[idx * num_topo_level];
        /* Host ids must be non-decreasing across consecutive ranks, and
         * within a host the ranks must appear in ascending order. */
        if ((prev[0] > cur[0]) || (prev[1] > cur[1])) {
            return false;
        }
    }
    return true;
}
/*
 * Check whether every node hosts the same number of ranks (ppn).
 * The topo array is supposed sorted by host (see mca_coll_han_topo_sort):
 * it holds num_topo_level ints per rank, the host id first.
 * Returns true when the ppn differ between nodes (fallback algorithms
 * are then required), false when they are balanced.
 */
static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo,
                                       struct ompi_communicator_t *comm,
                                       int num_topo_level){
    int i;
    int size = ompi_comm_size(comm);
    /* 0 or 1 rank: nothing to compare */
    if (size < 2){
        return false;
    }
    int ppn;
    int last_host = topo[0];
    /* Find the ppn for the first node: count ranks until the host id changes */
    for (i = 1; i < size; i++) {
        if (topo[i * num_topo_level] != last_host){
            break;
        }
    }
    ppn = i;
    /* All on one node */
    if ( size == ppn){
        return false;
    }
    /* Trivial case: total rank count not divisible by the first node's ppn,
     * so the nodes cannot all have that ppn */
    if (size % ppn != 0){
        return true;
    }
    last_host = topo[ppn * num_topo_level];
    /* Check that the 2nd and next hosts also have this ppn. Since the topo is
     * sorted one just needs to jump ppn ranks to check the supposed switch of
     * host: i always points at the rank expected to start a new node */
    for (i = 2 * ppn; i < size; i += ppn ){
        /* the list of ranks for the last known host has ended before i-1 */
        if (topo[(i-1) * num_topo_level] != last_host){
            return true;
        }
        /* the list of ranks for the last known host is bigger than expected:
         * the host did not change at the ppn boundary */
        if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){
            return true;
        }
        last_host = topo[i * num_topo_level];
    }
    /* Check the last host: its final rank must still belong to it */
    if (topo[(size-1) * num_topo_level] != last_host){
        return true;
    }
    return false;
}
/**
 * Topology initialization phase
 * Called each time a collective that needs buffer reordering is called
 * Returns the cached topology when it matches comm, otherwise (re)builds it:
 * gather (host id, rank) pairs from everyone, detect map-by-core, sort the
 * array by host when needed, and record whether the ppn are imbalanced.
 *
 * @param comm           (IN) communicator to describe
 * @param han_module     (IN/OUT) module holding the topology cache
 * @param num_topo_level (IN) Number of the topological levels
 * @return the topology array (owned by han_module), or NULL on allocation
 *         failure
 */
int *mca_coll_han_topo_init(struct ompi_communicator_t *comm,
                            mca_coll_han_module_t *han_module,
                            int num_topo_level)
{
    int size;
    int *topo;

    size = ompi_comm_size(comm);
    if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) {
        /* drop a cache built for a different communicator */
        if (han_module->cached_topo) {
            free(han_module->cached_topo);
            han_module->cached_topo = NULL;
        }
        topo = (int *)malloc(sizeof(int) * size * num_topo_level);
        if (NULL == topo) {
            /* the original dereferenced the NULL pointer further down */
            return NULL;
        }
        /* get topo information */
        mca_coll_han_topo_get(topo, comm, num_topo_level);
        /*
         * All the ranks now have the topo information
         */
        /* check if the processes are mapped by core */
        han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level);
        /*
         * If not, sort the topo such that each group of ids is sorted by rank
         * i.e. ids for rank i are contiguous to ids for rank i+1.
         * This will be needed for the operations that are order sensitive
         * (like gather)
         */
        if (!han_module->is_mapbycore) {
            mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level);
        }
        han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level);
        han_module->cached_topo = topo;
        han_module->cached_comm = comm;
    } else {
        topo = han_module->cached_topo;
    }
    /* single debug dump (the original printed twice on the creation path) */
    mca_coll_han_topo_print(topo, comm, num_topo_level);
    return topo;
}
/*
 * Debug dump of the topology array, rank 0 only (verbosity >= 30).
 * This helper is shared by every HAN collective via topo_init, so the
 * label no longer claims to be scatter-specific (that was the original
 * message's defect).
 */
static void mca_coll_han_topo_print(int *topo,
                                    struct ompi_communicator_t *comm,
                                    int num_topo_level)
{
    int rank = ompi_comm_rank(comm);
    int size = ompi_comm_size(comm);

    if (rank == 0) {
        int i;
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank));
        /* one id per output call; opal_output formats each entry separately */
        for (i = 0; i < size * num_topo_level; i++) {
            OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i]));
        }
        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n"));
    }
}

Просмотреть файл

@ -29,19 +29,19 @@
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "ompi/mca/coll/base/coll_base_topo.h"
/* need file reading function */
#include "ompi/mca/coll/base/coll_base_util.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
/* and our own prototypes */
#include "coll_tuned_dynamic_file.h"
#define MYEOF -999
static long getnext (FILE *fptr); /* local function */
static int fileline=0; /* used for verbose error messages */
#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline)
/*
* Reads a rule file called fname
* Builds the algorithm rule table for a max of n_collectives
@ -261,36 +261,3 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
return (-1);
}
/* Consume bytes from fptr until a newline (counted in fileline) or EOF. */
static void skiptonewline (FILE *fptr)
{
    char ch;

    while (1 == fread(&ch, 1, 1, fptr)) {
        if ('\n' == ch) {
            fileline++;
            return;
        }
    }
    /* fread returned 0: end of file reached without a newline */
}
/*
 * Read the next long integer from fptr, skipping bytes that do not parse
 * as an integer and '#'-to-end-of-line comments; newlines bump fileline.
 * Returns MYEOF at end of file.
 */
static long getnext (FILE *fptr)
{
    long val;
    int rc;
    char trash;

    do {
        rc = fscanf(fptr, "%li", &val);
        if (rc == EOF) return MYEOF;
        if (1 == rc) return val;
        /* in all other cases, skip one byte and retry.
         * FIX: fread() returns an element count (0 at end of file), never
         * EOF; the original compared its result against EOF, a test that
         * could never be true. */
        rc = fread(&trash, 1, 1, fptr);
        if (0 == rc) return MYEOF;
        if ('\n' == trash) fileline++;
        if ('#' == trash) skiptonewline (fptr);
    } while (1);
}