diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index b853f1ad26..405bd6b388 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -21,6 +21,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,20 +45,12 @@ #include "opal/mca/base/base.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" - +#include "ompi/mca/coll/base/coll_base_util.h" /* - * Local types + * Stuff for the OBJ interface */ -struct avail_coll_t { - opal_list_item_t super; - - int ac_priority; - mca_coll_base_module_2_3_0_t *ac_module; - const char * ac_component_name; -}; -typedef struct avail_coll_t avail_coll_t; - +OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL); /* * Local functions @@ -77,12 +70,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t * int *priority, mca_coll_base_module_2_3_0_t ** module); -/* - * Stuff for the OBJ interface - */ -static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL); - - #define COPY(module, comm, func) \ do { \ if (NULL != module->coll_ ## func) { \ @@ -138,11 +125,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) /* FIX ME - Do some kind of collective operation to find a module that everyone has available */ + /* List to store every valid module */ + comm->c_coll->module_list = OBJ_NEW(opal_list_t); + /* do the selection loop */ for (item = opal_list_remove_first(selectable); NULL != item; item = opal_list_remove_first(selectable)) { - avail_coll_t *avail = (avail_coll_t *) item; + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; /* initialize the module */ ret = 
avail->ac_module->coll_module_enable(avail->ac_module, comm); @@ -153,6 +143,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) (OMPI_SUCCESS == ret ? "Enabled": "Disabled") ); if (OMPI_SUCCESS == ret) { + /* Save every component that is initialized, + * queried and enabled successfully */ + opal_list_append(comm->c_coll->module_list, &avail->super); /* copy over any of the pointers */ COPY(avail->ac_module, comm, allgather); @@ -230,10 +223,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) COPY(avail->ac_module, comm, neighbor_alltoallw_init); COPY(avail->ac_module, comm, reduce_local); + } else { + /* release the original module reference and the list item */ + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); } - /* release the original module reference and the list item */ - OBJ_RELEASE(avail->ac_module); - OBJ_RELEASE(avail); } /* Done with the list from the check_components() call so release it. */ @@ -306,8 +300,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) static int avail_coll_compare (opal_list_item_t **a, opal_list_item_t **b) { - avail_coll_t *acoll = (avail_coll_t *) *a; - avail_coll_t *bcoll = (avail_coll_t *) *b; + mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a; + mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b; if (acoll->ac_priority > bcoll->ac_priority) { return 1; @@ -332,7 +326,7 @@ static opal_list_t *check_components(opal_list_t * components, mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; - avail_coll_t *avail; + mca_coll_base_avail_coll_t *avail; /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -345,7 +339,7 @@ static opal_list_t *check_components(opal_list_t * components, if (priority >= 0) { /* We have a component that indicated that it wants to run by giving us a module */ - avail = OBJ_NEW(avail_coll_t); + avail = 
OBJ_NEW(mca_coll_base_avail_coll_t); avail->ac_priority = priority; avail->ac_module = module; // Point to the string so we don't have to free later diff --git a/ompi/mca/coll/base/coll_base_comm_unselect.c b/ompi/mca/coll/base/coll_base_comm_unselect.c index fea0a53ec7..0e0f1bb5bf 100644 --- a/ompi/mca/coll/base/coll_base_comm_unselect.c +++ b/ompi/mca/coll/base/coll_base_comm_unselect.c @@ -16,6 +16,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_util.h" #define CLOSE(comm, func) \ do { \ @@ -50,6 +52,8 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) { + opal_list_item_t *item; + CLOSE(comm, allgather); CLOSE(comm, allgatherv); CLOSE(comm, allreduce); @@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) CLOSE(comm, reduce_local); + for (item = opal_list_remove_first(comm->c_coll->module_list); + NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) { + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; + + if(avail->ac_module) { + OBJ_RELEASE(avail->ac_module); + } + OBJ_RELEASE(avail); + } + OBJ_RELEASE(comm->c_coll->module_list); + free(comm->c_coll); comm->c_coll = NULL; diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 91dd677dbc..29b4a70cac 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -305,3 +305,39 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) { } OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL); + +/* File reading functions */ +static void skiptonewline (FILE *fptr, int 
*fileline) +{ + do { + char val; + int rc; + + rc = fread(&val, 1, 1, fptr); + if (0 == rc) return; + if ((1 == rc)&&('\n' == val)) { + (*fileline)++; + return; + } + } while (1); +} + +long ompi_coll_base_file_getnext (FILE *fptr, int *fileline) +{ + do { + long val; + int rc; + char trash; + + rc = fscanf(fptr, "%li", &val); + if (rc == EOF) return MYEOF; + if (1 == rc) return val; + /* in all other cases, skip to the end */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) return MYEOF; /* fread reports EOF/error as a short item count (0), never as EOF; trash is unset in that case */ + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 05eaa41953..239322b022 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve) typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t; +/* + * Structure to store an available module + */ +struct mca_coll_base_avail_coll_t { + opal_list_item_t super; + + int ac_priority; + mca_coll_base_module_t *ac_module; + const char * ac_component_name; +}; +typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t); + /** * A MPI_like function doing a send and a receive simultaneously. * If one of the communications results in a zero-byte message the @@ -164,5 +177,9 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const stypes[], ompi_datatype_t * const rtypes[]); +/* File reading function */ +#define MYEOF -999 +long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); + END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/coll.h b/ompi/mca/coll/coll.h index f852f26732..57e4af4ac0 100644 --- a/ompi/mca/coll/coll.h +++ b/ompi/mca/coll/coll.h @@ -19,6 +19,7 @@ * and Technology (RIST). 
All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t { mca_coll_base_module_reduce_local_fn_t coll_reduce_local; mca_coll_base_module_2_3_0_t *coll_reduce_local_module; + + /* List of modules initialized, queried and enabled */ + opal_list_t *module_list; }; typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t; diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index 380b44d615..55892512e3 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -12,6 +12,8 @@ sources = \ coll_han.h \ coll_han_trigger.h \ +coll_han_dynamic.h \ +coll_han_dynamic_file.h \ coll_han_bcast.c \ coll_han_reduce.c \ coll_han_scatter.c \ @@ -21,6 +23,10 @@ coll_han_allgather.c \ coll_han_component.c \ coll_han_module.c \ coll_han_trigger.c \ +coll_han_dynamic.c \ +coll_han_dynamic_file.c \ +coll_han_topo.c \ +coll_han_subcomms.c \ coll_han_utils.c # Make the output library in this directory, and name it either diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 307fa52044..1af75ffec3 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -16,19 +17,23 @@ #include "mpi.h" #include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/communicator/communicator.h" -#include "ompi/include/mpi.h" -#include "ompi/mca/coll/base/coll_base_functions.h" -#include "opal/util/info.h" -#include "ompi/op/op.h" -#include "opal/runtime/opal_progress.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" -BEGIN_C_DECLS typedef struct { +BEGIN_C_DECLS + +/* + * Today; + * . only 2 modules available for intranode (low) level + * . only 2 modules available for internode (up) level + */ + +#define COLL_HAN_LOW_MODULES 2 +#define COLL_HAN_UP_MODULES 2 + +typedef struct { uint32_t umod; uint32_t lmod; uint32_t fs; @@ -200,6 +205,10 @@ typedef struct mca_coll_han_component_t { uint32_t han_scatter_low_module; /* whether enable auto tune */ uint32_t han_auto_tune; + /* whether we need reproducible results + * (but disables topological optimisations) + */ + uint32_t han_reproducible; /* create a 3D array * num_processes (n): 2 4 8 16 32 64 (6) * num_core (c): 2 4 8 12 (4) @@ -209,8 +218,42 @@ typedef struct mca_coll_han_component_t { uint32_t han_auto_tune_c; uint32_t han_auto_tune_m; selection *han_auto_tuned; + bool use_simple_algorithm[COLLCOUNT]; + + /* Dynamic configuration rules */ + bool use_dynamic_file_rules; + bool dump_dynamic_rules; + char* dynamic_rules_filename; + /* Dynamic rules from file */ + mca_coll_han_dynamic_rules_t dynamic_rules; + /* Dynamic rules from mca parameter */ + COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; + int topo_level; + + /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ + int max_dynamic_errors; } mca_coll_han_component_t; +typedef void (*previous_dummy_fn_t) (void); + +/* + * Structure used to store what is 
necessary for the collective operations + * routines in case of fallback. + */ +typedef struct collective_fallback_t { + union { + mca_coll_base_module_allgather_fn_t allgather; + mca_coll_base_module_allgatherv_fn_t allgatherv; + mca_coll_base_module_allreduce_fn_t allreduce; + mca_coll_base_module_bcast_fn_t bcast; + mca_coll_base_module_gather_fn_t gather; + mca_coll_base_module_reduce_fn_t reduce; + mca_coll_base_module_scatter_fn_t scatter; + previous_dummy_fn_t dummy; + } previous_routine; + mca_coll_base_module_t *previous_module; +} collective_fallback_t; + /** Coll han module */ typedef struct mca_coll_han_module_t { /** Base module */ @@ -225,9 +268,56 @@ typedef struct mca_coll_han_module_t { int *cached_vranks; int *cached_topo; bool is_mapbycore; + bool are_ppn_imbalanced; + + /* To be able to fallback when the cases are not supported */ + struct collective_fallback_t previous_routines[COLLCOUNT]; + + /* To be able to fallback on reproducible algorithm */ + mca_coll_base_module_reduce_fn_t reproducible_reduce; + mca_coll_base_module_t *reproducible_reduce_module; + mca_coll_base_module_allreduce_fn_t reproducible_allreduce; + mca_coll_base_module_t *reproducible_allreduce_module; + + /* Topological level of this communicator */ + int topologic_level; + + /* Collective module storage for module choice */ + mca_coll_han_collective_modules_storage_t modules_storage; + bool storage_initialized; + + /* + * Number of dynamic errors encountered + * The first mca_coll_han_component.max_dynamic_errors + * of rank 0 are printed with verbosity = 0 + */ + int dynamic_errors; + + /* Sub-communicator */ + struct ompi_communicator_t *sub_comm[NB_TOPO_LVL]; } mca_coll_han_module_t; OBJ_CLASS_DECLARATION(mca_coll_han_module_t); +/* + * Some defines to stick to the naming used in the other components in terms of + * fallback routines + */ +#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather +#define previous_allgatherv 
previous_routines[ALLGATHERV].previous_routine.allgatherv +#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce +#define previous_bcast previous_routines[BCAST].previous_routine.bcast +#define previous_gather previous_routines[GATHER].previous_routine.gather +#define previous_reduce previous_routines[REDUCE].previous_routine.reduce +#define previous_scatter previous_routines[SCATTER].previous_routine.scatter + +#define previous_allgather_module previous_routines[ALLGATHER].previous_module +#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module +#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module +#define previous_bcast_module previous_routines[BCAST].previous_module +#define previous_gather_module previous_routines[GATHER].previous_module +#define previous_reduce_module previous_routines[REDUCE].previous_module +#define previous_scatter_module previous_routines[SCATTER].previous_module + /** * Global component instance */ @@ -244,17 +334,10 @@ int han_request_free(ompi_request_t ** request); /* Subcommunicator creation */ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); - +void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); /* Gather topology information */ -int mca_coll_han_pow10_int(int pow_value); -int mca_coll_han_hostname_to_number(char *hostname, int size); -void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level); -void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level); -bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t *comm, - int num_topo_level); int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); -void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level); /* Utils */ 
void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, @@ -263,8 +346,47 @@ uint32_t han_auto_tuned_get_n(uint32_t n); uint32_t han_auto_tuned_get_c(uint32_t c); uint32_t han_auto_tuned_get_m(uint32_t m); +const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll); +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); + +/** Dynamic component choice */ +/* + * Get all the collective modules initialized on this communicator + * This function must be call at the start of every selector implementation + */ +int +mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module); + +int +mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allgatherv_intra_dynamic(ALLGATHERV_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_intra_dynamic(ALLREDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_bcast_intra_dynamic(BCAST_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_gather_intra_dynamic(GATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_intra_dynamic(REDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, + mca_coll_base_module_t *module); /* Bcast */ +int mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, int seg_count, struct ompi_datatype_t *dtype, int root_up_rank, int root_low_rank, @@ -278,6 +400,30 @@ int mca_coll_han_bcast_t0_task(void *task_argu); int mca_coll_han_bcast_t1_task(void *task_argu); /* Reduce */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + + + void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, int seg_count, struct ompi_datatype_t *dtype, @@ -301,6 +447,26 @@ int mca_coll_han_reduce_t0_task(void *task_argu); int mca_coll_han_reduce_t1_task(void *task_argu); /* Allreduce */ +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, @@ -355,7 +521,7 @@ void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, struct ompi_communicator_t *low_comm, int w_rank, bool noop, ompi_request_t * req); -/* Gatter */ +/* Gather */ int mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, @@ -380,6 +546,23 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, int w_rank, bool noop, ompi_request_t * req); +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct 
ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +/* reordering after gather, for unordered ranks */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); + + /* Allgather */ int @@ -405,6 +588,13 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, struct ompi_communicator_t *low_comm, int w_rank, bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); END_C_DECLS #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index 2f0e3c45bd..50702d28ff 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +29,10 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, int w_rank, - bool noop, bool is_mapbycore, int *topo, ompi_request_t * req) + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) { argu->cur_task = cur_task; argu->sbuf = sbuf; @@ -53,18 +57,17 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { int w_rank; w_rank = ompi_comm_rank(comm); /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_allgather_low_module]; - ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_allgather_up_module]; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); ompi_request_t *temp_request = NULL; @@ -160,7 +163,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu) } else { ptrdiff_t rsize, rgap = 0; rsize = - opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size * up_size, + opal_datatype_span(&t->rdtype->super, + (int64_t) t->rcount * low_size * up_size, &rgap); reorder_buf = (char *) malloc(rsize); reorder_rbuf = reorder_buf - rgap; @@ -190,7 +194,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu) "[%d]: HAN Allgather copy from %d %d\n", t->w_rank, (i * low_size + j) * 2 + 1, t->topo[(i * low_size + j) * 2 + 1])); - 
ompi_datatype_copy_content_same_ddt(t->rdtype, (ptrdiff_t) t->rcount, + ompi_datatype_copy_content_same_ddt(t->rdtype, + (ptrdiff_t) t->rcount, (char *) t->rbuf + rextent * (ptrdiff_t) t->topo[(i * low_size + j) * 2 + @@ -238,3 +243,108 @@ int mca_coll_han_allgather_lb_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ + + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + /* discovery topology */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, + comm, han_module->previous_allgather_module); + } + + /* setup up/low coordinates */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); + int up_size = ompi_comm_size(up_comm); + int root_low_rank = 0; // node leader will be 0 on each rank + + /* allocate the intermediary buffer + * to gather on leaders on the low sub communicator */ + char *tmp_buf = NULL; + char *tmp_buf_start = NULL; + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + /* Compute the size to receive all the local data, including datatypes empty gaps */ + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); + // intermediary buffer on node leaders to gather on low comm + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + } + + /* 1. low gather on node leaders into tmp_buf */ + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + /* 2. 
allgather between node leaders, from tmp_buf to reorder_buf */ + if (low_rank == root_low_rank) { + /* allocate buffer to store unordered result on node leaders + * * if the processes are mapped-by core, no need to reorder: + * * distribution of ranks on core first and node next, + * * in an increasing order for both patterns */ + char *reorder_buf = NULL; + char *reorder_buf_start = NULL; + if (han_module->is_mapbycore) { + reorder_buf_start = rbuf; + } else { + if (0 == low_rank && 0 == up_rank) { // first rank displays message + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Allgather needs reordering: ", ompi_comm_rank(comm))); + } + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); + reorder_buf = (char *) malloc(rsize); + reorder_buf_start = reorder_buf - rgap; + } + + /* 2a. inter node allgather */ + up_comm->c_coll->coll_allgather(tmp_buf_start, scount*low_size, sdtype, + reorder_buf_start, rcount*low_size, rdtype, + up_comm, up_comm->c_coll->coll_allgather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + + /* 2b. reorder the node leader's into rbuf. + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (!han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + reorder_buf = NULL; + } + + } + + /* 3. 
up broadcast: leaders broadcast on their nodes */ + low_comm->c_coll->coll_bcast(rbuf, rcount*low_size*up_size, rdtype, + root_low_rank, low_comm, + low_comm->c_coll->coll_bcast_module); + + + return OMPI_SUCCESS; + } diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 629b93a1c9..6a4fd6038f 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -2,6 +2,8 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -78,6 +80,17 @@ mca_coll_han_allreduce_intra(const void *sbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { + // Fallback to another component if the op cannot commute + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + if (! ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator." 
+ " It need to fall back on another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); + } + + ptrdiff_t extent, lb; ompi_datatype_get_extent(dtype, &lb, &extent); int w_rank; @@ -87,7 +100,6 @@ mca_coll_han_allreduce_intra(const void *sbuf, ompi_datatype_type_size(dtype, &typelng); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -393,3 +405,145 @@ int mca_coll_han_allreduce_t3_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + int root_low_rank = 0; + int low_rank; + int ret; + mca_coll_han_component_t *cs = &mca_coll_han_component; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + OPAL_OUTPUT_VERBOSE((10, cs->han_output, + "[OMPI][han] in mca_coll_han_allreduce_intra_simple\n")); + + // Fallback to another component if the op cannot commute + if (! ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "han cannot handle allreduce with this operation. " 
+ "It need to fall back on another component\n")); + goto prev_allreduce; + } + + mca_coll_han_comm_create_new(comm, han_module); + + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + low_rank = ompi_comm_rank(low_comm); + + /* Low_comm reduce */ + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm reduce failed. " + "Falling back to another component\n")); + goto prev_allreduce; + } + + /* Local roots perform an allreduce on the upper comm */ + if (low_rank == root_low_rank) { + ret = up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, rbuf, count, dtype, op, + up_comm, up_comm->c_coll->coll_allreduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: up comm allreduce failed. \n")); + /* + * Do not fallback in such a case: only root_low_ranks follow this + * path, the other ranks are in another collective. + * ==> Falling back would potentially lead to a hang. + * Simply return the error + */ + return ret; + } + } + + /* Low_comm bcast */ + ret = low_comm->c_coll->coll_bcast(rbuf, count, dtype, + root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm bcast failed. 
" + "Falling back to another component\n")); + goto prev_allreduce; + } + + return OMPI_SUCCESS; + +prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, + han_module->previous_allreduce_module); +} + +/* Find a fallback on reproducible algorithm + * use tuned, or if impossible whatever available + */ +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules*/ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i = 0; i < fallbacks_len; i++) { + int fallback = fallbacks[i]; + mca_coll_base_module_t *fallback_module = han_module->modules_storage + .modules[fallback] + .module_handler; + if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible: " + "fallback on %s\n", + components_name[fallback]); + } + han_module->reproducible_allreduce_module = fallback_module; + han_module->reproducible_allreduce = fallback_module->coll_allreduce; + return OMPI_SUCCESS; + } + } + /* fallback of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_allreduce_module = + han_module->previous_allreduce_module; + han_module->reproducible_allreduce = han_module->previous_allreduce; + return OMPI_SUCCESS; +} + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = 
(mca_coll_han_module_t *)module; + return han_module->reproducible_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->reproducible_allreduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 35c0a461f9..6eebc3b7d3 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,10 +64,22 @@ mca_coll_han_bcast_intra(void *buff, w_rank = ompi_comm_rank(comm); int seg_count = count; size_t typelng; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, + comm, han_module->previous_bcast_module); + } + ompi_datatype_type_size(dtype, &typelng); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -220,3 +233,60 @@ int mca_coll_han_bcast_t1_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank; + w_rank = ompi_comm_rank(comm); + + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int root_low_rank; + int root_up_rank; + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, + comm, han_module->previous_bcast_module); + } else { + OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, + "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); + } + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", + w_rank, root_low_rank, root_up_rank)); + + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); + + /* To remove when han has better sub-module selection. + For now switching to ibcast enables to make runs with libnbc. */ + //ompi_request_t req; + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //ompi_request_wait(&req, MPI_STATUS_IGNORE); + + } + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 2aa5bbd7c2..cfb40c7da0 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -22,6 +23,8 @@ #include "ompi/constants.h" #include "ompi/mca/coll/coll.h" #include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" /* * Public string showing the coll ompi_han component version number @@ -84,6 +87,7 @@ mca_coll_han_component_t mca_coll_han_component = { */ static int han_open(void) { + int param; mca_coll_han_component_t *cs = &mca_coll_han_component; if (cs->han_auto_tune) { cs->han_auto_tuned = @@ -95,7 +99,16 @@ static int han_open(void) 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file); fclose(file); } - return OMPI_SUCCESS; + + /* + * Get the global coll verbosity: it will be ours + */ + cs->han_output = ompi_coll_base_framework.framework_output; + opal_output_verbose(1, cs->han_output, + "coll:han:component_open: done!"); + + cs->topo_level = GLOBAL_COMMUNICATOR; + return mca_coll_han_init_dynamic_rules(); } @@ -109,9 +122,89 @@ static int han_close(void) free(cs->han_auto_tuned); cs->han_auto_tuned = NULL; } + mca_coll_han_free_dynamic_rules(); return OMPI_SUCCESS; } +static bool is_simple_implemented(COLLTYPE_T coll) +{ + switch(coll) { + case ALLGATHER: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + return true; + default: + return false; + } +} + +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) +{ + switch(topo_lvl) { + case INTRA_NODE: + return "intra_node"; + case INTER_NODE: + return "inter_node"; + case GLOBAL_COMMUNICATOR: + return "global_communicator"; + case NB_TOPO_LVL: + default: + return "invalid topologic level"; + } +} +const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll) +{ + switch(coll) { + case ALLGATHER: + return "allgather"; + case ALLGATHERV: + return "allgatherv"; + case ALLREDUCE: + return "allreduce"; + case ALLTOALL: + return "alltoall"; + case ALLTOALLV: + return "alltoallv"; + case ALLTOALLW: + return "alltoallw"; + case BARRIER: + return "barrier"; + case BCAST: + 
return "bcast"; + case EXSCAN: + return "exscan"; + case GATHER: + return "gather"; + case GATHERV: + return "gatherv"; + case REDUCE: + return "reduce"; + case REDUCESCATTER: + return "reduce_scatter"; + case REDUCESCATTERBLOCK: + return "reduce_scatter_block"; + case SCAN: + return "scan"; + case SCATTER: + return "scatter"; + case SCATTERV: + return "scatterv"; + case NEIGHBOR_ALLGATHER: + return "neighbor_allgather"; + case NEIGHBOR_ALLGATHERV: + return "neighbor_allgatherv"; + case NEIGHBOR_ALLTOALL: + return "neighbor_alltoall"; + case NEIGHBOR_ALLTOALLV: + return "neighbor_alltoallv"; + case NEIGHBOR_ALLTOALLW: + return "neighbor_alltoallw"; + default: + return ""; + } +} /* * Register MCA params @@ -121,21 +214,20 @@ static int han_register(void) mca_base_component_t *c = &mca_coll_han_component.super.collm_version; mca_coll_han_component_t *cs = &mca_coll_han_component; - cs->han_priority = 50; + /* Generated parameters name and description */ + char param_name[100] = ""; + char param_desc[300] = ""; + int param_desc_size; + COLLTYPE_T coll; + TOPO_LVL_T topo_lvl; + COMPONENT_T component; + + cs->han_priority = 0; (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); - int coll_han_verbose = 0; - (void) mca_base_component_var_register(c, "verbose", - "Verbose level", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &coll_han_verbose); - cs->han_output = opal_output_open(NULL); - opal_output_set_verbosity(cs->han_output, coll_han_verbose); - cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", @@ -254,6 +346,93 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + cs->han_reproducible = 0; + (void) mca_base_component_var_register(c, "reproducible", + "whether we need 
reproducible results " + "(enabling this disables optimisations using topology)" + "0 disable 1 enable, default 0", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_reproducible); + + /* Simple algorithms MCA parameters */ + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + cs->use_simple_algorithm[coll] = false; + if(is_simple_implemented(coll)) { + snprintf(param_name, 100, "use_simple_%s", + mca_coll_han_colltype_to_str(coll)); + snprintf(param_desc, 300, "whether to enable simple algo for %s", + mca_coll_han_colltype_to_str(coll)); + mca_base_component_var_register(c, param_name, + param_desc, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_simple_algorithm[coll])); + } + } + + /* Dynamic rules MCA parameters */ + /* TODO: Find a way to avoid unused entried */ + memset(cs->mca_rules, 0, + COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { + continue; + } + /* + * Default values + * Do not avoid to set correct default parameters + */ + cs->mca_rules[coll][INTRA_NODE] = TUNED; + cs->mca_rules[coll][INTER_NODE] = BASIC; + cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; + + for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { + + snprintf(param_name, 100, "%s_dynamic_%s_module", + mca_coll_han_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + + param_desc_size = snprintf(param_desc, 300, + "Collective module to use for " + "collective %s on %s topological level: ", + mca_coll_han_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + /* + * Exhaustive description: + * 0 = self; 1 = basic; 2 = libnbc; ... 
+ * FIXME: Do not print component not providing this collective + */ + for(component = 0 ; component < COMPONENTS_COUNT ; component++) { + if(HAN == component && GLOBAL_COMMUNICATOR != topo_lvl) { + /* Han can only be used on the global communicator */ + continue; + } + param_desc_size += snprintf(param_desc+param_desc_size, 300, + "%d = %s; ", + component, + components_name[component]); + } + + mca_base_component_var_register(c, param_name, param_desc, + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->mca_rules[coll][topo_lvl])); + } + } + + /* + * TODO: remove the following lines when auto-tune is added back to the code + */ + cs->han_auto_tune = 0; + + cs->han_auto_tune_n = 5; + cs->han_auto_tune_c = 3; + cs->han_auto_tune_m = 21; +#if 0 cs->han_auto_tune_n = 5; (void) mca_base_component_var_register(c, "auto_tune_n", "auto tune n", @@ -273,7 +452,65 @@ static int han_register(void) "auto tune n", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_m); + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_auto_tune_m); +#endif + + /* Dynamic rules */ + cs->use_dynamic_file_rules = false; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "use_dynamic_file_rules", + "Switch used to decide if we use " + "dynamic module choice rules " + "defines by file", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_dynamic_file_rules)); + + cs->dynamic_rules_filename = NULL; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dynamic_rules_filename", + "Filename of configuration file that " + "contains the dynamic module choice rules", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dynamic_rules_filename)); + + cs->dump_dynamic_rules = false; + (void) 
mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dump_dynamic_rules", + "Switch used to decide if we dump " + "dynamic rules provided by " + "configuration file", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dump_dynamic_rules)); + + if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) + && !cs->use_dynamic_file_rules) { + opal_output_verbose(0, cs->han_output, + "coll:han:han_register " + "you asked for dynamic rules " + "but they are not activated. " + "Check coll_han_use_dynamic_file_rules " + "MCA parameter"); + } + + cs->max_dynamic_errors = 10; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "max_dynamic_errors", + "Number of dynamic rules module/function " + "errors printed on rank 0 " + "with a 0 verbosity." + "Useless if coll_base_verbose is 30 or more.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->max_dynamic_errors)); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c new file mode 100644 index 0000000000..2cda40e34b --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -0,0 +1,1338 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/class/opal_list.h" +#include "ompi/mca/coll/han/coll_han.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" +#include "ompi/mca/coll/base/coll_base_util.h" + +/* + * Tests if a dynamic collective is implemented + * Usefull for file reading warnings and MCA parameter generation + * When a new dynamic collective is implemented, this function must + * return true for it + */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) +{ + switch (coll_id){ + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; + } +} + +static COMPONENT_T +component_name_to_id(const char* name) +{ + int i; + + if(NULL == name) { + return -1; + } + + for(i=SELF ; itopologic_level; + mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + /* If the modules are get yet, return success */ + if(han_module->storage_initialized) { + return OMPI_SUCCESS; + } + /* This list is populated at communicator creation */ + OPAL_LIST_FOREACH(item, + comm->c_coll->module_list, + mca_coll_base_avail_coll_t) { + mca_coll_base_module_t *module = item->ac_module; + const char *name = item->ac_component_name; + int id = component_name_to_id(name); + + if(id >= 0 && NULL != module && module != han_base_module) { + /* + * The identifier is correct + * Store the module + */ + han_module->modules_storage.modules[id].module_handler = module; + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules " + "Han found module %s with id %d " + "for topological level %d (%s) " + "for communicator (%d/%s)\n", + name, + id, + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + nb_modules++; + } + } + + /* + * Add han_module on global communicator only + * to prevent any recursive call + */ + 
if(GLOBAL_COMMUNICATOR == han_module->topologic_level) { + han_module->modules_storage.modules[HAN].module_handler = han_base_module; + nb_modules++; + } + + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules " + "Han sub-communicator modules storage " + "for topological level %d (%s) " + "gets %d modules " + "for communicator (%d/%s)\n", + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, + comm->c_contextid, + comm->c_name); + + assert(0 != nb_modules); + + /* The modules are get */ + han_module->storage_initialized = true; + return OMPI_SUCCESS; +} + +/* + * Find the correct rule in the dynamic rules + * Assume rules are sorted by increasing value + */ +static const msg_size_rule_t* +get_dynamic_rule(COLLTYPE_T collective, + int msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + /* Indexes of the rule */ + int coll_idx; + int topo_idx; + int conf_idx; + int msg_size_idx; + + /* Aliases */ + const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; + const collective_rule_t *coll_rule = NULL; + const topologic_rule_t *topo_rule = NULL; + const configuration_rule_t *conf_rule = NULL; + const msg_size_rule_t *msg_size_rule = NULL; + + const TOPO_LVL_T topo_lvl = han_module->topologic_level; + const int comm_size = ompi_comm_size(comm); + + COMPONENT_T component; + + /* Find the collective rule */ + dynamic_rules = &(mca_coll_han_component.dynamic_rules); + for(coll_idx = dynamic_rules->nb_collectives-1 ; + coll_idx >= 0 ; coll_idx--) { + if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { + coll_rule = &(dynamic_rules->collective_rules[coll_idx]); + break; + } + } + if(coll_idx < 0) { + /* + * No dynamic rules for this collective + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched for collective %d (%s) " + "but did not find any rule for this collective\n", + collective, + 
mca_coll_han_colltype_to_str(collective)); + return NULL; + } + + /* Find the topologic level rule */ + for(topo_idx = coll_rule->nb_topologic_levels-1 ; + topo_idx >= 0 ; topo_idx--) { + if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { + topo_rule = &(coll_rule->topologic_rules[topo_idx]); + break; + } + } + if(topo_idx < 0) { + /* + * No topologic level rules for this collective + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched for topologic level %d (%s) rule " + "for collective %d (%s) but did not find any rule\n", + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, + mca_coll_han_colltype_to_str(collective)); + return NULL; + } + + /* Find the configuration rule */ + for(conf_idx = topo_rule->nb_rules-1 ; + conf_idx >= 0 ; conf_idx--) { + if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { + conf_rule = &(topo_rule->configuration_rules[conf_idx]); + break; + } + } + if(conf_idx < 0) { + /* + * No corresponding configuration + * Should not happen with a correct file + */ + + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "but did not manage to find anything. 
" + "This is the result of an invalid configuration file: " + "the first configuration size of each collective must be 1\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size); + return NULL; + } + + /* Find the message size rule */ + for(msg_size_idx = conf_rule->nb_msg_size-1 ; + msg_size_idx >= 0 ; msg_size_idx--) { + if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { + msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); + break; + } + } + if(msg_size_idx < 0) { + /* + * No corresponding message size + * Should not happen with a correct file + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message " + "but did not manage to find anything. " + "This is the result of an invalid configuration file: " + "the first message size of each configuration must be 0\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, + msg_size); + + return NULL; + } + + component = msg_size_rule->component; + /* + * We have the final rule to use + * Module correctness is checked outside + */ + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message. 
" + "Found a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message : component %d (%s)\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, + msg_size, + msg_size_rule->collective_id, + mca_coll_han_colltype_to_str(msg_size_rule->collective_id), + msg_size_rule->topologic_level, + mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), + msg_size_rule->configuration_size, + msg_size_rule->msg_size, + component, + components_name[component]); + + return msg_size_rule; +} + +/* + * Return the module to use for the collective coll_id + * for a msg_size sized message on the comm communicator + * following the dynamic rules + */ +mca_coll_base_module_t * +get_module(COLLTYPE_T coll_id, + int msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + const msg_size_rule_t *dynamic_rule; + mca_coll_base_module_t *sub_module = NULL; + TOPO_LVL_T topo_lvl; + COMPONENT_T mca_rule_component; + + topo_lvl = han_module->topologic_level; + mca_rule_component = mca_coll_han_component.mca_rules[coll_id][topo_lvl]; + + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* Find the correct dynamic rule to check */ + dynamic_rule = get_dynamic_rule(coll_id, + msg_size, + comm, + han_module); + if(NULL != dynamic_rule) { + /* Use dynamic rule from file */ + sub_module = han_module->modules_storage + .modules[dynamic_rule->component] + .module_handler; + } else { + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { + /* + * Invalid MCA parameter value + * Warn the user and return NULL + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module " + "Invalid MCA parameter value %d " + "for collective %d (%s) " + "on topologic level %d (%s)\n", + mca_rule_component, + coll_id, + 
mca_coll_han_colltype_to_str(coll_id), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; + } + sub_module = han_module->modules_storage + .modules[mca_rule_component] + .module_handler; + } + + return sub_module; +} + + +/* + * Allgather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(sdtype, &dtype_size); + msg_size = dtype_size * scount; + + sub_module = get_module(ALLGATHER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). 
" + "Please check dynamic file/mca parameters\n", + ALLGATHER, + mca_coll_han_colltype_to_str(ALLGATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + han_module + ->previous_allgather_module); + } else if (NULL == sub_module->coll_allgather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + ALLGATHER, + mca_coll_han_colltype_to_str(ALLGATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: the module found for the sub-" + "communicator cannot handle the ALLGATHER operation. 
" + "Falling back to another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + han_module + ->previous_allgather_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgather is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_allgather_fn_t allgather; + if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { + allgather = mca_coll_han_allgather_intra_simple; + } else { + allgather = mca_coll_han_allgather_intra; + } + + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); +} + + +/* + * Allgatherv selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + * The allgatherv size is the size of the biggest segment + */ +int +mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size, msg_size; + int rank; + int verbosity; + int comm_size; + int i; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for 
dynamic rules */ + comm_size = ompi_comm_size(comm); + ompi_datatype_type_size(rdtype, &dtype_size); + + msg_size = 0; + for(i = 0 ; i < comm_size ; i++) { + if(dtype_size * rcounts[i] > msg_size) { + msg_size = dtype_size * rcounts[i]; + } + } + + sub_module = get_module(ALLGATHERV, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } else if (NULL == sub_module->coll_allgatherv) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. 
" + "Please check dynamic file/mca parameters\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: the module found for the sub-" + "communicator cannot handle the ALLGATHERV operation. " + "Falling back to another component\n")); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgatherv is valid and point to this function + * Call han topological collective algorithm + */ + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han used for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective on this topologic level\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); +} + + +/* + * Allreduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if 
fallback mechanism is activated + */ +int +mca_coll_han_allreduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules. NOTE(review): the size_t product below is narrowed to the int msg_size - confirm the behavior for messages larger than INT_MAX */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(ALLREDUCE, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0: verbosity is 0 on rank 0 while dynamic_errors is below max_dynamic_errors, 30 otherwise */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, + mca_coll_han_colltype_to_str(ALLREDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: No module found for the sub-" + "communicator. 
" + "Falling back to another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->previous_allreduce_module); + } else if (NULL == sub_module->coll_allreduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, + mca_coll_han_colltype_to_str(ALLREDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: the module found for the sub-" + "communicator cannot handle the ALLREDUCE operation. " + "Falling back to another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->previous_allreduce_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fall back on the reproducible algorithm */ + if (mca_coll_han_component.han_reproducible) { + return mca_coll_han_allreduce_reproducible(sbuf, rbuf, count, dtype, op, + comm, module); + } + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and points to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_allreduce_fn_t allreduce; + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } + return allreduce(sbuf, rbuf, count, dtype, + op, comm, module); + } + + /* + * If we get here: + * sub_module is valid + * 
sub_module->coll_allreduce is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); +} + + +/* + * Bcast selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_bcast_intra_dynamic(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules. NOTE(review): the size_t product below is narrowed to the int msg_size - confirm the behavior for messages larger than INT_MAX */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(BCAST, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0: verbosity is 0 on rank 0 while dynamic_errors is below max_dynamic_errors, 30 otherwise */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). 
" + "Please check dynamic file/mca parameters\n", + BCAST, + mca_coll_han_colltype_to_str(BCAST), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, comm, + han_module->previous_bcast_module); + } else if (NULL == sub_module->coll_bcast) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + BCAST, + mca_coll_han_colltype_to_str(BCAST), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: the module found for the sub-" + "communicator cannot handle the BCAST operation. 
" + "Falling back to another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, comm, + han_module->previous_bcast_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_bcast is valid and points to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_bcast_fn_t bcast; + if(mca_coll_han_component.use_simple_algorithm[BCAST]) { + bcast = mca_coll_han_bcast_intra_simple; + } else { + bcast = mca_coll_han_bcast_intra; + } + return bcast(buff, + count, + dtype, + root, + comm, + module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_bcast(buff, + count, + dtype, + root, + comm, + sub_module); +} + + +/* + * Gather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules. 
NOTE(review): the size_t product below is narrowed to the int msg_size - confirm the behavior for messages larger than INT_MAX */ + ompi_datatype_type_size(sdtype, &dtype_size); + msg_size = dtype_size * scount; + + sub_module = get_module(GATHER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0: verbosity is 0 on rank 0 while dynamic_errors is below max_dynamic_errors, 30 otherwise */ 
+ rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + GATHER, + mca_coll_han_colltype_to_str(GATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_gather_module); + } else if (NULL == sub_module->coll_gather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + GATHER, + mca_coll_han_colltype_to_str(GATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: the module found for the sub-" + "communicator cannot handle the GATHER operation. 
" + "Falling back to another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_gather_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_gather is valid and points to this function + * Call han topological collective algorithm (sub_module == module in this branch, so passing sub_module below is the same as passing module) + */ + mca_coll_base_module_gather_fn_t gather; + if(mca_coll_han_component.use_simple_algorithm[GATHER]) { + gather = mca_coll_han_gather_intra_simple; + } else { + gather = mca_coll_han_gather_intra; + } + + + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} + + +/* + * Reduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_reduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules. 
NOTE(review): the size_t product below is narrowed to the int msg_size - confirm the behavior for messages larger than INT_MAX */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(REDUCE, + msg_size, + 
comm, + han_module); + + /* First errors are always printed by rank 0: verbosity is 0 on rank 0 while dynamic_errors is below max_dynamic_errors, 30 otherwise */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + REDUCE, + mca_coll_han_colltype_to_str(REDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->previous_reduce_module); + } else if (NULL == sub_module->coll_reduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + REDUCE, + mca_coll_han_colltype_to_str(REDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: the module found for the sub-" + "communicator cannot handle the REDUCE operation. 
" + "Falling back to another component\n")); + return han_module->previous_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->previous_reduce_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fall back on the reproducible algorithm */ + if (mca_coll_han_component.han_reproducible) { + return mca_coll_han_reduce_reproducible(sbuf, rbuf, count, dtype, op, + root, comm, module); + } + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and points to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_reduce_fn_t reduce; + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } + return reduce(sbuf, rbuf, count, dtype, + op, root, comm, module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_reduce is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_reduce(sbuf, rbuf, count, dtype, + op, root, comm, sub_module); +} + + +/* + * Scatter selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration 
information for dynamic rules. NOTE(review): the size_t product below is narrowed to the int msg_size - confirm the behavior for messages larger than INT_MAX */ + ompi_datatype_type_size(rdtype, &dtype_size); + msg_size = dtype_size * rcount; + + sub_module = get_module(SCATTER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0: verbosity is 0 on rank 0 while dynamic_errors is below max_dynamic_errors, 30 otherwise */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + SCATTER, + mca_coll_han_colltype_to_str(SCATTER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_scatter_module); + } else if (NULL == sub_module->coll_scatter) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. 
" + "Please check dynamic file/mca parameters\n", + SCATTER, + mca_coll_han_colltype_to_str(SCATTER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: the module found for the sub-" + "communicator cannot handle the SCATTER operation. " + "Falling back to another component\n")); + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_scatter_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_scatter is valid and points to this function + * Call han topological collective algorithm (sub_module == module in this branch, so passing sub_module below is the same as passing module) + */ + mca_coll_base_module_scatter_fn_t scatter; + scatter = mca_coll_han_scatter_intra; + /* + * TODO: Uncomment when scatter simple is merged + * if(mca_coll_han_component.use_simple_algorithm[SCATTER]) { + * scatter = mca_coll_han_scatter_intra_simple; + * } else { + * scatter = mca_coll_han_scatter_intra; + * } + */ + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_scatter is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} + + diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h new file mode 100644 index 0000000000..979b292ba0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -0,0 +1,214 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_DYNAMIC_H +#define MCA_COLL_HAN_DYNAMIC_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/han/coll_han.h" + + +/* + * ################################################# + * # Dynamic rules global architecture description # + * ################################################# + * + * Han dynamic rules allow the user to define the collective + * module to call depending on the topological configuration of the + * sub-communicators and the collective parameters. This mechanism + * can also be used to make the main collective fall back on another module. + * The interface is described in coll_han_dynamic_file.h. + * + * ############################# + * # Collective module storage # + * ############################# + * To be able to switch between multiple collective modules, han + * directly accesses the module on the communicator. This information is + * stored in the collective structure of the communicator during the collective + * module choice at the communicator initialization. When han needs this + * information for the first time, it identifies the modules by their name and + * stores them in its module structure. + * Then, the modules are identified by their identifier. + * + * ######################### + * # Dynamic rules storage # + * ######################### + * There are two types of dynamic rules: + * - MCA parameter defined rules + * - File defined rules + * + * MCA parameter defined rules are stored in mca_coll_han_component.mca_rules. + * This is a double indexed table. The first index is the corresponding collective + * communication and the second index is the topological level aimed by the rule. 
+ * These parameters define the collective component to use for a specific + * collective communication on a specific topologic level. + * + * File defined rules are stored in mca_coll_han_component.dynamic_rules. + * These structures are defined below. The rule storage is directly deduced + * from the rule file format. + * + * File defined rules take precedence over MCA parameter defined rules. + * + * ####################### + * # Dynamic rules usage # + * ####################### + * To choose which collective module to use on a specific configuration, han + * adds an indirection on the collective call: dynamic choice functions. These + * functions do not implement any collective. First, they try to find a dynamic + * rule from file for the given collective. If there is not any rule for the + * given configuration, MCA parameter defined rules are used. Once the module + * to use is found, the correct collective implementation is called. + * + * This indirection is also used on the global communicator. This allows han + * to provide a fallback mechanism considering the collective parameters. + * + * ############################## + * # Dynamic rules choice logic # + * ############################## + * Dynamic rules choice is made with a stack logic. Each new rule precedes + * already defined rules. MCA parameters rules are the stack base. When + * a rule is needed, rules are read as a stack and the first corresponding + * encountered is chosen. 
+ * + * Consequences: + * - If a collective identifier appears multiple times, only the last + * will be considered + * - If a topological level appears multiple times for a collective, + * only the last will be considered + * - If configuration rules or message size rules are not stored + * by increasing value, some of them will not be considered + */ + +BEGIN_C_DECLS + +/* Dynamic rules support */ +typedef enum COMPONENTS { + SELF=0, + BASIC, + LIBNBC, + TUNED, + SM, + SHARED, + ADAPT, + HAN, + COMPONENTS_COUNT +} COMPONENT_T; + +static const char *components_name[]={"self", + "basic", + "libnbc", + "tuned", + "sm", + "shared", + "adapt", + "han"}; /* NOTE(review): entries are listed in the same order as the COMPONENT_T enumerators above; the array is defined static in this header, so each file including it gets its own copy */ + +/* Topologic levels */ +typedef enum TOPO_LVL { + INTRA_NODE=0, + INTER_NODE, + /* Identifies the global communicator as a topologic level */ + GLOBAL_COMMUNICATOR, + NB_TOPO_LVL +} TOPO_LVL_T; + +/* Rule for a specific msg size + * in a specific configuration + * for a specific collective + * in a specific topologic level */ +typedef struct msg_size_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + int configuration_size; + + /* Message size of the rule */ + int msg_size; + + /* Component to use on this specific configuration + * and message size */ + COMPONENT_T component; +} msg_size_rule_t; + +/* Rule for a specific configuration + * considering a specific collective + * in a specific topologic level */ +typedef struct configuration_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + + /* Number of elements of the actual topologic level + * per element of the upper topologic level */ + int configuration_size; + + /* Number of message size rules for this configuration */ + int nb_msg_size; + + /* Table of message size rules for this configuration */ + msg_size_rule_t *msg_size_rules; +} configuration_rule_t; + +/* Set 
of dynamic rules for a specific collective + * in a specific topologic level */ +typedef struct topologic_rule_s { + /* Collective identifier */ + COLLTYPE_T 
collective_id; + + /* Topologic level of the rule */ + TOPO_LVL_T topologic_level; + + /* Number of configuration rules in the table below */ + int nb_rules; + + /* Table of configuration rules + * for this collective on this topologic level */ + configuration_rule_t *configuration_rules; +} topologic_rule_t; + +/* Set of dynamic rules for a collective */ +typedef struct collective_rule_s { + COLLTYPE_T collective_id; + + /* Number of topologic levels for this collective */ + int nb_topologic_levels; + + /* Table of topologic level rules + * for this collective */ + topologic_rule_t *topologic_rules; +} collective_rule_t; + +/* Global dynamic rules structure */ +typedef struct mca_coll_han_dynamic_rule_s { + int nb_collectives; + collective_rule_t *collective_rules; +} mca_coll_han_dynamic_rules_t; + +/* Module storage */ +typedef struct collective_module_storage_s { + /* Module */ + mca_coll_base_module_t *module_handler; +} collective_module_storage_t; + +/* Table of module storage */ +typedef struct mca_coll_han_collective_modules_storage_s { + /* One storage slot per component (array sized by COMPONENTS_COUNT) */ + collective_module_storage_t modules[COMPONENTS_COUNT]; +} mca_coll_han_collective_modules_storage_t; + +/* Tests if a dynamic collective is implemented */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); + +END_C_DECLS +#endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c new file mode 100644 index 0000000000..d163071edc --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -0,0 +1,690 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved.
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_STDIO_H +#include +#endif + +#include "ompi_config.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" + +#include "ompi/mca/coll/base/coll_base_util.h" + +static void check_dynamic_rules(void); + +/* Current file line for verbose message */ +static int fileline = 1; +#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) + +int +mca_coll_han_init_dynamic_rules(void) +{ + /* File management */ + const char *fname; + FILE *fptr = NULL; + int nb_entries = 0; + + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + /* If the dynamic rules are not used, do not even read the file */ + if(!mca_coll_han_component.use_dynamic_file_rules) { + nb_coll = 0; + return OMPI_SUCCESS; + } + + fname = mca_coll_han_component.dynamic_rules_filename; + + if(NULL == fname) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "coll_han_use_dynamic_file_rules is true but " + "coll_han_dynamic_rules_filename is not set: " + "coll han will use dynamic rules from mca " + "parameters and their default value\n"); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + fptr = fopen(fname, "r"); + + if(NULL == fptr) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "cannot open dynamic file provided by " + 
"coll_han_dynamic_rules_filename=%s " + "please provide it with full path and " + "check file permissions. Rules from " + "MCA parameters will be used instead\n", + fname); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + /* The first information of the file is the collective count */ + nb_coll = getnext(fptr); + + if(nb_coll <= 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for collective count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_coll); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto file_reading_error; + } + + mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll; + + /* Allocate collective rules */ + coll_rules = malloc(nb_coll * sizeof(collective_rule_t)); + mca_coll_han_component.dynamic_rules.collective_rules = coll_rules; + if(NULL == coll_rules) { + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto cannot_allocate; + } + + /* Iterates on collective rules */ + for(i=0 ; i= COLLCOUNT) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "invalid collective id %d at line %d: the collective " + "must be at least %d and less than %d\n", + coll_id, + fileline, + ALLGATHER, + COLLCOUNT); + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "read collective id %d at line %d " + "but this collective is not implemented yet. 
" + "This is not an error but this set of rules " + "will not be used\n", + fname, + coll_id, + fileline); + } + + /* + * The first information of a collective rule + * is the number of topologic rules + */ + nb_topo = getnext(fptr); + if(nb_topo < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for topo level count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_topo); + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store the collective rule informations */ + coll_rules[i].collective_id = coll_id; + coll_rules[i].nb_topologic_levels = nb_topo; + + if(0 == nb_topo) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for topo level count\n", + fname, + fileline, + nb_topo); + continue; + } + + /* Allocate topologic rules */ + topo_rules = malloc(nb_topo * sizeof(topologic_rule_t)); + coll_rules[i].topologic_rules = topo_rules; + if(NULL == topo_rules) { + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterates on topologic rules */ + for(j=0 ; j= NB_TOPO_LVL) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid topo level %d is given " + "or the reader encountered an unexpected EOF. 
" + "Topologic level must be at least %d and " + "less than %d\n", + fname, + fileline, + topo_lvl, + INTRA_NODE, + NB_TOPO_LVL); + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* + * The first information of a topologic rule + * is the number of configurations + */ + nb_rules = getnext(fptr); + + if(nb_rules < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for rules count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_rules); + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store the topologic rule informations */ + topo_rules[j].collective_id = coll_id; + topo_rules[j].topologic_level = topo_lvl; + topo_rules[j].nb_rules = nb_rules; + + if(0 == nb_rules) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for configuration rules count\n", + fname, + fileline, + nb_rules); + continue; + } + + /* Allocate configuration rules */ + conf_rules = malloc(nb_rules * sizeof(configuration_rule_t)); + topo_rules[j].configuration_rules = conf_rules; + if(NULL == conf_rules) { + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterate on configuration rules */ + for(k=0 ; k 1)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "invalid configuration size %d at line %d " + "or the reader encountered an unexpected EOF " + "the 
configuration size must be at least %d " + "and the first configuration size " + "of a topologic level must be %d\n", + conf_size, + fileline, + 1, + 1); + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* + * The first information of a configuration rule + * is the number of message size rules + */ + nb_msg_size = getnext(fptr); + if(nb_msg_size < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for message size rules count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_msg_size); + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store configuration rule information */ + conf_rules[k].collective_id = coll_id; + conf_rules[k].topologic_level = topo_lvl; + conf_rules[k].configuration_size = conf_size; + conf_rules[k].nb_msg_size = nb_msg_size; + + if(0 == nb_msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for message size rules count\n", + fname, + fileline, + nb_msg_size); + continue; + } + + /* Allocate message size rules */ + msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t)); + conf_rules[k].msg_size_rules = msg_size_rules; + if(NULL == msg_size_rules) { + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterate on message size rules */ + for(l=0 ; l 1)) { 
+ opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for message size " + "or the reader encountered " + "an unexpected EOF. " + "The first message size rule of " + "a configuration must be 0\n", + fname, + fileline, + msg_size); + conf_rules[k].nb_msg_size = l+1; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Get the component identifier for this message size rule */ + component = getnext(fptr); + if(component < SELF || component >= COMPONENTS_COUNT) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid collective " + "component id %d is given or the " + "reader encountered an unexpected EOF. " + "Collective component id must be at " + "least %d and less than %d\n", + fname, + fileline, + component, + SELF, + COMPONENTS_COUNT); + conf_rules[k].nb_msg_size = l+1; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store message size rule informations */ + msg_size_rules[l].collective_id = coll_id; + msg_size_rules[l].topologic_level = topo_lvl; + msg_size_rules[l].configuration_size = conf_size; + msg_size_rules[l].msg_size = msg_size; + msg_size_rules[l].component = component; + + nb_entries++; + } + } + } + } + + if(MYEOF != getnext(fptr)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on file %s at line %d: " + "rule reading is over but reader does not seem " + "to have reached the end of the file\n", + fname, + fileline); + } + + opal_output_verbose(5, 
mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "read %d rules from %s\n", + nb_entries, + fname); + + if(mca_coll_han_component.dump_dynamic_rules) { + mca_coll_han_dump_dynamic_rules(); + } + + fclose(fptr); + + check_dynamic_rules(); + return OMPI_SUCCESS; + +cannot_allocate: + /* The dynamic rules allocation failed + * Free the already allocated rules and return a failure + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "cannot allocate dynamic rules\n"); + /* Do not check free_dynamic_rules + * because we are returning OMPI_ERROR anyway */ + mca_coll_han_free_dynamic_rules(); + return OMPI_ERROR; + +file_reading_error: + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "could not fully read dynamic rules file. " + "Will use mca parameters defined rules. " + "To see error detail, please set " + "collective verbosity level over 5\n"); + if(fptr) { + fclose (fptr); + } + mca_coll_han_free_dynamic_rules(); + return OMPI_SUCCESS; +} + +void +mca_coll_han_free_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k; + + /* Loop ranges */ + int nb_coll, nb_topo, nb_conf; + + /* Aliases */ + collective_rule_t *coll_rules; + topologic_rule_t *topo_rules; + configuration_rule_t *conf_rules; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i 0) { + free(conf_rules[k].msg_size_rules); + } + } + + if(nb_conf > 0) { + free(conf_rules); + } + } + + if(nb_topo > 0) { + free(topo_rules); + } + } + + if(nb_coll > 0) { + free(coll_rules); + } + + mca_coll_han_component.dynamic_rules.nb_collectives = 0; +} + +/* + * Try to find any logical issue in dynamic rules + */ +static void check_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + 
collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i=1 && conf_rules[k-1].configuration_size > conf_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d: " + "configuration sizes %d and %d are " + "not sorted by increasing value\n", + coll_id, + topo_lvl, + conf_rules[k-1].configuration_size, + conf_size); + } + + for(l=0 ; l=1 && msg_size_rules[l-1].msg_size > msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d " + "with configuration size %d: " + "message sizes %d and %d are " + "not sorted by increasing value\n", + coll_id, + topo_lvl, + conf_size, + msg_size_rules[l-1].msg_size, + msg_size); + } + + if(HAN == component + && GLOBAL_COMMUNICATOR != topo_lvl) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d " + "with configuration size %d " + "for message size %d: " + "han collective component %d " + "can only be activated for " + "topology level %d\n", + coll_id, + topo_lvl, + conf_size, + msg_size, + HAN, + GLOBAL_COMMUNICATOR); + } + } + } + } + } +} + +void mca_coll_han_dump_dynamic_rules(void) +{ + int nb_entries = 0; + + /* Loop counters */ + int i, j, k, l; + + /* 
Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i collective component %d (%s)\n", + nb_entries, + coll_id, + mca_coll_han_colltype_to_str(coll_id), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + conf_size, + msg_size, + component, + components_name[component]); + + nb_entries++; + } + } + } + } +} diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h new file mode 100644 index 0000000000..846b9b74cc --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -0,0 +1,111 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef MCA_COLL_HAN_DYNAMIC_FILE_H +#define MCA_COLL_HAN_DYNAMIC_FILE_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" + +/* + * ############################ + * # MCA parameters interface # + * ############################ + * An MCA parameter defined rule allows the user to choose which collective + * module will be used for a specific collective communication on a specific + * topological level. 
The standard name for these parameters is: + * [collective]_dynamic_[topologic_level]_module + * + * ####################### + * # Dynamic file format # + * ####################### + * File-defined rules take precedence over MCA-parameter-defined rules. + * To activate the file reader, the MCA parameter use_dynamic_file_rules must + * be set to true. The path to the dynamic file is given by the MCA + * parameter dynamic_rules_filename. If there is any issue reading the file, + * the file is considered invalid and only MCA-parameter-defined rules are + * used. If a potential logical issue is identified in the file, a + * warning is printed but the file is not considered invalid. + * + * The file is built recursively. + * A set of rules of a type is built as follows: + * Number of rules of the set + * Rule1 + * Rule2 + * ... + * + * A rule of level i is built as follows (excluding message size rules): + * Rule property + * Set of rules of level i+1 + * + * A message size rule is built as follows: + * Message_size Component + * + * Rule properties are (by increasing level): + * - Collective identifier: + * Defined in ompi/mca/coll/base/coll_base_functions.h. + * - Topologic level: + * Defined in coll_han_dynamic.h. It defines the communicator + * topology level. This is GLOBAL_COMMUNICATOR for the user + * communicator and the corresponding level for sub-communicators + * created by han. + * - Configuration size: + * The configuration size is the number of elements of the current + * topology level in the upper topology level. For example, if + * topology levels are intra-node and inter-node, it can be the + * number of MPI ranks per node or the number of nodes in the global + * communicator. For the GLOBAL_COMMUNICATOR topologic level, + * the configuration size is the communicator size. + * - Message_size Component: + * This is the message size, in bytes, of the message. 
Component is + * the component identifier to use for this collective on this + * communicator with this message size. Component identifiers are + * defined in coll_han_dynamic.h. + * + * Here is an example of a dynamic rules file: + * 2 # Collective count + * 7 # Collective identifier 1 (defined in ompi/mca/coll/base/coll_base_functions.h) + * 2 # Topologic level count + * 0 # Topologic level identifier 1 + * 1 # Configuration count + * 1 # Configuration size 1 + * 2 # Message size rules count + * 0 3 # Message size 1 and component identifier + * 128 1 # Message size 2 and component identifier + * 1 # Topologic level identifier 2 + * 1 # Configuration count + * 1 # Configuration size 1 + * 1 # Message size rules count + * 0 1 # Message size 1 and component identifier + * 3 # Collective identifier 2 + * # Set of topological rules + * + * Note that configuration size and message size rules define minimal + * values and each new rule takes precedence over the ones before it. This property + * implies that these types of rules must be sorted by increasing value. + * If they are not, some rules won't be used. + * + * The counts define a stack. If the count is set to x, the reader will + * attempt to read x rules of the corresponding type. If a set of rules + * has an invalid count, this is an error and it might not be detected by + * the reader. + */ + +BEGIN_C_DECLS + +int mca_coll_han_init_dynamic_rules(void); +void mca_coll_han_free_dynamic_rules(void); +void mca_coll_han_dump_dynamic_rules(void); + +END_C_DECLS +#endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 5188d2aca6..2cbd6d976c 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -53,29 +54,39 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, int mca_coll_han_gather_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { - int i, j; - int w_rank, w_size; + int i; + int w_rank, w_size; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + char *reorder_buf = NULL, *reorder_rbuf = NULL; + ptrdiff_t rsize, rgap = 0, rextent; + int *vranks, low_rank, low_size; + int * topo; + + ompi_request_t *temp_request = NULL; + w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; - ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; - int *vranks = han_module->cached_vranks; - int low_rank = ompi_comm_rank(low_comm); - int low_size = ompi_comm_size(low_comm); - int up_size = ompi_comm_size(up_comm); + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + topo = mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, han_module->previous_gather_module); + } - ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); @@ -88,27 +99,44 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, temp_request->req_status._cancelled = 0; temp_request->req_status._ucount = 0; - int root_low_rank; - int root_up_rank; - mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather root %d root_low_rank %d root_up_rank %d\n", w_rank, - root, root_low_rank, root_up_rank)); + /* create the subcommunicators */ + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + + /* Get the 'virtual ranks' mapping correspondong to the communicators */ + vranks = han_module->cached_vranks; + /* information about sub-communicators */ + low_rank = ompi_comm_rank(low_comm); + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); - char *reorder_buf = NULL; - char *reorder_rbuf = NULL; - ptrdiff_t rsize, rgap = 0, rextent; ompi_datatype_type_extent(rdtype, &rextent); - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* Allocate reorder buffers */ if (w_rank == root) { - /* If the processes are mapped-by core, no need to reorder */ + /* if the processes are mapped-by core, no need to 
reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns */ if (han_module->is_mapbycore) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather is_bycore: ", w_rank)); - reorder_rbuf = (char *) rbuf; + "[%d]: Han Gather is_bycore: ", w_rank)); + reorder_rbuf = (char *)rbuf; + } else { - rsize = opal_datatype_span(&rdtype->super, (int64_t) rcount * w_size, &rgap); - reorder_buf = (char *) malloc(rsize); //TODO:free + /* Need a buffer to store unordered final result */ + rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); //TODO:free + /* rgap is the size of unused space at the start of the datatype */ reorder_rbuf = reorder_buf - rgap; } } @@ -128,27 +156,29 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); - /* Reorder rbuf based on rank. - * Suppose, message is 0 1 2 3 4 5 6 7, - * and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7), - * so the message needs to be reordered to 0 2 4 6 1 3 5 7 + /* Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. 
In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] */ + /* reorder rbuf based on rank */ if (w_rank == root && !han_module->is_mapbycore) { - for (i = 0; i < up_size; i++) { - for (j = 0; j < low_size; j++) { + for (i=0; iw_rank)); OBJ_RELEASE(t->cur_task); @@ -168,16 +198,29 @@ int mca_coll_han_gather_lg_task(void *task_argu) char *tmp_buf = NULL; char *tmp_rbuf = NULL; if (!t->noop) { + /* if the process is one of the node leader, allocate the intermediary + * buffer to gather on the low sub communicator */ int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + rsize = opal_datatype_span(&t->rdtype->super, + (int64_t)t->rcount * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; } - /* Shared memory gather */ - t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, - t->rdtype, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_gather_module); + + /* shared memory node gather */ + t->low_comm->c_coll->coll_gather((char *)t->sbuf, + t->scount, + t->sdtype, + tmp_rbuf, + t->rcount, + t->rdtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); + + /* Prepare up comm gather */ t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; @@ -201,24 +244,192 @@ int mca_coll_han_gather_ug_task(void *task_argu) if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug noop\n", t->w_rank)); + "[%d] Han Gather: ug noop\n", t->w_rank)); } else { int low_size = ompi_comm_size(t->low_comm); - /* Inter node gather */ - t->up_comm->c_coll->coll_gather((char *) t->sbuf, t->scount * low_size, t->sdtype, - (char *) t->rbuf, t->rcount * low_size, t->rdtype, - t->root_up_rank, t->up_comm, - t->up_comm->c_coll->coll_gather_module); + /* inter node gather */ 
+ t->up_comm->c_coll->coll_gather((char *)t->sbuf, + t->scount*low_size, + t->sdtype, + (char *)t->rbuf, + t->rcount*low_size, + t->rdtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); if (t->sbuf_inter_free != NULL) { free(t->sbuf_inter_free); t->sbuf_inter_free = NULL; } OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug gather finish\n", t->w_rank)); + "[%d] Han Gather: ug gather finish\n", t->w_rank)); } ompi_request_t *temp_req = t->req; free(t); ompi_request_complete(temp_req, 1); return OMPI_SUCCESS; } + +/* Only works in the regular situation (each node has an equal number of processes) */ +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* Here root needs to reach all nodes on up_comm. + * But in case of unbalance some up_comms are smaller, + * as the comm_split is made on the base of low_rank */ + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, han_module->previous_gather_module); + } + + /* create the subcommunicators */ + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + int *vranks = han_module->cached_vranks; + /* information about sub-communicators */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + /* allocate buffer to store unordered result on root + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in an increasing order for both patterns */ + char *reorder_buf = NULL; // allocated memory + char *reorder_buf_start = NULL; // start of the data + if (w_rank == root) { + if (han_module->is_mapbycore) { + reorder_buf_start = (char *)rbuf; + } else { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather needs reordering: ", w_rank)); + ptrdiff_t rgap = 0; + ptrdiff_t rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); + /* rgap is the size of unused space at the start of the datatype */ + reorder_buf_start = reorder_buf - rgap; + } + + } + + /* allocate the intermediary buffer + * to gather on leaders of the low sub-communicator */ + char *tmp_buf = NULL; // allocated memory + char *tmp_buf_start = NULL; // start of the data + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&rdtype->super, + 
(int64_t)rcount * low_size, + &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + } + + /* 1. low gather on node leaders */ + low_comm->c_coll->coll_gather((char *)sbuf, + scount, + sdtype, + tmp_buf_start, + rcount, + rdtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); + + /* 2. upper gather (inter-node) between node leaders */ + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_gather((char *)tmp_buf_start, + scount*low_size, + sdtype, + (char *)reorder_buf_start, + rcount*low_size, + rdtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Future Gather: ug gather finish\n", w_rank)); + } + + /* 3. reorder data on root into rbuf + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (w_rank == root && !han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + } + + return OMPI_SUCCESS; +} + +/* Reorder after gather operation, for unordered ranks + * + * Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. 
In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] + */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo) { + int i; + int topolevel = 2; // always 2 levels in topo + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + for (i=0; i -#include -#ifdef HAVE_SCHED_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif /* HAVE_SYS_MMAN_H */ -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ - #include "mpi.h" -#include "opal_stdint.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/util/os_path.h" - -#include "ompi/communicator/communicator.h" -#include "ompi/group/group.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/coll/base/base.h" -#include "ompi/proc/proc.h" #include "coll_han.h" - -#include "ompi/mca/coll/base/coll_tags.h" -#include "ompi/mca/pml/pml.h" -#include -#include +#include "coll_han_dynamic.h" /* * Local functions */ -static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); +static int han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); static int mca_coll_han_module_disable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); /* * Module constructor */ +static void han_module_clear(mca_coll_han_module_t *han_module) +{ + int i; + + for (i = 0; i < COLLCOUNT; i++) { + /* + * Since the previous routines function pointers are declared as + * a union, initializing the dummy routineis enough + */ + han_module->previous_routines[i].previous_routine.dummy = NULL; + han_module->previous_routines[i].previous_module = NULL; + } + han_module->reproducible_reduce = NULL; + han_module->reproducible_reduce_module = NULL; + 
han_module->reproducible_allreduce = NULL; + han_module->reproducible_allreduce_module = NULL; +} + static void mca_coll_han_module_construct(mca_coll_han_module_t * module) { + int i; + module->enabled = false; module->super.coll_module_disable = mca_coll_han_module_disable; module->cached_comm = NULL; @@ -62,27 +58,47 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) module->cached_vranks = NULL; module->cached_topo = NULL; module->is_mapbycore = false; + module->storage_initialized = false; + for (i = 0 ; i < NB_TOPO_LVL ; i++) { + module->sub_comm[i] = NULL; + } + for (i=SELF ; imodules_storage.modules[i].module_handler = NULL; + } + + module->dynamic_errors = 0; + + han_module_clear(module); } + +#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ + if (NULL != (obj)) { \ + OBJ_RELEASE(obj); \ + } \ +} while (0) + /* * Module destructor */ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) { + int i; + module->enabled = false; if (module->cached_low_comms != NULL) { - ompi_comm_free(&(module->cached_low_comms[0])); - ompi_comm_free(&(module->cached_low_comms[1])); - module->cached_low_comms[0] = NULL; - module->cached_low_comms[1] = NULL; + for (i = 0; i < COLL_HAN_LOW_MODULES; i++) { + ompi_comm_free(&(module->cached_low_comms[i])); + module->cached_low_comms[i] = NULL; + } free(module->cached_low_comms); module->cached_low_comms = NULL; } if (module->cached_up_comms != NULL) { - ompi_comm_free(&(module->cached_up_comms[0])); - ompi_comm_free(&(module->cached_up_comms[1])); - module->cached_up_comms[0] = NULL; - module->cached_up_comms[1] = NULL; + for (i = 0; i < COLL_HAN_UP_MODULES; i++) { + ompi_comm_free(&(module->cached_up_comms[i])); + module->cached_up_comms[i] = NULL; + } free(module->cached_up_comms); module->cached_up_comms = NULL; } @@ -94,21 +110,27 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) free(module->cached_topo); module->cached_topo = NULL; } -} + for(i=0 ; 
isub_comm[i]) { + ompi_comm_free(&(module->sub_comm[i])); + } + } -/* - * Module disable - */ -static int mca_coll_han_module_disable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) -{ - return OMPI_SUCCESS; + OBJ_RELEASE_IF_NOT_NULL(module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_scatter_module); + + han_module_clear(module); } OBJ_CLASS_INSTANCE(mca_coll_han_module_t, mca_coll_base_module_t, - mca_coll_han_module_construct, mca_coll_han_module_destruct); + mca_coll_han_module_construct, + mca_coll_han_module_destruct); /* * Initial query function that is invoked during MPI_INIT, allowing @@ -116,7 +138,8 @@ OBJ_CLASS_INSTANCE(mca_coll_han_module_t, * required level of thread support. This function is invoked exactly * once. */ -int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads) +int mca_coll_han_init_query(bool enable_progress_threads, + bool enable_mpi_threads) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:init_query: pick me! pick me!"); @@ -129,16 +152,23 @@ int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_thread * Look at the communicator and decide which set of functions and * priority we want to return. 
*/ -mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) +mca_coll_base_module_t * +mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) { mca_coll_han_module_t *han_module; - /* If we're intercomm, or if there's only one process in the - communicator */ - if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) - || !ompi_group_have_remote_peers(comm->c_local_group)) { + /* + * If we're intercomm, or if there's only one process in the communicator + */ + if (OMPI_COMM_IS_INTER(comm)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "coll:han:comm_query (%d/%s): intercomm, comm is too small, only on one node; disqualifying myself", + "coll:han:comm_query (%d/%s): intercomm; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + if (1 == ompi_comm_size(comm)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm is too small; disqualifying myself", comm->c_contextid, comm->c_name); return NULL; } @@ -159,24 +189,53 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com } /* All is good -- return a module */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = NULL; //mca_coll_han_allgather_intra; - han_module->super.coll_allgatherv = NULL; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra; - han_module->super.coll_reduce_scatter = NULL; - 
han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra; - han_module->super.coll_scatterv = NULL; + han_module->topologic_level = mca_coll_han_component.topo_level; + + /* + * TODO: When the selector is fully implemented, + * this if will be meaningless + */ + if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { + /* We are on the global communicator, return topological algorithms */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + han_module->super.coll_allgatherv = NULL; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_scatterv = NULL; + } else { + /* We are on a topologic sub-communicator, return only the selector */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_bcast = 
mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_scatterv = NULL; + } opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:comm_query (%d/%s): pick me! pick me!", @@ -185,14 +244,71 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com } +/* + * In this macro, the following variables are supposed to have been declared + * in the caller: + * . ompi_communicator_t *comm + * . mca_coll_han_module_t *han_module + */ +#define HAN_SAVE_PREV_COLL_API(__api) do { \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + return OMPI_ERROR; \ + } \ + /* TODO add a OBJ_RELEASE at module disabling */ \ + /* + FIXME find why releasing generates memory corruption */ \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + } while(0) + /* * Init module on the communicator */ -static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +static int han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { + mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; + + HAN_SAVE_PREV_COLL_API(allgather); + HAN_SAVE_PREV_COLL_API(allgatherv); + HAN_SAVE_PREV_COLL_API(allreduce); + HAN_SAVE_PREV_COLL_API(bcast); + 
HAN_SAVE_PREV_COLL_API(gather); + HAN_SAVE_PREV_COLL_API(reduce); + HAN_SAVE_PREV_COLL_API(scatter); + + /* set reproducible algos */ + mca_coll_han_reduce_reproducible_decision(comm, module); + mca_coll_han_allreduce_reproducible_decision(comm, module); + return OMPI_SUCCESS; } +/* + * Module disable + */ +static int mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; + + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + han_module_clear(han_module); + + return OMPI_SUCCESS; +} + + /* * Free the han request */ @@ -203,266 +319,3 @@ int han_request_free(ompi_request_t ** request) *request = MPI_REQUEST_NULL; return OMPI_SUCCESS; } - -/* Create the communicators used in the HAN module */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module) -{ - /* Use cached communicators if possible */ - if (han_module->cached_comm == comm && han_module->cached_low_comms != NULL - && han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) { - return; - } - /* Create communicators if there is no cached communicator */ - else { - int low_rank, low_size; - int up_rank; - int w_rank = ompi_comm_rank(comm); - int w_size = ompi_comm_size(comm); - ompi_communicator_t **low_comms = - (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); - ompi_communicator_t **up_comms = - (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); - /* Create low_comms which 
contain all the process on a node */ - const int *origin_priority = NULL; - /* Lower the priority of HAN module */ - int han_var_id; - int tmp_han_priority = 0; - int tmp_han_origin = 0; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - tmp_han_origin = *origin_priority; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, - NULL); - comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; - comm->c_coll->coll_allgather = ompi_coll_base_allgather_intra_bruck; - - int var_id; - int tmp_priority = 100; - int tmp_origin = 0; - /* Set up low_comms[0] with sm module */ - mca_base_var_find_by_name("coll_sm_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] sm_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), - &(low_comms[0])); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - low_size = ompi_comm_size(low_comms[0]); - low_rank = ompi_comm_rank(low_comms[0]); - - /* Set up low_comms[1] with solo module */ - mca_base_var_find_by_name("coll_solo_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] solo_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, 
sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), - &(low_comms[1])); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Create up_comms[0] with libnbc which contain one process per node (across nodes) */ - mca_base_var_find_by_name("coll_libnbc_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] libnbc_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split(comm, low_rank, w_rank, &(up_comms[0]), false); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - up_rank = ompi_comm_rank(up_comms[0]); - - /* Create up_comms[1] with adapt which contain one process per node (across nodes) */ - mca_base_var_find_by_name("coll_adapt_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] adapt_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split(comm, low_rank, w_rank, &(up_comms[1]), false); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - - int *vranks = malloc(sizeof(int) * w_size); - /* Do allgather to gather vrank from each process so every process knows other processes' vrank */ - int vrank = low_size * up_rank + low_rank; - ompi_coll_base_allgather_intra_bruck(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, - 
comm->c_coll->coll_allgather_module); - han_module->cached_comm = comm; - han_module->cached_low_comms = low_comms; - han_module->cached_up_comms = up_comms; - han_module->cached_vranks = vranks; - - mca_base_var_set_value(han_var_id, &tmp_han_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, - NULL); - comm->c_coll->coll_allreduce = mca_coll_han_allreduce_intra; - comm->c_coll->coll_allgather = mca_coll_han_allgather_intra; - } -} - -int mca_coll_han_pow10_int(int pow_value) -{ - int i, result = 1; - for (i = 0; i < pow_value; i++) { - result *= 10; - } - return result; -} - -int mca_coll_han_hostname_to_number(char *hostname, int size) -{ - int i = 0, j = 0; - char *number_array = (char *) malloc(sizeof(char) * size); - while (hostname[i] != '\0') { - if (hostname[i] >= '0' && hostname[i] <= '9') { - number_array[j++] = hostname[i]; - } - i++; - } - int number = 0; - for (i = 0; i < j; i++) { - number += (number_array[i] - '0') * mca_coll_han_pow10_int(j - 1 - i); - } - free(number_array); - return number; -} - -void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level) -{ - int *self_topo = (int *) malloc(sizeof(int) * num_topo_level); - /* Set daemon vpid */ - char hostname[1024]; - gethostname(hostname, 1024); - self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); - /* Set core id */ - self_topo[1] = ompi_comm_rank(comm); - - /* Allgather all the topology information */ - ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, topo, num_topo_level, - MPI_INT, comm, comm->c_coll->coll_allgather_module); - free(self_topo); - return; -} - -void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level) -{ - if (level > num_topo_level - 1 || start >= end) { - return; - } - int i, j; - int min = INT_MAX; - int min_loc = -1; - for (i = start; i <= end; i++) { - /* Find min */ - for (j = i; j <= end; j++) { - if (topo[j * num_topo_level + level] < min) { - min = topo[j * 
num_topo_level + level]; - min_loc = j; - - } - } - /* Swap i and min_loc */ - int temp; - for (j = 0; j < num_topo_level; j++) { - temp = topo[i * num_topo_level + j]; - topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; - topo[min_loc * num_topo_level + j] = temp; - } - min = INT_MAX; - min_loc = -1; - } - int last = 0; - int new_start = 0; - int new_end = 0; - for (i = start; i <= end; i++) { - if (i == start) { - last = topo[i * num_topo_level + level]; - new_start = start; - } else if (i == end) { - new_end = end; - mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); - } else if (last != topo[i * num_topo_level + level]) { - new_end = i - 1; - mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); - new_start = i; - last = topo[i * num_topo_level + level]; - } - } - return; -} - -/* Check if the current processes are mapped by core */ -bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t * comm, - int num_topo_level) -{ - int i; - int size = ompi_comm_size(comm); - for (i = 1; i < size; i++) { - if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level] - || topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { - return false; - - } - } - return true; -} - -int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, - int num_topo_level) -{ - int size; - size = ompi_comm_size(comm); - int *topo; - if ((han_module->cached_topo != NULL) && (han_module->cached_comm == comm)) { - topo = han_module->cached_topo; - } - else { - if (han_module->cached_topo != NULL) { - free(han_module->cached_topo); - han_module->cached_topo = NULL; - } - topo = (int *) malloc(sizeof(int) * size * num_topo_level); - /* Get topo infomation */ - mca_coll_han_topo_get(topo, comm, num_topo_level); - mca_coll_han_topo_print(topo, comm, num_topo_level); - - /* Check if the processes are mapped by core */ - han_module->is_mapbycore = 
mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); - /* Sort the topo such that each group is contiguous */ - if (!han_module->is_mapbycore) { - mca_coll_han_topo_sort(topo, 0, size - 1, size, 0, num_topo_level); - } - han_module->cached_topo = topo; - han_module->cached_comm = comm; - } - - mca_coll_han_topo_print(topo, comm, num_topo_level); - return topo; -} - -/* Print out the topology info, for debugging purpose */ -void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - if (rank == 0) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: HAN topo: ", rank)); - int i; - for (i = 0; i < size * num_topo_level; i++) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); - } - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); - - } -} diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index f6137a8cd0..d0dc337ce8 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -69,8 +70,24 @@ mca_coll_han_reduce_intra(const void *sbuf, size_t typelng; ompi_datatype_type_size(dtype, &typelng); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + /* Do not initialize topology if the operation cannot commute */ + if(!ompi_op_is_commute(op)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. 
It needs to fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -133,6 +150,11 @@ mca_coll_han_reduce_intra(const void *sbuf, free(t); return OMPI_SUCCESS; + +prev_reduce_intra: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + han_module->previous_reduce_module); } /* t0 task: issue and wait for the low level reduce of segment 0 */ @@ -189,4 +211,178 @@ int mca_coll_han_reduce_t1_task(void *task_argu) { } return OMPI_SUCCESS; -} \ No newline at end of file +} + +/* In case of non regular situation (imbalanced number of processes per nodes), + * a fallback is made on the next component that provides a reduce in priority order */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + int ret; + int *vranks, low_rank, low_size; + ptrdiff_t rsize, rgap = 0; + void * tmp_buf; + + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* Do not initialize topology if the operation cannot commute */ + if(!ompi_op_is_commute(op)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. 
It needs to fall back on another component\n")); + goto prev_reduce_intra_simple; + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); + goto prev_reduce_intra_simple; + } + + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + vranks = han_module->cached_vranks; + w_rank = ompi_comm_rank(comm); + low_rank = ompi_comm_rank(low_comm); + + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + if (root_low_rank == low_rank && w_rank != root) { + rsize = opal_datatype_span(&dtype->super, (int64_t)count, &rgap); + tmp_buf = malloc(rsize); + if (NULL == tmp_buf) { + return OMPI_ERROR; + } + } else { + /* global root rbuf is valid, local non-root do not need buffers */ + tmp_buf = rbuf; + } + /* No need to handle MPI_IN_PLACE: only the global root may ask for it and + * it is ok to use it for intermediary reduces since it is also a local root*/ + + /* Low_comm reduce */ + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)tmp_buf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + if (root_low_rank == low_rank && w_rank != root){ + free(tmp_buf); + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: low comm reduce failed. 
" + "Falling back to another component\n")); + goto prev_reduce_intra_simple; + } + + /* Up_comm reduce */ + if (root_low_rank == low_rank ){ + if(w_rank != root){ + ret = up_comm->c_coll->coll_reduce((char *)tmp_buf, NULL, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + free(tmp_buf); + } else { + /* Take advantage of any optimisation made for IN_PLACE + * communcations */ + ret = up_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)tmp_buf, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: low comm reduce failed.\n")); + return ret; + } + + } + return OMPI_SUCCESS; + +prev_reduce_intra_simple: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + han_module->previous_reduce_module); +} + + +/* Find a fallback on reproducible algorithm + * use tuned or basic or if impossible whatever available + */ +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules */ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i=0; imodules_storage + .modules[fallback] + .module_handler; + if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible: " + "fallback on %s\n", + components_name[fallback]); + } + han_module->reproducible_reduce_module = fallback_module; + han_module->reproducible_reduce = fallback_module->coll_reduce; + return OMPI_SUCCESS; + } + } + /* 
fallback of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_reduce_module = + han_module->previous_reduce_module; + han_module->reproducible_reduce = han_module->previous_reduce; + return OMPI_SUCCESS; +} + + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + return han_module->reproducible_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->reproducible_reduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index 90d92659e1..b2a8752938 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -66,13 +66,23 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator. 
It needs to fall back on another component\n")); + goto prev_scatter_intra; + } + + /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -93,6 +103,8 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, int root_low_rank; int root_up_rank; + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, @@ -105,7 +117,6 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, */ char *reorder_buf = NULL; char *reorder_sbuf = NULL; - int *topo = mca_coll_han_topo_init(comm, han_module, 2); if (w_rank == root) { /* If the processes are mapped-by core, no need to reorder */ @@ -154,6 +165,11 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; +prev_scatter_intra: + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module->previous_scatter_module); } /* us: upper level (intra-node) scatter task */ diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c new file mode 100644 index 0000000000..e99f3e614b --- /dev/null +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" + + +/* + * Local functions + */ +static void create_intranode_comm_new(ompi_communicator_t *, + ompi_communicator_t **); +static void create_internode_comm_new(ompi_communicator_t *, + int, int, + ompi_communicator_t **); +static void create_intranode_comm(ompi_communicator_t *, + const char *, + int, + ompi_communicator_t **); +static void create_internode_comm(ompi_communicator_t *, + const char *, + int, int, + ompi_communicator_t **); + +/** + * Create a sub-communicator containing the ranks that share my node. + * + * @param comm (IN) original communicator for the collective + * target module priority + * @param sub_comm (OUT) created sub-communicator + */ +static void create_intranode_comm_new(ompi_communicator_t *comm, + ompi_communicator_t **sub_comm) +{ + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *)(&ompi_mpi_info_null), sub_comm); + return; +} + +/** + * Create a sub-communicator containing one rank per node. 
+ * + * @param comm (IN) original communicator for the collective + * @param my_rank (IN) my rank in comm + * @param intra_rank (IN) local rank in the intra-node sub-communicator + * @param sub_comm (OUT) created sub-communicator + */ +static void create_internode_comm_new(ompi_communicator_t *comm, + int my_rank, + int intra_rank, + ompi_communicator_t **sub_comm) +{ + ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); + return; +} + +/* + * Routine that creates the local hierarchical sub-communicators + * Called each time a collective is called. + * comm: input communicator of the collective + */ +void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size; + int up_rank; + int w_rank; + int w_size; + ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); + ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); + const int *origin_priority; + int han_var_id; + int tmp_han_priority; + int vrank, *vranks; + + mca_coll_base_module_allreduce_fn_t old_allreduce; + mca_coll_base_module_t *old_allreduce_module; + + mca_coll_base_module_allgather_fn_t old_allgather; + mca_coll_base_module_t *old_allgather_module; + + mca_coll_base_module_bcast_fn_t old_bcast; + mca_coll_base_module_t *old_bcast_module; + + mca_coll_base_module_gather_fn_t old_gather; + mca_coll_base_module_t *old_gather_module; + + mca_coll_base_module_reduce_fn_t old_reduce; + mca_coll_base_module_t *old_reduce_module; + + /* The sub communicators have already been created */ + if (NULL != han_module->sub_comm[INTRA_NODE] + && NULL != han_module->sub_comm[INTER_NODE] + && NULL != han_module->cached_vranks) { + return; + } + + /* + * We cannot use han allreduce and allgather without sub-communicators + * Temporary set previous ones + * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the 
allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + old_allreduce = comm->c_coll->coll_allreduce; + old_allreduce_module = comm->c_coll->coll_allreduce_module; + + old_allgather = comm->c_coll->coll_allgather; + old_allgather_module = comm->c_coll->coll_allgather_module; + + old_reduce = comm->c_coll->coll_reduce; + old_reduce_module = comm->c_coll->coll_reduce_module; + + old_bcast = comm->c_coll->coll_bcast; + old_bcast_module = comm->c_coll->coll_bcast_module; + + old_gather = comm->c_coll->coll_gather; + old_gather_module = comm->c_coll->coll_gather_module; + + comm->c_coll->coll_allreduce = han_module->previous_allreduce; + comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; + + comm->c_coll->coll_allgather = han_module->previous_allgather; + comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + + comm->c_coll->coll_reduce = han_module->previous_reduce; + comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; + + comm->c_coll->coll_bcast = han_module->previous_bcast; + comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; + + comm->c_coll->coll_gather = han_module->previous_gather; + comm->c_coll->coll_gather_module = han_module->previous_gather_module; + + /* Create topological sub-communicators */ + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + origin_priority = NULL; + mca_base_var_find_by_name("coll_han_priority", &han_var_id); + mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); + + /* + * Maximum priority for selector on sub-communicators + */ + tmp_han_priority = 100; + mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * This sub-communicator contains the ranks that share my node. 
+ */ + mca_coll_han_component.topo_level = INTRA_NODE; + create_intranode_comm_new(comm, low_comm); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(*low_comm); + low_rank = ompi_comm_rank(*low_comm); + + /* + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + mca_coll_han_component.topo_level = INTER_NODE; + create_internode_comm_new(comm, w_rank, low_rank, up_comm); + + up_rank = ompi_comm_rank(*up_comm); + + /* + * Set my virtual rank number. + * my rank # = (intra-node comm size) * (inter-node rank) + * + (intra-node rank) + * WARNING: this formula works only if the ranks are perfectly spread over + * the nodes + * TODO: find a better way of doing this + */ + vrank = low_size * up_rank + low_rank; + vranks = (int *)malloc(sizeof(int) * w_size); + /* + * gather vrank from each process so every process will know other processes + * vrank + */ + comm->c_coll->coll_allgather(&vrank, + 1, + MPI_INT, + vranks, + 1, + MPI_INT, + comm, + comm->c_coll->coll_allgather_module); + + /* + * Set the cached info + */ + han_module->cached_vranks = vranks; + + /* + * Come back to the original han module priority + */ + mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* Put allreduce, allgather, reduce and bcast back */ + comm->c_coll->coll_allreduce = old_allreduce; + comm->c_coll->coll_allreduce_module = old_allreduce_module; + + comm->c_coll->coll_allgather = old_allgather; + comm->c_coll->coll_allgather_module = old_allgather_module; + + comm->c_coll->coll_reduce = old_reduce; + comm->c_coll->coll_reduce_module = old_reduce_module; + + comm->c_coll->coll_bcast = old_bcast; + comm->c_coll->coll_bcast_module = old_bcast_module; + + comm->c_coll->coll_gather = old_gather; + comm->c_coll->coll_gather_module = old_gather_module; + + mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; +} + +/** + * Create a sub-communicator containing the ranks that share my node.
+ * Associate this sub-communicator a given collective module. + * module can be one of: + * . sm + * . shared + * + * @param comm (IN) original communicator for the collective + * @param prio_string (IN) string containing the mca variable associated to + * target module priority + * @param my_rank (IN) my rank in comm + * @param sub_comm (OUT) created sub-communicator + */ +static void create_intranode_comm(ompi_communicator_t *comm, + const char *prio_string, + int my_rank, + ompi_communicator_t **sub_comm) +{ + int var_id; + const int *sav_priority; + int tmp_priority = 100; + + /* + * Upgrade the target module priority to make the resulting sub-communicator + * use that collective module + */ + mca_base_var_find_by_name(prio_string, &var_id); + mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] %s origin %d\n", + my_rank, prio_string, *sav_priority)); + + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + /* + * Create the sub-communicator + * Since the target priority has been set to the highest value, this + * sub-communicator will inherit it as a collective module. + */ + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *)(&ompi_mpi_info_null), sub_comm); + /* + * Come back to the target module's original priority + */ + mca_base_var_set_value(var_id, sav_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + return; +} + +/** + * Create a sub-communicator containing one rank per node. + * Associate this sub-communicator a given collective module. + * module can be one of: + * . libnbc + * . 
adapt + * + * @param comm (IN) original communicator for the collective + * @param prio_string (IN) string containing the mca variable associated to + * target module priority + * @param my_rank (IN) my rank in comm + * @param intra_rank (IN) local rank in the intra-node sub-communicator + * @param sub_comm (OUT) created sub-communicator + */ +static void create_internode_comm(ompi_communicator_t *comm, + const char *prio_string, + int my_rank, + int intra_rank, + ompi_communicator_t **sub_comm) +{ + int var_id; + const int *sav_priority; + int tmp_priority = 100; + + /* + * Upgrade the target module priority to make the resulting sub-communicator + * use that collective module + */ + mca_base_var_find_by_name(prio_string, &var_id); + mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] %s origin %d\n", my_rank, prio_string, + *sav_priority)); + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * Create the sub-communicator + * Since the target priority has been set to the highest value, this + * sub-communicator will inherit it as a collective module. + */ + ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); + mca_base_var_set_value(var_id, sav_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + return; +} + + +/* + * Routine that creates the local hierarchical sub-communicators + * Called each time a collective is called. 
+ * comm: input communicator of the collective + */ +void mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size; + int up_rank; + int w_rank; + int w_size; + ompi_communicator_t **low_comms; + ompi_communicator_t **up_comms; + const int *origin_priority; + int han_var_id; + int tmp_han_priority; + int vrank, *vranks; + + mca_coll_base_module_allreduce_fn_t old_allreduce; + mca_coll_base_module_t *old_allreduce_module; + mca_coll_base_module_allgather_fn_t old_allgather; + mca_coll_base_module_t *old_allgather_module; + + /* use cached communicators if possible */ + if (han_module->cached_comm == comm && + han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { + return; + } + + /* We cannot use han allreduce and allgather without sub-communicators + * Temporary set previous ones */ + old_allreduce = comm->c_coll->coll_allreduce; + old_allreduce_module = comm->c_coll->coll_allreduce_module; + + old_allgather = comm->c_coll->coll_allgather; + old_allgather_module = comm->c_coll->coll_allgather_module; + + comm->c_coll->coll_allreduce = han_module->previous_allreduce; + comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; + + comm->c_coll->coll_allgather = han_module->previous_allgather; + comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + + /* create communicators if there is no cached communicator */ + + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * + sizeof(struct ompi_communicator_t *)); + up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * + sizeof(struct ompi_communicator_t *)); + origin_priority = NULL; + mca_base_var_find_by_name("coll_han_priority", &han_var_id); + mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); + + /* + * Lower down our current 
priority + */ + tmp_han_priority = 0; + mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * Upgrade sm module priority to set up low_comms[0] with sm module + * This sub-communicator contains the ranks that share my node. + */ + create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(low_comms[0]); + low_rank = ompi_comm_rank(low_comms[0]); + + /* + * Upgrade shared module priority to set up low_comms[1] with shared module + * This sub-communicator contains the ranks that share my node. + */ + create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); + + /* + * Upgrade libnbc module priority to set up up_comms[0] with libnbc module + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, + &(up_comms[0])); + + up_rank = ompi_comm_rank(up_comms[0]); + + /* + * Upgrade adapt module priority to set up up_comms[0] with adapt module + * This sub-communicator contains one process per node. + */ + create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, + &(up_comms[1])); + + /* + * Set my virtual rank number. 
+ * my rank # = * + * + + * WARNING: this formula works only if the ranks are perfectly spread over + * the nodes + * TODO: find a better way of doing + */ + vrank = low_size * up_rank + low_rank; + vranks = (int *)malloc(sizeof(int) * w_size); + /* + * gather vrank from each process so every process will know other processes + * vrank + */ + comm->c_coll->coll_allgather(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + + /* + * Set the cached info + */ + han_module->cached_comm = comm; + han_module->cached_low_comms = low_comms; + han_module->cached_up_comms = up_comms; + han_module->cached_vranks = vranks; + + /* + * Come back to the original han module priority + */ + mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* Put allreduce and allgather back */ + comm->c_coll->coll_allreduce = old_allreduce; + comm->c_coll->coll_allreduce_module = old_allreduce_module; + + comm->c_coll->coll_allgather = old_allgather; + comm->c_coll->coll_allgather_module = old_allgather_module; +} + + diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c new file mode 100644 index 0000000000..cbcfd698d0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). 
/*
 * takes the number part of a host: hhh2031 -->2031
 *
 * Every decimal digit found in hostname is used, in order of appearance,
 * so "node12-rack3" yields 123.  A name without any digit yields 0.
 *
 * @param hostname (IN) host name; at most size bytes are examined, the scan
 *                      stops earlier at the terminating '\0'
 * @param size (IN)     size in bytes of the hostname buffer
 *
 * @return the number built from the digits, 0 if hostname is NULL or holds
 *         no digit, INT_MAX if the digits do not fit in an int
 */
static int mca_coll_han_hostname_to_number(char* hostname, int size)
{
    int number = 0;
    int i;

    if (NULL == hostname) {
        return 0;
    }

    /* Bounded by size: the buffer is not required to be NUL-terminated */
    for (i = 0; i < size && '\0' != hostname[i]; i++) {
        if ('0' <= hostname[i] && '9' >= hostname[i]) {
            int digit = hostname[i] - '0';

            /* Saturate instead of overflowing a signed int */
            if (number > (INT_MAX - digit) / 10) {
                return INT_MAX;
            }
            number = number * 10 + digit;
        }
    }
    return number;
}
the rank in the main communicator + * Gather the virtual topoid from each process so every process will know other + * processes virtual topids + */ +static void mca_coll_han_topo_get(int *topo, + struct ompi_communicator_t* comm, + int num_topo_level) +{ + int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); + char hostname[1024]; + + gethostname(hostname, 1024); + self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); + self_topo[1] = ompi_comm_rank(comm); + + ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, + topo, num_topo_level, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + free(self_topo); + + return; +} + +/* + * Sort the topology array in order to have ranks sharing the same node + * contiguous in the topology array. + * Called from topo_init whenever the processes are not mapped by core. + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 + * This is to have the gather result in the right order + * + * @param topo (IN/OUT) topology description array (sorted in out) + * @param start (IN) where to begin the processing + * The index in topo will actually be: + * start * num_topo_level + level + * topo contains num_topo_level ids per rank. + * @param end (IN) where to stop the processing + * The index in topo will actually be: + * end * num_topo_level + level + * topo contains num_topo_level ids per rank. 
/*
 * Sort the topology array in order to have ranks sharing the same node
 * contiguous in the topology array.
 * Called from topo_init whenever the processes are not mapped by core.
 * ex: 4 ranks executing on 2 nodes, mapped by node
 *     ranks 0 and 2 on hid0
 *     ranks 1 and 3 on hid1
 * On entry the topo array looks like
 *     hid0 0 hid1 1 hid0 2 hid1 3
 * After the sort:
 *     hid0 0 hid0 2 hid1 1 hid1 3
 * This is to have the gather result in the right order
 *
 * The entries [start, end] are first ordered on the id found at `level`,
 * then every run of entries sharing that id is sorted on the next level.
 *
 * @param topo (IN/OUT)       topology description array (sorted in out)
 * @param start (IN)          where to begin the processing
 *                            The index in topo will actually be:
 *                            start * num_topo_level + level
 *                            topo contains num_topo_level ids per rank.
 * @param end (IN)            where to stop the processing (inclusive)
 *                            The index in topo will actually be:
 *                            end * num_topo_level + level
 *                            topo contains num_topo_level ids per rank.
 * @param level (IN)          level number we are currently processing
 * @param num_topo_level (IN) number of topological levels
 *
 */
static void mca_coll_han_topo_sort(int *topo, int start, int end,
                                   int level, int num_topo_level)
{
    int i, j;
    int group_start;

    if (level > num_topo_level-1 || start >= end) {
        return;
    }

    /* Selection sort of the entries [start, end] on the current level */
    for (i = start; i < end; i++) {
        /* get the min value for current level and its location */
        int min_loc = i;
        int min = topo[i * num_topo_level + level];

        for (j = i + 1; j <= end; j++) {
            /* topo contains num_topo_level ids per rank. */
            if (topo[j * num_topo_level + level] < min) {
                min = topo[j * num_topo_level + level];
                min_loc = j;
            }
        }
        if (min_loc == i) {
            continue;
        }
        /*
         * swap i and min_loc
         * We have num_topo_level ids to swap
         */
        for (j = 0; j < num_topo_level; j++) {
            int temp = topo[i * num_topo_level + j];
            topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j];
            topo[min_loc * num_topo_level + j] = temp;
        }
    }

    /*
     * Process next level: each run of entries sharing the same id at the
     * current level is sorted on its own.  A run is closed as soon as the id
     * changes, including when the change happens on the very last entry, so
     * that entry is never merged into the preceding run.
     */
    group_start = start;
    for (i = start + 1; i <= end; i++) {
        if (topo[i * num_topo_level + level] !=
            topo[group_start * num_topo_level + level]) {
            mca_coll_han_topo_sort(topo, group_start, i - 1, level + 1,
                                   num_topo_level);
            group_start = i;
        }
    }
    /* Last run */
    mca_coll_han_topo_sort(topo, group_start, end, level + 1, num_topo_level);

    return;
}
2 contiguous ranks should be either on the same node or on node ids in + * ascending order + * The topology is actually an array of ints: + * +----------+-------+----------+-------+------+----------+-------+-----+ + * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... | + * +----------+-------+----------+-------+------+----------+-------+-----+ + */ +static bool mca_coll_han_topo_is_mapbycore(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ + int i; + int size = ompi_comm_size(comm); + + for (i = 1; i < size; i++) { + /* + * The host id for a given rank should be < host id for the next rank + */ + if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) { + return false; + } + /* + * For the same host id, consecutive ranks should be sorted in + * ascending order. + */ + if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { + return false; + } + } + return true; +} + +/* The topo is supposed sorted by host */ +static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level){ + int i; + int size = ompi_comm_size(comm); + if (size < 2){ + return false; + } + int ppn; + int last_host = topo[0]; + + /* Find the ppn for the first node */ + for (i = 1; i < size; i++) { + if (topo[i * num_topo_level] != last_host){ + break; + } + } + ppn = i; + + /* All on one node */ + if ( size == ppn){ + return false; + } + /* Trivial case */ + if (size % ppn != 0){ + return true; + } + + last_host = topo[ppn * num_topo_level]; + /* Check that the 2nd and next hosts also this ppn. 
Since the topo is sorted + * one just need to jump ppn ranks to check the supposed switch of host */ + for (i = 2 * ppn; i < size; i += ppn ){ + /* the list of ranks for the last known host have ended before */ + if (topo[(i-1) * num_topo_level] != last_host){ + return true; + } + /* the list of ranks for the last known host are bigger than excpected */ + if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){ + return true; + } + last_host = topo[i * num_topo_level]; + } + /* Check the last host */ + if (topo[(size-1) * num_topo_level] != last_host){ + return true; + } + + return false; +} + + +/** + * Topology initialization phase + * Called each time a collective that needs buffer reordering is called + * + * @param num_topo_level (IN) Number of the topological levels + */ +int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) +{ + int size; + int *topo; + + size = ompi_comm_size(comm); + + if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) { + if (han_module->cached_topo) { + free(han_module->cached_topo); + han_module->cached_topo = NULL; + } + + topo = (int *)malloc(sizeof(int) * size * num_topo_level); + + /* get topo infomation */ + mca_coll_han_topo_get(topo, comm, num_topo_level); + mca_coll_han_topo_print(topo, comm, num_topo_level); + + /* + * All the ranks now have the topo information + */ + + /* check if the processes are mapped by core */ + han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); + + /* + * If not, sort the topo such that each group of ids is sorted by rank + * i.e. ids for rank i are contiguous to ids for rank i+1. 
+ * This will be needed for the operations that are order sensitive + * (like gather) + */ + if (!han_module->is_mapbycore) { + mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level); + } + han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level); + han_module->cached_topo = topo; + han_module->cached_comm = comm; + } else { + topo = han_module->cached_topo; + } + + mca_coll_han_topo_print(topo, comm, num_topo_level); + return topo; +} + +static void mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + if (rank == 0) { + int i; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", rank)); + for (i=0; i