From 182c333b21a5c58f57c3ec75fb81bb5b59f00767 Mon Sep 17 00:00:00 2001 From: Xi Luo Date: Mon, 9 Mar 2020 14:08:34 -0400 Subject: [PATCH 1/4] Initial import of the HAN collective module a hierarchical, architecture-aware collective communication module. Add Reduce and remove up_seg_size and low_seg_size in Bcast Increase HAN's priority Signed-off-by: Xi Luo Signed-off-by: George Bosilca --- ompi/mca/coll/han/Makefile.am | 49 +++ ompi/mca/coll/han/coll_han.h | 410 ++++++++++++++++++++++ ompi/mca/coll/han/coll_han_allgather.c | 240 +++++++++++++ ompi/mca/coll/han/coll_han_allreduce.c | 395 +++++++++++++++++++++ ompi/mca/coll/han/coll_han_bcast.c | 222 ++++++++++++ ompi/mca/coll/han/coll_han_component.c | 279 +++++++++++++++ ompi/mca/coll/han/coll_han_gather.c | 224 ++++++++++++ ompi/mca/coll/han/coll_han_module.c | 468 +++++++++++++++++++++++++ ompi/mca/coll/han/coll_han_reduce.c | 192 ++++++++++ ompi/mca/coll/han/coll_han_scatter.c | 222 ++++++++++++ ompi/mca/coll/han/coll_han_trigger.c | 42 +++ ompi/mca/coll/han/coll_han_trigger.h | 46 +++ ompi/mca/coll/han/coll_han_utils.c | 58 +++ 13 files changed, 2847 insertions(+) create mode 100644 ompi/mca/coll/han/Makefile.am create mode 100644 ompi/mca/coll/han/coll_han.h create mode 100644 ompi/mca/coll/han/coll_han_allgather.c create mode 100644 ompi/mca/coll/han/coll_han_allreduce.c create mode 100644 ompi/mca/coll/han/coll_han_bcast.c create mode 100644 ompi/mca/coll/han/coll_han_component.c create mode 100644 ompi/mca/coll/han/coll_han_gather.c create mode 100644 ompi/mca/coll/han/coll_han_module.c create mode 100644 ompi/mca/coll/han/coll_han_reduce.c create mode 100644 ompi/mca/coll/han/coll_han_scatter.c create mode 100644 ompi/mca/coll/han/coll_han_trigger.c create mode 100644 ompi/mca/coll/han/coll_han_trigger.h create mode 100644 ompi/mca/coll/han/coll_han_utils.c diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am new file mode 100644 index 0000000000..380b44d615 --- /dev/null +++ 
b/ompi/mca/coll/han/Makefile.am @@ -0,0 +1,49 @@ +# +# Copyright (c) 2018-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ +coll_han.h \ +coll_han_trigger.h \ +coll_han_bcast.c \ +coll_han_reduce.c \ +coll_han_scatter.c \ +coll_han_gather.c \ +coll_han_allreduce.c \ +coll_han_allgather.c \ +coll_han_component.c \ +coll_han_module.c \ +coll_han_trigger.c \ +coll_han_utils.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). + +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_han_DSO +component_install += mca_coll_han.la +else +component_noinst += libmca_coll_han.la +endif + +# See ompi/mca/btl/sm/Makefile.am for an explanation of +# libmca_common_sm.la. + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_han_la_SOURCES = $(sources) +mca_coll_han_la_LDFLAGS = -module -avoid-version +mca_coll_han_la_LIBADD = + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_han_la_SOURCES =$(sources) +libmca_coll_han_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h new file mode 100644 index 0000000000..307fa52044 --- /dev/null +++ b/ompi/mca/coll/han/coll_han.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_EXPORT_H +#define MCA_COLL_HAN_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/communicator/communicator.h" +#include "ompi/include/mpi.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "opal/util/info.h" +#include "ompi/op/op.h" +#include "opal/runtime/opal_progress.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "coll_han_trigger.h" + +/* One auto-tuning table entry. mca_coll_han_allreduce_intra() uses umod and lmod as indices into cached_up_comms[] and cached_low_comms[] and fs as the segment size; ualg and us are only read by code that is commented out there ("only available when using ADAPT"). */ BEGIN_C_DECLS typedef struct { + uint32_t umod; + uint32_t lmod; + uint32_t fs; + uint32_t ualg; + uint32_t us; +} selection; + +/* Task arguments for bcast; every field is a parameter of mac_coll_han_set_bcast_argu(). */ struct mca_bcast_argu_s { + mca_coll_task_t *cur_task; + void *buff; + int seg_count; + struct ompi_datatype_t *dtype; + int root_low_rank; + int root_up_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; +}; +typedef struct mca_bcast_argu_s mca_bcast_argu_t; + +/* Task arguments for reduce; every field is a parameter of mac_coll_han_set_reduce_argu(). */ struct mca_reduce_argu_s { + mca_coll_task_t *cur_task; + void *sbuf; + void *rbuf; + int seg_count; + struct ompi_datatype_t *dtype; + struct ompi_op_t *op; + int root_low_rank; + int root_up_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; +}; +typedef struct mca_reduce_argu_s mca_reduce_argu_t; + +/* Task arguments for allreduce, filled by mac_coll_han_set_allreduce_argu() and shared by the t0..t3 tasks. noop is true on processes that are not the low-level root; completed points at a counter that mca_coll_han_allreduce_intra() compares against num_segments. */ struct mca_allreduce_argu_s { + mca_coll_task_t *cur_task; + void *sbuf; + void *rbuf; + int seg_count; + struct ompi_datatype_t *dtype; + struct ompi_op_t *op; + int root_up_rank; + int root_low_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; + ompi_request_t *req; + int *completed; +}; +typedef struct
mca_allreduce_argu_s mca_allreduce_argu_t; + +/* Task arguments for scatter; every field is a parameter of mac_coll_han_set_scatter_argu(). */ struct mca_scatter_argu_s { + mca_coll_task_t *cur_task; + void *sbuf; + void *sbuf_inter_free; + void *sbuf_reorder_free; + int scount; + struct ompi_datatype_t *sdtype; + void *rbuf; + int rcount; + struct ompi_datatype_t *rdtype; + int root; + int root_up_rank; + int root_low_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int w_rank; + bool noop; + ompi_request_t *req; +}; +typedef struct mca_scatter_argu_s mca_scatter_argu_t; + +/* Task arguments for gather; every field is a parameter of mac_coll_han_set_gather_argu(). */ struct mca_gather_argu_s { + mca_coll_task_t *cur_task; + void *sbuf; + void *sbuf_inter_free; + int scount; + struct ompi_datatype_t *sdtype; + void *rbuf; + int rcount; + struct ompi_datatype_t *rdtype; + int root; + int root_up_rank; + int root_low_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int w_rank; + bool noop; + ompi_request_t *req; +}; +typedef struct mca_gather_argu_s mca_gather_argu_t; + +/* Task arguments for allgather, filled by mac_coll_han_set_allgather_argu() and handed from task to task (lg, then uag, then lb). sbuf_inter_free is the temporary gather buffer that the uag task frees; req is the request the lb task completes. */ struct mca_allgather_argu_s { + mca_coll_task_t *cur_task; + void *sbuf; + void *sbuf_inter_free; + int scount; + struct ompi_datatype_t *sdtype; + void *rbuf; + int rcount; + struct ompi_datatype_t *rdtype; + int root_low_rank; + struct ompi_communicator_t *up_comm; + struct ompi_communicator_t *low_comm; + int w_rank; + bool noop; + bool is_mapbycore; + int *topo; + ompi_request_t *req; +}; +typedef struct mca_allgather_argu_s mca_allgather_argu_t; + +/** + * Structure to hold the han coll component. First it holds the + * base coll component, and then holds a bunch of + * han-coll-component-specific stuff (e.g., current MCA param + * values).
+ */ +typedef struct mca_coll_han_component_t { + /** Base coll component */ + mca_coll_base_component_2_0_0_t super; + + /** MCA parameter: Priority of this component */ + int han_priority; + /* output handle passed to OPAL_OUTPUT_VERBOSE for this component's log messages */ + int han_output; + /* segment size for bcast */ + uint32_t han_bcast_segsize; + /* up level module for bcast */ + uint32_t han_bcast_up_module; + /* low level module for bcast */ + uint32_t han_bcast_low_module; + /* segment size for reduce */ + uint32_t han_reduce_segsize; + /* up level module for reduce */ + uint32_t han_reduce_up_module; + /* low level module for reduce */ + uint32_t han_reduce_low_module; + /* segment size for allreduce */ + uint32_t han_allreduce_segsize; + /* up level module for allreduce */ + uint32_t han_allreduce_up_module; + /* low level module for allreduce */ + uint32_t han_allreduce_low_module; + /* up level module for allgather */ + uint32_t han_allgather_up_module; + /* low level module for allgather */ + uint32_t han_allgather_low_module; + /* up level module for gather */ + uint32_t han_gather_up_module; + /* low level module for gather */ + uint32_t han_gather_low_module; + /* up level module for scatter */ + uint32_t han_scatter_up_module; + /* low level module for scatter */ + uint32_t han_scatter_low_module; + /* whether auto tuning is enabled */ + uint32_t han_auto_tune; + /* create a 3D array + * num_processes (n): 2 4 8 16 32 64 (6) + * num_core (c): 2 4 8 12 (4) + * message size (m): 1 - 4194304 (23) + */ + uint32_t han_auto_tune_n; + uint32_t han_auto_tune_c; + uint32_t han_auto_tune_m; + selection *han_auto_tuned; /* flattened table; mca_coll_han_allreduce_intra() computes the index from n, c and m */ +} mca_coll_han_component_t; + +/** Coll han module */ +typedef struct mca_coll_han_module_t { + /** Base module */ + mca_coll_base_module_t super; + + /* Whether this module has been lazily initialized or not yet */ + bool enabled; + + struct ompi_communicator_t *cached_comm; + /* sub-communicator arrays; the collectives index them with the han_XXX_low_module and han_XXX_up_module component parameters */ struct ompi_communicator_t **cached_low_comms; + struct ompi_communicator_t **cached_up_comms; + int
*cached_vranks; + int *cached_topo; + bool is_mapbycore; /* forwarded to the allgather tasks: when true the uag task gathers straight into rbuf and skips the reorder copy */ +} mca_coll_han_module_t; +OBJ_CLASS_DECLARATION(mca_coll_han_module_t); + +/** + * Global component instance + */ +OMPI_MODULE_DECLSPEC extern mca_coll_han_component_t mca_coll_han_component; + +/* + * coll module functions + */ +int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads); + +mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm, int *priority); + +int han_request_free(ompi_request_t ** request); + +/* Subcommunicator creation */ +void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); + +/* Gather topology information */ +int mca_coll_han_pow10_int(int pow_value); +int mca_coll_han_hostname_to_number(char *hostname, int size); +void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level); +void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level); +bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t *comm, + int num_topo_level); +int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, + int num_topo_level); +void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level); + +/* Utils */ +void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, + int *root_up_rank); +uint32_t han_auto_tuned_get_n(uint32_t n); +uint32_t han_auto_tuned_get_c(uint32_t c); +uint32_t han_auto_tuned_get_m(uint32_t m); + + +/* Bcast */ +void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop); +int mca_coll_han_bcast_intra(void *buff, int count,
struct ompi_datatype_t *dtype, int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_han_bcast_t0_task(void *task_argu); +int mca_coll_han_bcast_t1_task(void *task_argu); + +/* Reduce. NOTE(review): the argument setters are spelled "mac_" while the rest of the module uses "mca_" - presumably a typo; renaming would require updating every caller. */ +void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, int seg_count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop); + +int mca_coll_han_reduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t* op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module); + +int mca_coll_han_reduce_t0_task(void *task_argu); +int mca_coll_han_reduce_t1_task(void *task_argu); + +/* Allreduce: pipelined over segments with four task types, t0..t3 (see the table in coll_han_allreduce.c) */ +void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed); +int mca_coll_han_allreduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_han_allreduce_t0_task(void *task_argu); +int mca_coll_han_allreduce_t1_task(void *task_argu); +int mca_coll_han_allreduce_t2_task(void *task_argu); +int mca_coll_han_allreduce_t3_task(void *task_argu); + +/* Scatter */ +int +mca_coll_han_scatter_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, +
struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_han_scatter_us_task(void *task_argu); +int mca_coll_han_scatter_ls_task(void *task_argu); +void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_han_gather_lg_task(void *task_argu); +int mca_coll_han_gather_ug_task(void *task_argu); +void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req); + +/* Allgather: the lg task (low-level gather) chains to uag (up-level allgather), which chains to lb (low-level bcast) */ +int +mca_coll_han_allgather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); +int mca_coll_han_allgather_lg_task(void *task_argu); +int mca_coll_han_allgather_uag_task(void *task_argu); +int mca_coll_han_allgather_lb_task(void *task_argu); +void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void
*rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); + +END_C_DECLS +#endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c new file mode 100644 index 0000000000..2f0e3c45bd --- /dev/null +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +/* Store every parameter in the field of *argu that carries the same name. */ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, bool is_mapbycore, int *topo, ompi_request_t * req) +{ + argu->cur_task = cur_task; + argu->sbuf = sbuf; + argu->sbuf_inter_free = sbuf_inter_free; + argu->scount = scount; + argu->sdtype = sdtype; + argu->rbuf = rbuf; + argu->rcount = rcount; + argu->rdtype = rdtype; + argu->root_low_rank = root_low_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->w_rank = w_rank; + argu->noop = noop; + argu->is_mapbycore = is_mapbycore; + argu->topo = topo; + argu->req = req; +} + +/* Hierarchical allgather entry point: selects the low/up sub-communicators, creates a request, issues the lg task (which chains to uag and then lb) and waits on the request that the lb task completes. Always returns OMPI_SUCCESS; the return codes of the underlying collectives are not propagated. */ int +mca_coll_han_allgather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, mca_coll_base_module_t *
module) +{ + int w_rank; + w_rank = ompi_comm_rank(comm); + + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_allgather_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_allgather_up_module]; + int low_rank = ompi_comm_rank(low_comm); + + ompi_request_t *temp_request = NULL; + /* Set up the request that the final (lb) task completes */ + temp_request = OBJ_NEW(ompi_request_t); + OMPI_REQUEST_INIT(temp_request, false); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = 0; + temp_request->req_free = han_request_free; + temp_request->req_status.MPI_SOURCE = 0; + temp_request->req_status.MPI_TAG = 0; + temp_request->req_status.MPI_ERROR = 0; + temp_request->req_status._cancelled = 0; + temp_request->req_status._ucount = 0; + + /* Init topo */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + int root_low_rank = 0; + /* Create lg (lower level gather) task */ + mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); + /* Set up lg task arguments; lg_argu is freed by the lb task. noop is true on every process that is not the low-level root. NOTE(review): the malloc result is not checked. */ + mca_allgather_argu_t *lg_argu = malloc(sizeof(mca_allgather_argu_t)); + mac_coll_han_set_allgather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, + rdtype, root_low_rank, up_comm, low_comm, w_rank, + low_rank != root_low_rank, han_module->is_mapbycore, topo, + temp_request); + /* Init lg task */ + init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_argu)); + /* Issue lg task */ + issue_task(lg); + + ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + + return OMPI_SUCCESS; +} + +/* lg: lower level (shared memory) gather task; gathers onto the low-level root, then issues the uag task with the same argument struct */ +int mca_coll_han_allgather_lg_task(void *task_argu) +{ + mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", + t->w_rank)); +
OBJ_RELEASE(t->cur_task); + + /* Only the node leaders (noop == false) allocate a buffer to receive the gathered data. NOTE(review): the malloc result is not checked. */ + char *tmp_buf = NULL; + char *tmp_rbuf = NULL; + if (!t->noop) { + int low_size = ompi_comm_size(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_rbuf = tmp_buf - rgap; + } + /* Shared memory gather */ + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + + /* Create uag (upper level all-gather) task */ + mca_coll_task_t *uag = OBJ_NEW(mca_coll_task_t); + /* Reuse the same argument struct for the uag task */ + t->cur_task = uag; + /* Init uag task */ + init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); + /* Issue uag task */ + issue_task(uag); + + return OMPI_SUCCESS; +} + +/* uag: upper level (inter-node) all-gather task; node leaders exchange their gathered blocks, then every process issues the lb task */ +int mca_coll_han_allgather_uag_task(void *task_argu) +{ + mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + OBJ_RELEASE(t->cur_task); + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allgather: uag noop\n", t->w_rank)); + } else { + int low_size = ompi_comm_size(t->low_comm); + int up_size = ompi_comm_size(t->up_comm); + char *reorder_buf = NULL; + char *reorder_rbuf = NULL; + if (t->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: HAN Allgather is bycore: ", t->w_rank)); + reorder_rbuf = (char *) t->rbuf; + } else { + ptrdiff_t rsize, rgap = 0; + rsize = + opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size * up_size, + &rgap); + reorder_buf = (char *) malloc(rsize); + reorder_rbuf = reorder_buf - rgap; + } + + /* Inter node allgather */ + t->up_comm->c_coll->coll_allgather((char *) t->sbuf, t->scount * low_size, t->sdtype, + reorder_rbuf, t->rcount * low_size,
t->rdtype, + t->up_comm, t->up_comm->c_coll->coll_allgather_module); + + if (t->sbuf_inter_free != NULL) { + free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allgather: ug allgather finish\n", t->w_rank)); + + /* Reorder the node leader's rbuf: copy block (i, j) from reorder_rbuf to the slot of rbuf selected by topo[] */ + if (!t->is_mapbycore) { + int i, j; + ptrdiff_t rextent; + ompi_datatype_type_extent(t->rdtype, &rextent); + for (i = 0; i < up_size; i++) { + for (j = 0; j < low_size; j++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: HAN Allgather copy from %d %d\n", t->w_rank, + (i * low_size + j) * 2 + 1, + t->topo[(i * low_size + j) * 2 + 1])); + ompi_datatype_copy_content_same_ddt(t->rdtype, (ptrdiff_t) t->rcount, + (char *) t->rbuf + + rextent * + (ptrdiff_t) t->topo[(i * low_size + j) * 2 + + 1] * + (ptrdiff_t) t->rcount, + reorder_rbuf + rextent * (i * low_size + + j) * + (ptrdiff_t) t->rcount); + } + } + free(reorder_buf); + reorder_buf = NULL; + } + } + + + /* Create lb (low level broadcast) task */ + mca_coll_task_t *lb = OBJ_NEW(mca_coll_task_t); + /* Reuse the same argument struct for the lb task */ + t->cur_task = lb; + /* Init lb task */ + init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); + /* Issue lb task */ + issue_task(lb); + + return OMPI_SUCCESS; +} + +/* lb: low level (shared-memory) broadcast task; broadcasts the complete result inside the node, frees the argument struct and completes the request */ +int mca_coll_han_allgather_lb_task(void *task_argu) +{ + mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lb\n", + t->w_rank)); + OBJ_RELEASE(t->cur_task); + int low_size = ompi_comm_size(t->low_comm); + int up_size = ompi_comm_size(t->up_comm); + t->low_comm->c_coll->coll_bcast((char *) t->rbuf, t->rcount * low_size * up_size, t->rdtype, + t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_bcast_module); + + ompi_request_t *temp_req = t->req; + free(t); +
ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; + +} diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c new file mode 100644 index 0000000000..629b93a1c9 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +/* Only works in the regular situation (each node has an equal number of processes) */ + +/* Store every parameter in the field of *argu that carries the same name. */ void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed) +{ + argu->cur_task = cur_task; + argu->sbuf = sbuf; + argu->rbuf = rbuf; + argu->seg_count = seg_count; + argu->dtype = dtype; + argu->op = op; + argu->root_up_rank = root_up_rank; + argu->root_low_rank = root_low_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->num_segments = num_segments; + argu->cur_seg = cur_seg; + argu->w_rank = w_rank; + argu->last_seg_count = last_seg_count; + argu->noop = noop; + argu->req = req; + argu->completed = completed; +} + +/* + * Each segment of the message needs to go through 4 steps to perform MPI_Allreduce: + * lr: lower level (shared-memory or intra-node) reduce, + * ur: upper level (inter-node) reduce, + * ub: upper level (inter-node) bcast, + * lb: lower level (shared-memory or intra-node) bcast.
+ * Hence, in each iteration, there is a combination of collective operations which is called a task. + * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | lr | | | | task: t0, contains lr + * iter 1 | ur | lr | | | task: t1, contains ur and lr + * iter 2 | ub | ur | lr | | task: t2, contains ub, ur and lr + * iter 3 | lb | ub | ur | lr | task: t3, contains lb, ub, ur and lr + * iter 4 | | lb | ub | ur | task: t3, contains lb, ub and ur + * iter 5 | | | lb | ub | task: t3, contains lb and ub + * iter 6 | | | | lb | task: t3, contains lb + */ + +/* Hierarchical, segmented allreduce entry point: selects the low/up sub-communicators and the segment count (from the auto-tuning table when enabled, otherwise from the han_allreduce_* parameters), issues t0, t1, t2 and t3 once, then keeps issuing t3 for the following segments until *completed equals num_segments. Always returns OMPI_SUCCESS. */ int +mca_coll_han_allreduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + ptrdiff_t extent, lb; + ompi_datatype_get_extent(dtype, &lb, &extent); + int w_rank; + w_rank = ompi_comm_rank(comm); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + /* Auto tune is enabled. NOTE(review): the trailing n * c * m term in id skips one whole table - presumably the entries of another collective stored first; confirm against the code that fills han_auto_tuned. */ + if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { + uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); + uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); + uint32_t m = han_auto_tuned_get_m(typelng * count); + uint32_t id = + n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + + c * mca_coll_han_component.han_auto_tune_m + m + + mca_coll_han_component.han_auto_tune_n * mca_coll_han_component.han_auto_tune_c * + mca_coll_han_component.han_auto_tune_m; + uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; + uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; + uint32_t fs =
mca_coll_han_component.han_auto_tuned[id].fs; + /* ualg and us are only available when using ADAPT */ + /* + uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; + uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; + */ + /* Set up umod */ + up_comm = han_module->cached_up_comms[umod]; + /* Set up lmod */ + low_comm = han_module->cached_low_comms[lmod]; + /* Set up fs */ + COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); + /* Set up ualg and us, which is only available when using ADAPT */ + /* + if (umod == 1) { + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_algorithm = ualg; + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_algorithm = ualg; + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_segment_size = us; + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_segment_size = us; + } + */ + } else { + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, typelng, + seg_count); + } + + /* Determine number of elements sent per task.
+ */ + OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, + "In HAN Allreduce seg_size %d seg_count %d count %d\n", + mca_coll_han_component.han_allreduce_segsize, seg_count, count)); + int num_segments = (count + seg_count - 1) / seg_count; /* NOTE(review): seg_count starts as count, so count == 0 would divide by zero here unless the macro above changed it - confirm callers never pass an empty message */ + + int low_rank = ompi_comm_rank(low_comm); + int root_up_rank = 0; + int root_low_rank = 0; + /* Create t0 task for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Set up t0 task arguments. NOTE(review): neither malloc result is checked. */ + int *completed = (int *) malloc(sizeof(int)); + completed[0] = 0; + mca_allreduce_argu_t *t = malloc(sizeof(mca_allreduce_argu_t)); + mac_coll_han_set_allreduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, + root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, + w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank, NULL, completed); + /* Init t0 task */ + init_task(t0, mca_coll_han_allreduce_t0_task, (void *) (t)); + /* Issue t0 task */ + issue_task(t0); + + /* Create t1 tasks for the current segment. The argument struct t is reused by every task, so issue_task() presumably runs the task to completion before returning - TODO confirm in coll_han_trigger.c */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Set up t1 task arguments */ + t->cur_task = t1; + /* Init t1 task */ + init_task(t1, mca_coll_han_allreduce_t1_task, (void *) t); + /* Issue t1 task */ + issue_task(t1); + + /* Create t2 tasks for the current segment */ + mca_coll_task_t *t2 = OBJ_NEW(mca_coll_task_t); + /* Set up t2 task arguments */ + t->cur_task = t2; + /* Init t2 task */ + init_task(t2, mca_coll_han_allreduce_t2_task, (void *) t); + issue_task(t2); + + /* Create t3 tasks for the current segment */ + mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t); + /* Set up t3 task arguments */ + t->cur_task = t3; + /* Init t3 task */ + init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); + issue_task(t3); + + while (t->completed[0] != t->num_segments) { + /* Create t3 tasks for the next segment: advance both buffers by one segment first */ + mca_coll_task_t *t3 = OBJ_NEW(mca_coll_task_t); + /* Set up t3 task arguments */ + t->cur_task = t3; + t->sbuf = (char *)
t->sbuf + extent * t->seg_count; + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + t->cur_seg = t->cur_seg + 1; + /* Init t3 task */ + init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); + issue_task(t3); + } + if (t->completed != NULL) { + free(t->completed); + t->completed = NULL; + } + free(t); + + return OMPI_SUCCESS; +} + +/* t0 task: lr (low-level reduce) of the current segment */ +int mca_coll_han_allreduce_t0_task(void *task_argu) +{ + mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + return OMPI_SUCCESS; +} + +/* t1 task: non-blocking ur (up-level reduce) of the current segment, overlapped with lr of the next segment */ +int mca_coll_han_allreduce_t1_task(void *task_argu) +{ + mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ireduce_req; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ur of cur_seg */ + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_up_rank, t->up_comm, &ireduce_req, + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + t->dtype, t->op, t->root_up_rank, t->up_comm, + &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); + } + } + /* lr of cur_seg+1; the last segment may be shorter than seg_count */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg ==
t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, + (char *) t->rbuf + extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + + } + if (!t->noop) { + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); + } + + return OMPI_SUCCESS; +} + +/* t2 task: ub (up-level bcast) of the current segment, ur of the next segment and lr of the one after */ +int mca_coll_han_allreduce_t2_task(void *task_argu) +{ + mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *reqs[2]; + int req_count = 0; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ub of cur_seg */ + t->up_comm->c_coll->coll_ibcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, &(reqs[0]), + t->up_comm->c_coll->coll_ibcast_module); + req_count++; + /* ur of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, + (char *) t->rbuf + extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + extent * t->seg_count, + (char *) t->rbuf + extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } + req_count++; + } + } + /* lr of cur_seg+2 */ + if (t->cur_seg <= t->num_segments - 3) { + if (t->cur_seg == t->num_segments - 3 &&
t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 2 * extent * t->seg_count, + (char *) t->rbuf + 2 * extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + if (!t->noop && req_count > 0) { + ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE); + } + + + return OMPI_SUCCESS; +} + +/* t3 task */ +int mca_coll_han_allreduce_t3_task(void *task_argu) +{ + mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg, + ((int *) t->rbuf)[0])); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *reqs[2]; + int req_count = 0; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ub of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->up_comm->c_coll->coll_ibcast((char *) t->rbuf + extent * t->seg_count, t->seg_count, + t->dtype, t->root_up_rank, t->up_comm, &(reqs[0]), + t->up_comm->c_coll->coll_ibcast_module); + req_count++; + } + /* ur of cur_seg+2 */ + if (t->cur_seg <= t->num_segments - 3) { + if (t->cur_seg == t->num_segments - 3 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, + (char *) t->rbuf + 2 * extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf + 2 * extent * t->seg_count, + (char *) t->rbuf + 2 * extent * t->seg_count, + tmp_count, t->dtype, t->op, t->root_up_rank, + 
t->up_comm, &(reqs[1]), + t->up_comm->c_coll->coll_ireduce_module); + } + req_count++; + } + } + /* lr of cur_seg+3 */ + if (t->cur_seg <= t->num_segments - 4) { + if (t->cur_seg == t->num_segments - 4 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + 3 * extent * t->seg_count, + (char *) t->rbuf + 3 * extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + /* lb of cur_seg */ + t->low_comm->c_coll->coll_bcast((char *) t->rbuf, t->seg_count, t->dtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_bcast_module); + if (!t->noop && req_count > 0) { + ompi_request_wait_all(req_count, reqs, MPI_STATUSES_IGNORE); + } + + t->completed[0]++; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] HAN Allreduce: t3 %d total %d\n", t->w_rank, t->cur_seg, + t->completed[0])); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c new file mode 100644 index 0000000000..35c0a461f9 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) +{ + argu->cur_task = cur_task; + argu->buff = buff; + argu->seg_count = seg_count; + argu->dtype = dtype; + argu->root_low_rank = root_low_rank; + argu->root_up_rank = root_up_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->num_segments = num_segments; + argu->cur_seg = cur_seg; + argu->w_rank = w_rank; + argu->last_seg_count = last_seg_count; + argu->noop = noop; +} + +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: + * ub: upper level (inter-node) bcast + * lb: low level (shared-memory or intra-node) bcast. + * Hence, in each iteration, there is a combination of collective operations which is called a task. 
+ * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | ub | | | | task: t0, contains ub + * iter 1 | lb | ub | | | task: t1, contains ub and lb + * iter 2 | | lb | ub | | task: t1, contains ub and lb + * iter 3 | | | lb | ub | task: t1, contains ub and lb + * iter 4 | | | | lb | task: t1, contains lb + */ +int +mca_coll_han_bcast_intra(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + ptrdiff_t extent, lb; + ompi_datatype_get_extent(dtype, &lb, &extent); + int w_rank; + w_rank = ompi_comm_rank(comm); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + /* Auto tune is enabled */ + if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { + uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); + uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); + uint32_t m = han_auto_tuned_get_m(typelng * count); + uint32_t id = + n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + + c * mca_coll_han_component.han_auto_tune_m + m; + uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; + uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; + uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; + /* ualg and us are only available when using ADAPT */ + /* + uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; + uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; + */ + /* Set up umod */ + up_comm = han_module->cached_up_comms[umod]; + /* Set up lmod */ + low_comm = han_module->cached_low_comms[lmod]; + /* Set up fs */ + COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, 
typelng, seg_count); + /* Set up ualg and us, which is only available when using ADAPT */ + /* + if (umod == 1) { + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_algorithm = ualg; + ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> + adapt_ibcast_segment_size = us; + } + */ + + } else { + /* If auto tune is disabled, use MCA parameters */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, typelng, + seg_count); + } + + int num_segments = (count + seg_count - 1) / seg_count; + OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, + "In HAN seg_count %d count %d num_seg %d\n", + seg_count, count, num_segments)); + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + + int root_low_rank; + int root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, + root_up_rank)); + + /* Create t0 tasks for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Setup up t0 task arguments */ + mca_bcast_argu_t *t = malloc(sizeof(mca_bcast_argu_t)); + mac_coll_han_set_bcast_argu(t, t0, (char *) buff, seg_count, dtype, + root_up_rank, root_low_rank, up_comm, low_comm, + num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank); + /* Init the first task */ + init_task(t0, mca_coll_han_bcast_t0_task, (void *) t); + issue_task(t0); + + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + /* Init the t1 task */ + init_task(t1, 
mca_coll_han_bcast_t1_task, (void *) t); + issue_task(t1); + + while (t->cur_seg <= t->num_segments - 2) { + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + t->buff = (char *) t->buff + extent * seg_count; + t->cur_seg = t->cur_seg + 1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_bcast_t1_task, (void *) t); + issue_task(t1); + } + + free(t); + + return OMPI_SUCCESS; +} + +/* t0 task: issue and wait for the upper level ibcast of segment 0 */ +int mca_coll_han_bcast_t0_task(void *task_argu) +{ + mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + if (t->noop) { + return OMPI_SUCCESS; + } else { + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ibcast_req; + t->up_comm->c_coll->coll_ibcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, &ibcast_req, t->up_comm->c_coll->coll_ibcast_module); + ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); + return OMPI_SUCCESS; + } +} + +/* t1 task: + * 1. issue the upper level ibcast of segment cur_seg + 1 + * 2. issue the low level bcast of segment cur_seg + * 3. 
wait for the completion of the ibcast + */ +int mca_coll_han_bcast_t1_task(void *task_argu) +{ + mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ibcast_req = NULL; + int tmp_count = t->seg_count; + if (!t->noop) { + if (t->cur_seg <= t->num_segments - 2 ) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, + tmp_count, t->dtype, t->root_up_rank, + t->up_comm, &ibcast_req, + t->up_comm->c_coll->coll_ibcast_module); + } + } + + t->low_comm->c_coll->coll_bcast((char *) t->buff, + t->seg_count, t->dtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_bcast_module); + + if (!t->noop && ibcast_req != NULL) { + ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c new file mode 100644 index 0000000000..2aa5bbd7c2 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_component.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Most of the description of the data layout is in the + * coll_han_module.c file. 
+ */ + +#include "ompi_config.h" + +#include "opal/util/show_help.h" +#include "ompi/constants.h" +#include "ompi/mca/coll/coll.h" +#include "coll_han.h" + +/* + * Public string showing the coll ompi_han component version number + */ +const char *mca_coll_han_component_version_string = + "Open MPI han collective MCA component version " OMPI_VERSION; + + +/* + * Local functions + */ +static int han_open(void); +static int han_close(void); +static int han_register(void); + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +mca_coll_han_component_t mca_coll_han_component = { + + /* First, fill in the super */ + + { + /* First, the mca_component_t struct containing meta + information about the component itself */ + + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, + + /* Component name and version */ + .mca_component_name = "han", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + + /* Component functions */ + .mca_open_component = han_open, + .mca_close_component = han_close, + .mca_register_component_params = han_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE}, + + /* Initialization / querying functions */ + + .collm_init_query = mca_coll_han_init_query, + .collm_comm_query = mca_coll_han_comm_query, + }, + + /* han-component specifc information */ + + /* (default) priority */ + 20, +}; + +/* + * Init the component + */ +static int han_open(void) +{ + mca_coll_han_component_t *cs = &mca_coll_han_component; + if (cs->han_auto_tune) { + cs->han_auto_tuned = + (selection *) malloc(2 * cs->han_auto_tune_n * cs->han_auto_tune_c * + cs->han_auto_tune_m * sizeof(selection)); + char *filename = "/home/dycz0fx/results/auto/auto_tuned.bin"; + FILE *file = fopen(filename, "r"); + fread(cs->han_auto_tuned, sizeof(selection), + 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * 
cs->han_auto_tune_m, file); + fclose(file); + } + return OMPI_SUCCESS; +} + + +/* + * Shut down the component + */ +static int han_close(void) +{ + mca_coll_han_component_t *cs = &mca_coll_han_component; + if (cs->han_auto_tune && cs->han_auto_tuned != NULL) { + free(cs->han_auto_tuned); + cs->han_auto_tuned = NULL; + } + return OMPI_SUCCESS; +} + + +/* + * Register MCA params + */ +static int han_register(void) +{ + mca_base_component_t *c = &mca_coll_han_component.super.collm_version; + mca_coll_han_component_t *cs = &mca_coll_han_component; + + cs->han_priority = 50; + (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); + + int coll_han_verbose = 0; + (void) mca_base_component_var_register(c, "verbose", + "Verbose level", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &coll_han_verbose); + cs->han_output = opal_output_open(NULL); + opal_output_set_verbosity(cs->han_output, coll_han_verbose); + + cs->han_bcast_segsize = 65536; + (void) mca_base_component_var_register(c, "bcast_segsize", + "segment size for bcast", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_segsize); + + cs->han_bcast_up_module = 0; + (void) mca_base_component_var_register(c, "bcast_up_module", + "up level module for bcast, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_up_module); + + cs->han_bcast_low_module = 0; + (void) mca_base_component_var_register(c, "bcast_low_module", + "low level module for bcast, 0 sm, 1 solo", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_low_module); + + cs->han_reduce_segsize = 524288; + (void) mca_base_component_var_register(c, "reduce_segsize", + "segment size for reduce", + 
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_segsize); + + cs->han_reduce_up_module = 0; + (void) mca_base_component_var_register(c, "reduce_up_module", + "up level module for allreduce, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_reduce_up_module); + + cs->han_reduce_low_module = 0; + (void) mca_base_component_var_register(c, "reduce_low_module", + "low level module for allreduce, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_reduce_low_module); + cs->han_allreduce_segsize = 524288; + (void) mca_base_component_var_register(c, "allreduce_segsize", + "segment size for allreduce", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_segsize); + + cs->han_allreduce_up_module = 0; + (void) mca_base_component_var_register(c, "allreduce_up_module", + "up level module for allreduce, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_allreduce_up_module); + + cs->han_allreduce_low_module = 0; + (void) mca_base_component_var_register(c, "allreduce_low_module", + "low level module for allreduce, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_allreduce_low_module); + + cs->han_allgather_up_module = 0; + (void) mca_base_component_var_register(c, "allgather_up_module", + "up level module for allgather, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_allgather_up_module); + + cs->han_allgather_low_module = 0; + (void) mca_base_component_var_register(c, "allgather_low_module", + "low level module for allgather, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + 
&cs->han_allgather_low_module); + + cs->han_gather_up_module = 0; + (void) mca_base_component_var_register(c, "gather_up_module", + "up level module for gather, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_up_module); + + cs->han_gather_low_module = 0; + (void) mca_base_component_var_register(c, "gather_low_module", + "low level module for gather, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_low_module); + + cs->han_scatter_up_module = 0; + (void) mca_base_component_var_register(c, "scatter_up_module", + "up level module for scatter, 0 libnbc, 1 adapt", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_up_module); + + cs->han_scatter_low_module = 0; + (void) mca_base_component_var_register(c, "scatter_low_module", + "low level module for scatter, 0 sm, 1 shared", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_scatter_low_module); + + cs->han_auto_tune = 0; + (void) mca_base_component_var_register(c, "auto_tune", + "whether enable auto tune, 0 disable, 1 enable, default 0", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + + cs->han_auto_tune_n = 5; + (void) mca_base_component_var_register(c, "auto_tune_n", + "auto tune n", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_n); + + cs->han_auto_tune_c = 3; + (void) mca_base_component_var_register(c, "auto_tune_c", + "auto tune c", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_c); + + cs->han_auto_tune_m = 21; + (void) mca_base_component_var_register(c, "auto_tune_m", + "auto tune n", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_m); 
+ + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c new file mode 100644 index 0000000000..5188d2aca6 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +/* only work with regular situation (each node has equal number of processes) */ + +void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) +{ + argu->cur_task = cur_task; + argu->sbuf = sbuf; + argu->sbuf_inter_free = sbuf_inter_free; + argu->scount = scount; + argu->sdtype = sdtype; + argu->rbuf = rbuf; + argu->rcount = rcount; + argu->rdtype = rdtype; + argu->root = root; + argu->root_up_rank = root_up_rank; + argu->root_low_rank = root_low_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->w_rank = w_rank; + argu->noop = noop; + argu->req = req; +} + +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + int i, j; + int w_rank, w_size; + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module 
= (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_size = ompi_comm_size(up_comm); + + ompi_request_t *temp_request = NULL; + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + OMPI_REQUEST_INIT(temp_request, false); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = 0; + temp_request->req_free = han_request_free; + temp_request->req_status.MPI_SOURCE = 0; + temp_request->req_status.MPI_TAG = 0; + temp_request->req_status.MPI_ERROR = 0; + temp_request->req_status._cancelled = 0; + temp_request->req_status._ucount = 0; + + int root_low_rank; + int root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather root %d root_low_rank %d root_up_rank %d\n", w_rank, + root, root_low_rank, root_up_rank)); + + char *reorder_buf = NULL; + char *reorder_rbuf = NULL; + ptrdiff_t rsize, rgap = 0, rextent; + ompi_datatype_type_extent(rdtype, &rextent); + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + if (w_rank == root) { + /* If the processes are mapped-by core, no need to reorder */ + if (han_module->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather is_bycore: ", w_rank)); + reorder_rbuf = (char *) rbuf; + } else { + rsize = opal_datatype_span(&rdtype->super, (int64_t) rcount * w_size, &rgap); + reorder_buf = (char *) malloc(rsize); //TODO:free + reorder_rbuf = reorder_buf - rgap; + } + } + + + /* Create lg task */ + mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); + /* Setup lg 
task arguments */ + mca_gather_argu_t *lg_argu = malloc(sizeof(mca_gather_argu_t)); + mac_coll_han_set_gather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, + rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, + low_comm, w_rank, low_rank != root_low_rank, temp_request); + /* Init lg task */ + init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_argu)); + /* Issure lg task */ + issue_task(lg); + + ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + + /* Reorder rbuf based on rank. + * Suppose, message is 0 1 2 3 4 5 6 7, + * and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7), + * so the message needs to be reordered to 0 2 4 6 1 3 5 7 + */ + if (w_rank == root && !han_module->is_mapbycore) { + for (i = 0; i < up_size; i++) { + for (j = 0; j < low_size; j++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather copy from %d %d\n", w_rank, + (i * low_size + j) * 2 + 1, topo[(i * low_size + j) * 2 + 1])); + ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t) rcount, + (char *) rbuf + + rextent * (ptrdiff_t) topo[(i * low_size + j) * + 2 + + 1] * + (ptrdiff_t) rcount, + reorder_rbuf + rextent * (i * low_size + + j) * + (ptrdiff_t) rcount); + } + } + free(reorder_buf); + } + + return OMPI_SUCCESS; +} + +/* lg: lower level (shared memory) gather task */ +int mca_coll_han_gather_lg_task(void *task_argu) +{ + mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Future Gather: lg\n", + t->w_rank)); + OBJ_RELEASE(t->cur_task); + + /* If the process is one of the node leader */ + char *tmp_buf = NULL; + char *tmp_rbuf = NULL; + if (!t->noop) { + int low_size = ompi_comm_size(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_rbuf = tmp_buf - 
rgap; + } + /* Shared memory gather */ + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + + /* Create ug (upper level all-gather) task */ + mca_coll_task_t *ug = OBJ_NEW(mca_coll_task_t); + /* Setup ug task arguments */ + t->cur_task = ug; + /* Init ug task */ + init_task(ug, mca_coll_han_gather_ug_task, (void *) t); + /* Issure ug task */ + issue_task(ug); + + return OMPI_SUCCESS; +} + +/* ug: upper level (intra-node) gather task */ +int mca_coll_han_gather_ug_task(void *task_argu) +{ + mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; + OBJ_RELEASE(t->cur_task); + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Future Gather: ug noop\n", t->w_rank)); + } else { + int low_size = ompi_comm_size(t->low_comm); + /* Inter node gather */ + t->up_comm->c_coll->coll_gather((char *) t->sbuf, t->scount * low_size, t->sdtype, + (char *) t->rbuf, t->rcount * low_size, t->rdtype, + t->root_up_rank, t->up_comm, + t->up_comm->c_coll->coll_gather_module); + + if (t->sbuf_inter_free != NULL) { + free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Future Gather: ug gather finish\n", t->w_rank)); + } + ompi_request_t *temp_req = t->req; + free(t); + ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_module.c b/ompi/mca/coll/han/coll_han_module.c new file mode 100644 index 0000000000..a63de4aa59 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_module.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include +#ifdef HAVE_SCHED_H +#include +#endif +#include +#ifdef HAVE_SYS_MMAN_H +#include +#endif /* HAVE_SYS_MMAN_H */ +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#include "mpi.h" +#include "opal_stdint.h" +#include "opal/mca/hwloc/base/base.h" +#include "opal/util/os_path.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/group/group.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/proc/proc.h" +#include "coll_han.h" + +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include +#include + + +/* + * Local functions + */ +static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); +static int mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); + +/* + * Module constructor + */ +static void mca_coll_han_module_construct(mca_coll_han_module_t * module) +{ + module->enabled = false; + module->super.coll_module_disable = mca_coll_han_module_disable; + module->cached_comm = NULL; + module->cached_low_comms = NULL; + module->cached_up_comms = NULL; + module->cached_vranks = NULL; + module->cached_topo = NULL; + module->is_mapbycore = false; +} + +/* + * Module destructor + */ +static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) +{ + module->enabled = false; + if (module->cached_low_comms != NULL) { + ompi_comm_free(&(module->cached_low_comms[0])); + ompi_comm_free(&(module->cached_low_comms[1])); + module->cached_low_comms[0] = NULL; + module->cached_low_comms[1] = NULL; + free(module->cached_low_comms); + module->cached_low_comms = NULL; + } + if (module->cached_up_comms != NULL) { + ompi_comm_free(&(module->cached_up_comms[0])); + ompi_comm_free(&(module->cached_up_comms[1])); + module->cached_up_comms[0] = NULL; + module->cached_up_comms[1] = NULL; 
+ free(module->cached_up_comms); + module->cached_up_comms = NULL; + } + if (module->cached_vranks != NULL) { + free(module->cached_vranks); + module->cached_vranks = NULL; + } + if (module->cached_topo != NULL) { + free(module->cached_topo); + module->cached_topo = NULL; + } +} + +/* + * Module disable + */ +static int mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + return OMPI_SUCCESS; +} + + +OBJ_CLASS_INSTANCE(mca_coll_han_module_t, + mca_coll_base_module_t, + mca_coll_han_module_construct, mca_coll_han_module_destruct); + +/* + * Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. This function is invoked exactly + * once. + */ +int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:init_query: pick me! pick me!"); + return OMPI_SUCCESS; +} + + +/* + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. + */ +mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) +{ + mca_coll_han_module_t *han_module; + + /* If we're intercomm, or if there's only one process in the + communicator */ + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) + || !ompi_group_have_remote_peers(comm->c_local_group)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): intercomm, comm is too small, only on one node; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + /* Get the priority level attached to this module. If priority is less + * than or equal to 0, then the module is unavailable. 
*/ + *priority = mca_coll_han_component.han_priority; + if (mca_coll_han_component.han_priority <= 0) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): priority too low; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + + han_module = OBJ_NEW(mca_coll_han_module_t); + if (NULL == han_module) { + return NULL; + } + + /* All is good -- return a module */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_allgather = NULL; //mca_coll_han_allgather_intra; + han_module->super.coll_allgatherv = NULL; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_bcast = mca_coll_han_bcast_intra; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gather = mca_coll_han_gather_intra; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce = mca_coll_han_reduce_intra; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra; + han_module->super.coll_scatterv = NULL; + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): pick me! 
pick me!", + comm->c_contextid, comm->c_name); + return &(han_module->super); +} + + +/* + * Init module on the communicator + */ +static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +{ + return OMPI_SUCCESS; +} + +/* + * Free the han request + */ +int han_request_free(ompi_request_t ** request) +{ + (*request)->req_state = OMPI_REQUEST_INVALID; + OBJ_RELEASE(*request); + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} + +/* Create the communicators used in the HAN module */ +void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module) +{ + /* Use cached communicators if possible */ + if (han_module->cached_comm == comm && han_module->cached_low_comms != NULL + && han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) { + return; + } + /* Create communicators if there is no cached communicator */ + else { + int low_rank, low_size; + int up_rank; + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + ompi_communicator_t **low_comms = + (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); + ompi_communicator_t **up_comms = + (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); + /* Create low_comms which contain all the process on a node */ + const int *origin_priority = NULL; + /* Lower the priority of HAN module */ + int han_var_id; + int tmp_han_priority = 0; + int tmp_han_origin = 0; + mca_base_var_find_by_name("coll_han_priority", &han_var_id); + mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); + tmp_han_origin = *origin_priority; + mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, + NULL); + comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; + comm->c_coll->coll_allgather = ompi_coll_base_allgather_intra_bruck; + + int 
var_id; + int tmp_priority = 100; + int tmp_origin = 0; + /* Set up low_comms[0] with sm module */ + mca_base_var_find_by_name("coll_sm_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] sm_priority origin %d %d\n", w_rank, *origin_priority, + tmp_origin)); + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), + &(low_comms[0])); + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + low_size = ompi_comm_size(low_comms[0]); + low_rank = ompi_comm_rank(low_comms[0]); + + /* Set up low_comms[1] with solo module */ + mca_base_var_find_by_name("coll_solo_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] solo_priority origin %d %d\n", w_rank, *origin_priority, + tmp_origin)); + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), + &(low_comms[1])); + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + + /* Create up_comms[0] with libnbc which contain one process per node (across nodes) */ + mca_base_var_find_by_name("coll_libnbc_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] libnbc_priority origin %d %d\n", w_rank, *origin_priority, + tmp_origin)); + mca_base_var_set_flag(var_id, 
MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + ompi_comm_split(comm, low_rank, w_rank, &(up_comms[0]), false); + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + up_rank = ompi_comm_rank(up_comms[0]); + + /* Create up_comms[1] with adapt which contain one process per node (across nodes) */ + mca_base_var_find_by_name("coll_adapt_priority", &var_id); + mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); + tmp_origin = *origin_priority; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] adapt_priority origin %d %d\n", w_rank, *origin_priority, + tmp_origin)); + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + ompi_comm_split(comm, low_rank, w_rank, &(up_comms[1]), false); + mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); + + int *vranks = malloc(sizeof(int) * w_size); + /* Do allgather to gather vrank from each process so every process knows other processes' vrank */ + int vrank = low_size * up_rank + low_rank; + ompi_coll_base_allgather_intra_bruck(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + han_module->cached_comm = comm; + han_module->cached_low_comms = low_comms; + han_module->cached_up_comms = up_comms; + han_module->cached_vranks = vranks; + + mca_base_var_set_value(han_var_id, &tmp_han_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, + NULL); + comm->c_coll->coll_allreduce = mca_coll_han_allreduce_intra; + comm->c_coll->coll_allgather = mca_coll_han_allgather_intra; + } +} + +int mca_coll_han_pow10_int(int pow_value) +{ + int i, result = 1; + for (i = 0; i < pow_value; i++) { + result *= 10; + } + return result; +} + +int mca_coll_han_hostname_to_number(char *hostname, int size) +{ + int i = 0, j = 0; + char 
*number_array = (char *) malloc(sizeof(char) * size); + while (hostname[i] != '\0') { + if (hostname[i] >= '0' && hostname[i] <= '9') { + number_array[j++] = hostname[i]; + } + i++; + } + int number = 0; + for (i = 0; i < j; i++) { + number += (number_array[i] - '0') * mca_coll_han_pow10_int(j - 1 - i); + } + free(number_array); + return number; +} + +void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level) +{ + int *self_topo = (int *) malloc(sizeof(int) * num_topo_level); + /* Set daemon vpid */ + char hostname[1024]; + gethostname(hostname, 1024); + self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); + /* Set core id */ + self_topo[1] = ompi_comm_rank(comm); + + /* Allgather all the topology information */ + ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, topo, num_topo_level, + MPI_INT, comm, comm->c_coll->coll_allgather_module); + free(self_topo); + return; +} + +void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level) +{ + if (level > num_topo_level - 1 || start >= end) { + return; + } + int i, j; + int min = INT_MAX; + int min_loc = -1; + for (i = start; i <= end; i++) { + /* Find min */ + for (j = i; j <= end; j++) { + if (topo[j * num_topo_level + level] < min) { + min = topo[j * num_topo_level + level]; + min_loc = j; + + } + } + /* Swap i and min_loc */ + int temp; + for (j = 0; j < num_topo_level; j++) { + temp = topo[i * num_topo_level + j]; + topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; + topo[min_loc * num_topo_level + j] = temp; + } + min = INT_MAX; + min_loc = -1; + } + int last = 0; + int new_start = 0; + int new_end = 0; + for (i = start; i <= end; i++) { + if (i == start) { + last = topo[i * num_topo_level + level]; + new_start = start; + } else if (i == end) { + new_end = end; + mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); + } else if (last != topo[i * num_topo_level + 
level]) { + new_end = i - 1; + mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); + new_start = i; + last = topo[i * num_topo_level + level]; + } + } + return; +} + +/* Check if the current processes are mapped by core */ +bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t * comm, + int num_topo_level) +{ + int i; + int size = ompi_comm_size(comm); + for (i = 1; i < size; i++) { + if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level] + || topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { + return false; + + } + } + return true; +} + +int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, + int num_topo_level) +{ + int size; + size = ompi_comm_size(comm); + int *topo; + if ((han_module->cached_topo != NULL) && (han_module->cached_comm == comm)) { + topo = han_module->cached_topo; + } + else { + if (han_module->cached_topo != NULL) { + free(han_module->cached_topo); + han_module->cached_topo = NULL; + } + topo = (int *) malloc(sizeof(int) * size * num_topo_level); + /* Get topo information */ + mca_coll_han_topo_get(topo, comm, num_topo_level); + mca_coll_han_topo_print(topo, comm, num_topo_level); + + /* Check if the processes are mapped by core */ + han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); + /* Sort the topo such that each group is contiguous */ + if (!han_module->is_mapbycore) { + mca_coll_han_topo_sort(topo, 0, size - 1, size, 0, num_topo_level); + } + han_module->cached_topo = topo; + han_module->cached_comm = comm; + } + + mca_coll_han_topo_print(topo, comm, num_topo_level); + return topo; +} + +/* Print out the topology info, for debugging purposes */ +void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + if (rank == 0) { + OPAL_OUTPUT_VERBOSE((30,
mca_coll_han_component.han_output, "[%d]: HAN topo: ", rank)); + int i; + for (i = 0; i < size * num_topo_level; i++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); + + } +} diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c new file mode 100644 index 0000000000..f6137a8cd0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, + int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) +{ + argu->cur_task = cur_task; + argu->sbuf = sbuf; + argu->rbuf = rbuf; + argu->seg_count = seg_count; + argu->dtype = dtype; + argu->op = op; + argu->root_low_rank = root_low_rank; + argu->root_up_rank = root_up_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->num_segments = num_segments; + argu->cur_seg = cur_seg; + argu->w_rank = w_rank; + argu->last_seg_count = last_seg_count; + argu->noop = noop; +} + +/* + * Each segment of the message needs to go through 2 steps to perform MPI_Reduce: + * lr: low level (shared-memory or intra-node) reduce. + * ur: upper level (inter-node) reduce. + * Hence, in each iteration, there is a combination of collective operations which is called a task.
+ * | seg 0 | seg 1 | seg 2 | seg 3 | + * iter 0 | lr | | | | task: t0, contains lr + * iter 1 | ur | lr | | | task: t1, contains ur and lr + * iter 2 | | ur | lr | | task: t1, contains ur and lr + * iter 3 | | | ur | lr | task: t1, contains ur and lr + * iter 4 | | | | ur | task: t1, contains ur + */ +int +mca_coll_han_reduce_intra(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t* op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) +{ + ptrdiff_t extent, lb; + ompi_datatype_get_extent(dtype, &lb, &extent); + int w_rank; + w_rank = ompi_comm_rank(comm); + int seg_count = count; + size_t typelng; + ompi_datatype_type_size(dtype, &typelng); + + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, typelng, + seg_count); + + int num_segments = (count + seg_count - 1) / seg_count; + OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, + "In HAN seg_count %d count %d num_seg %d\n", + seg_count, count, num_segments)); + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + + int root_low_rank; + int root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, + root_up_rank)); + + /* Create t0 tasks for the first segment */ + mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); + /* Setup up t0 task arguments */ + 
mca_reduce_argu_t *t = malloc(sizeof(mca_reduce_argu_t)); + mac_coll_han_set_reduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, + op, root_up_rank, root_low_rank, up_comm, low_comm, + num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, + low_rank != root_low_rank); + /* Init the first task */ + init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); + issue_task(t0); + + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); + issue_task(t1); + + while (t->cur_seg <= t->num_segments - 2) { + /* Create t1 task */ + mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); + /* Setup up t1 task arguments */ + t->cur_task = t1; + t->sbuf = (char *) t->sbuf + extent * t->seg_count; + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + t->cur_seg = t->cur_seg + 1; + /* Init the t1 task */ + init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); + issue_task(t1); + } + + free(t); + + return OMPI_SUCCESS; +} + +/* t0 task: issue and wait for the low level reduce of segment 0 */ +int mca_coll_han_reduce_t0_task(void *task_argu) +{ + mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + ompi_datatype_get_extent(t->dtype, &lb, &extent); + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + return OMPI_SUCCESS; +} + +/* t1 task */ +int mca_coll_han_reduce_t1_task(void *task_argu) { + mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, + t->cur_seg)); + OBJ_RELEASE(t->cur_task); + ptrdiff_t extent, lb; + 
ompi_datatype_get_extent(t->dtype, &lb, &extent); + ompi_request_t *ireduce_req = NULL; + int tmp_count = t->seg_count; + if (!t->noop) { + int up_rank = ompi_comm_rank(t->up_comm); + /* ur of cur_seg */ + if (up_rank == t->root_up_rank) { + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_up_rank, t->up_comm, &ireduce_req, + t->up_comm->c_coll->coll_ireduce_module); + } else { + t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + t->dtype, t->op, t->root_up_rank, t->up_comm, + &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); + } + } + /* lr of cur_seg+1 */ + if (t->cur_seg <= t->num_segments - 2) { + if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } + t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, + (char *) t->rbuf + extent * t->seg_count, tmp_count, + t->dtype, t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + + } + if (!t->noop && ireduce_req) { + ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + } + + return OMPI_SUCCESS; +} \ No newline at end of file diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c new file mode 100644 index 0000000000..90d92659e1 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/mca/pml/pml.h" +#include "coll_han_trigger.h" + +/* Only work with regular situation (each node has equal number of processes) */ + +void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) +{ + argu->cur_task = cur_task; + argu->sbuf = sbuf; + argu->sbuf_inter_free = sbuf_inter_free; + argu->sbuf_reorder_free = sbuf_reorder_free; + argu->scount = scount; + argu->sdtype = sdtype; + argu->rbuf = rbuf; + argu->rcount = rcount; + argu->rdtype = rdtype; + argu->root = root; + argu->root_up_rank = root_up_rank; + argu->root_low_rank = root_low_rank; + argu->up_comm = up_comm; + argu->low_comm = low_comm; + argu->w_rank = w_rank; + argu->noop = noop; + argu->req = req; +} + +int +mca_coll_han_scatter_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) +{ + int i, j; + int w_rank, w_size; + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + /* Create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + ompi_communicator_t *up_comm = + 
han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_size = ompi_comm_size(up_comm); + + ompi_request_t *temp_request = NULL; + /* Set up request */ + temp_request = OBJ_NEW(ompi_request_t); + OMPI_REQUEST_INIT(temp_request, false); + temp_request->req_state = OMPI_REQUEST_ACTIVE; + temp_request->req_type = 0; + temp_request->req_free = han_request_free; + temp_request->req_status.MPI_SOURCE = 0; + temp_request->req_status.MPI_TAG = 0; + temp_request->req_status.MPI_ERROR = 0; + temp_request->req_status._cancelled = 0; + temp_request->req_status._ucount = 0; + + int root_low_rank; + int root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, + root, root_low_rank, root_up_rank)); + + /* Reorder sbuf based on rank. 
+ * Suppose the message is 0 1 2 3 4 5 6 7 + * and the processes are mapped on 2 nodes (the processes on node 0 are 0 2 4 6 and the processes on node 1 are 1 3 5 7), + * so the message needs to be reordered to 0 2 4 6 1 3 5 7 + */ + char *reorder_buf = NULL; + char *reorder_sbuf = NULL; + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + if (w_rank == root) { + /* If the processes are mapped by core, no need to reorder */ + if (han_module->is_mapbycore) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter is_bycore: ", w_rank)); + reorder_sbuf = (char *) sbuf; + } else { + ptrdiff_t ssize, sgap = 0, sextent; + ompi_datatype_type_extent(sdtype, &sextent); + ssize = opal_datatype_span(&sdtype->super, (int64_t) scount * w_size, &sgap); + reorder_buf = (char *) malloc(ssize); + reorder_sbuf = reorder_buf - sgap; + for (i = 0; i < up_size; i++) { + for (j = 0; j < low_size; j++) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Scatter copy from %d %d\n", w_rank, + (i * low_size + j) * 2 + 1, + topo[(i * low_size + j) * 2 + 1])); + ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t) scount, + reorder_sbuf + sextent * (i * low_size + + j) * + (ptrdiff_t) scount, + (char *) sbuf + + sextent * + (ptrdiff_t) topo[(i * low_size + j) * 2 + + 1] * (ptrdiff_t) scount); + } + } + } + } + + /* Create us task */ + mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); + /* Setup us task arguments */ + mca_scatter_argu_t *us_argu = malloc(sizeof(mca_scatter_argu_t)); + mac_coll_han_set_scatter_argu(us_argu, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, + (char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, + up_comm, low_comm, w_rank, low_rank != root_low_rank, + temp_request); + /* Init us task */ + init_task(us, mca_coll_han_scatter_us_task, (void *) (us_argu)); + /* Issue us task */ + issue_task(us); + + ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); + return OMPI_SUCCESS; + +} + +/*
us: upper level (inter-node) scatter task */ +int mca_coll_han_scatter_us_task(void *task_argu) +{ + mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; + OBJ_RELEASE(t->cur_task); + + if (t->noop) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", + t->w_rank)); + } else { + int low_size = ompi_comm_size(t->low_comm); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + char *tmp_buf = (char *) malloc(rsize); + char *tmp_rbuf = tmp_buf - rgap; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Han Scatter: us scatter\n", t->w_rank)); + /* Inter node scatter */ + t->up_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount * low_size, t->sdtype, + tmp_rbuf, t->rcount * low_size, t->rdtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_scatter_module); + t->sbuf = tmp_rbuf; + t->sbuf_inter_free = tmp_buf; + } + + if (t->sbuf_reorder_free != NULL && t->root == t->w_rank) { + free(t->sbuf_reorder_free); + t->sbuf_reorder_free = NULL; + } + /* Create ls tasks for the current union segment */ + mca_coll_task_t *ls = OBJ_NEW(mca_coll_task_t); + /* Setup up ls task arguments */ + t->cur_task = ls; + /* Init ls task */ + init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); + /* Issue ls task */ + issue_task(ls); + + return OMPI_SUCCESS; +} + +/* ls: lower level (shared memory) scatter task */ +int mca_coll_han_scatter_ls_task(void *task_argu) +{ + mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", + t->w_rank)); + OBJ_RELEASE(t->cur_task); + /* Shared memory scatter */ + t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, + t->rcount, t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_scatter_module); + + if (t->sbuf_inter_free != NULL && t->noop != true) { +
free(t->sbuf_inter_free); + t->sbuf_inter_free = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls finish\n", + t->w_rank)); + ompi_request_t *temp_req = t->req; + free(t); + ompi_request_complete(temp_req, 1); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_trigger.c b/ompi/mca/coll/han/coll_han_trigger.c new file mode 100644 index 0000000000..458a1da52e --- /dev/null +++ b/ompi/mca/coll/han/coll_han_trigger.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han_trigger.h" + +static void mca_coll_task_constructor(mca_coll_task_t * t) +{ + t->func_ptr = NULL; + t->func_argu = NULL; +} + +static void mca_coll_task_destructor(mca_coll_task_t * t) +{ + t->func_ptr = NULL; + t->func_argu = NULL; +} + +OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor, + mca_coll_task_destructor); + +/* Init task */ +int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu) +{ + t->func_ptr = func_ptr; + t->func_argu = func_argu; + return OMPI_SUCCESS; +} + +/* Issue the task */ +int issue_task(mca_coll_task_t * t) +{ + t->func_ptr(t->func_argu); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h new file mode 100644 index 0000000000..c7314d25fb --- /dev/null +++ b/ompi/mca/coll/han/coll_han_trigger.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H +#define MCA_COLL_HAN_TRIGGER_EXPORT_H + +#include "ompi_config.h" +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/communicator/communicator.h" +#include "ompi/win/win.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "opal/util/info.h" +#include "ompi/op/op.h" +#include "opal/runtime/opal_progress.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/base/coll_tags.h" + +typedef int (*task_func_ptr) (void *); + +struct mca_coll_task_s { + opal_object_t super; + task_func_ptr func_ptr; + void *func_argu; +}; + +typedef struct mca_coll_task_s mca_coll_task_t; + +OBJ_CLASS_DECLARATION(mca_coll_task_t); + +/* Init task */ +int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu); + +/* Issue the task */ +int issue_task(mca_coll_task_t * t); + +#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_utils.c b/ompi/mca/coll/han/coll_han_utils.c new file mode 100644 index 0000000000..293777a256 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_utils.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "coll_han.h" + +/* Get root's low_rank and up_rank from vranks array */ +void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, + int *root_up_rank) +{ + *root_up_rank = vranks[root] / low_size; + *root_low_rank = vranks[root] % low_size; +} + +uint32_t han_auto_tuned_get_n(uint32_t n) +{ + uint32_t avail[5] = { 4, 8, 16, 32, 64 }; + uint32_t i; + for (i = 0; i < 5; i++) { + if (avail[i] >= n) { + return i; + } + } + return i - 1; +} + +uint32_t han_auto_tuned_get_c(uint32_t c) +{ + uint32_t avail[3] = { 4, 8, 12 }; + uint32_t i; + for (i = 0; i < 3; i++) { + if (avail[i] >= c) { + return i; + } + } + return i - 1; +} + +uint32_t han_auto_tuned_get_m(uint32_t m) +{ + uint32_t avail[21] = + { 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, +262144, 524288, 1048576, 2097152, 4194304 }; + uint32_t i; + for (i = 0; i < 21; i++) { + if (avail[i] >= m) { + return i; + } + } + return i - 1; +} From 220b997a580eff9f9046fb6787a76f33a406cfd3 Mon Sep 17 00:00:00 2001 From: bsergentm Date: Sat, 9 May 2020 18:16:32 +0200 Subject: [PATCH 2/4] Coll/han Bull * first import of Bull specific modifications to HAN * Cleaning, renaming and compilation fixing Changed all future into han. * Import BULL specific modifications in coll/tuned and coll/base * Fixed compilation issues in Han * Changed han_output to directly point to coll framework output. 
* The verbosity MCA parameter was removed as a duplicated of coll verbosity * Add fallback in han reduce when op cannot commute and ppn are imbalanced * Added fallback wfor han bcast when nodes do not have the same number of process * Add fallback in han scatter when ppn are imbalanced + fixed missing scatter_fn pointer in the module interface Signed-off-by: Brelle Emmanuel Co-authored-by: a700850 Co-authored-by: germainf --- ompi/mca/coll/base/coll_base_comm_select.c | 44 +- ompi/mca/coll/base/coll_base_comm_unselect.c | 15 + ompi/mca/coll/base/coll_base_util.c | 36 + ompi/mca/coll/base/coll_base_util.h | 17 + ompi/mca/coll/coll.h | 4 + ompi/mca/coll/han/Makefile.am | 6 + ompi/mca/coll/han/coll_han.h | 228 ++- ompi/mca/coll/han/coll_han_allgather.c | 128 +- ompi/mca/coll/han/coll_han_allreduce.c | 156 +- ompi/mca/coll/han/coll_han_bcast.c | 72 +- ompi/mca/coll/han/coll_han_component.c | 261 +++- ompi/mca/coll/han/coll_han_dynamic.c | 1338 +++++++++++++++++ ompi/mca/coll/han/coll_han_dynamic.h | 214 +++ ompi/mca/coll/han/coll_han_dynamic_file.c | 690 +++++++++ ompi/mca/coll/han/coll_han_dynamic_file.h | 111 ++ ompi/mca/coll/han/coll_han_gather.c | 339 ++++- ompi/mca/coll/han/coll_han_module.c | 521 +++---- ompi/mca/coll/han/coll_han_reduce.c | 200 ++- ompi/mca/coll/han/coll_han_scatter.c | 24 +- ompi/mca/coll/han/coll_han_subcomms.c | 490 ++++++ ompi/mca/coll/han/coll_han_topo.c | 347 +++++ ompi/mca/coll/tuned/coll_tuned_dynamic_file.c | 43 +- 22 files changed, 4775 insertions(+), 509 deletions(-) create mode 100644 ompi/mca/coll/han/coll_han_dynamic.c create mode 100644 ompi/mca/coll/han/coll_han_dynamic.h create mode 100644 ompi/mca/coll/han/coll_han_dynamic_file.c create mode 100644 ompi/mca/coll/han/coll_han_dynamic_file.h create mode 100644 ompi/mca/coll/han/coll_han_subcomms.c create mode 100644 ompi/mca/coll/han/coll_han_topo.c diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index b853f1ad26..405bd6b388 
100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -21,6 +21,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,20 +45,12 @@ #include "opal/mca/base/base.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" - +#include "ompi/mca/coll/base/coll_base_util.h" /* - * Local types + * Stuff for the OBJ interface */ -struct avail_coll_t { - opal_list_item_t super; - - int ac_priority; - mca_coll_base_module_2_3_0_t *ac_module; - const char * ac_component_name; -}; -typedef struct avail_coll_t avail_coll_t; - +OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL); /* * Local functions @@ -77,12 +70,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t * int *priority, mca_coll_base_module_2_3_0_t ** module); -/* - * Stuff for the OBJ interface - */ -static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL); - - #define COPY(module, comm, func) \ do { \ if (NULL != module->coll_ ## func) { \ @@ -138,11 +125,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) /* FIX ME - Do some kind of collective operation to find a module that everyone has available */ + /* List to store every valid module */ + comm->c_coll->module_list = OBJ_NEW(opal_list_t); + /* do the selection loop */ for (item = opal_list_remove_first(selectable); NULL != item; item = opal_list_remove_first(selectable)) { - avail_coll_t *avail = (avail_coll_t *) item; + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; /* initialize the module */ ret = avail->ac_module->coll_module_enable(avail->ac_module, comm); @@ -153,6 +143,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) (OMPI_SUCCESS == ret ? 
"Enabled": "Disabled") ); if (OMPI_SUCCESS == ret) { + /* Save every component that is initialized, + * queried and enabled successfully */ + opal_list_append(comm->c_coll->module_list, &avail->super); /* copy over any of the pointers */ COPY(avail->ac_module, comm, allgather); @@ -230,10 +223,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) COPY(avail->ac_module, comm, neighbor_alltoallw_init); COPY(avail->ac_module, comm, reduce_local); + } else { + /* release the original module reference and the list item */ + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); } - /* release the original module reference and the list item */ - OBJ_RELEASE(avail->ac_module); - OBJ_RELEASE(avail); } /* Done with the list from the check_components() call so release it. */ @@ -306,8 +300,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) static int avail_coll_compare (opal_list_item_t **a, opal_list_item_t **b) { - avail_coll_t *acoll = (avail_coll_t *) *a; - avail_coll_t *bcoll = (avail_coll_t *) *b; + mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a; + mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b; if (acoll->ac_priority > bcoll->ac_priority) { return 1; @@ -332,7 +326,7 @@ static opal_list_t *check_components(opal_list_t * components, mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; - avail_coll_t *avail; + mca_coll_base_avail_coll_t *avail; /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -345,7 +339,7 @@ static opal_list_t *check_components(opal_list_t * components, if (priority >= 0) { /* We have a component that indicated that it wants to run by giving us a module */ - avail = OBJ_NEW(avail_coll_t); + avail = OBJ_NEW(mca_coll_base_avail_coll_t); avail->ac_priority = priority; avail->ac_module = module; // Point to the string so we don't have to free later diff --git 
a/ompi/mca/coll/base/coll_base_comm_unselect.c b/ompi/mca/coll/base/coll_base_comm_unselect.c index fea0a53ec7..0e0f1bb5bf 100644 --- a/ompi/mca/coll/base/coll_base_comm_unselect.c +++ b/ompi/mca/coll/base/coll_base_comm_unselect.c @@ -16,6 +16,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_util.h" #define CLOSE(comm, func) \ do { \ @@ -50,6 +52,8 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) { + opal_list_item_t *item; + CLOSE(comm, allgather); CLOSE(comm, allgatherv); CLOSE(comm, allreduce); @@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm) CLOSE(comm, reduce_local); + for (item = opal_list_remove_first(comm->c_coll->module_list); + NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) { + mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item; + + if(avail->ac_module) { + OBJ_RELEASE(avail->ac_module); + } + OBJ_RELEASE(avail); + } + OBJ_RELEASE(comm->c_coll->module_list); + free(comm->c_coll); comm->c_coll = NULL; diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 91dd677dbc..29b4a70cac 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -305,3 +305,39 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) { } OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL); + +/* File reading functions */ +static void skiptonewline (FILE *fptr, int *fileline) +{ + do { + char val; + int rc; + + rc = fread(&val, 1, 1, fptr); + if (0 == rc) return; + if ((1 == rc)&&('\n' == val)) { + (*fileline)++; + return; + 
} + } while (1); +} + +long ompi_coll_base_file_getnext (FILE *fptr, int *fileline) +{ + do { + long val; + int rc; + char trash; + + rc = fscanf(fptr, "%li", &val); + if (rc == EOF) return MYEOF; + if (1 == rc) return val; + /* in all other cases, skip to the end */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) return MYEOF; + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 05eaa41953..239322b022 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve) typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t; +/* + * Structure to store an available module + */ +struct mca_coll_base_avail_coll_t { + opal_list_item_t super; + + int ac_priority; + mca_coll_base_module_t *ac_module; + const char * ac_component_name; +}; +typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t); + /** * A MPI_like function doing a send and a receive simultaneously. * If one of the communications results in a zero-byte message the @@ -164,5 +177,9 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const stypes[], ompi_datatype_t * const rtypes[]); +/* File reading function */ +#define MYEOF -999 +long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); + END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/coll.h b/ompi/mca/coll/coll.h index f852f26732..57e4af4ac0 100644 --- a/ompi/mca/coll/coll.h +++ b/ompi/mca/coll/coll.h @@ -19,6 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. 
+ * Copyright (c) 2020 BULL S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t { mca_coll_base_module_reduce_local_fn_t coll_reduce_local; mca_coll_base_module_2_3_0_t *coll_reduce_local_module; + + /* List of modules initialized, queried and enabled */ + opal_list_t *module_list; }; typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t; diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index 380b44d615..55892512e3 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -12,6 +12,8 @@ sources = \ coll_han.h \ coll_han_trigger.h \ +coll_han_dynamic.h \ +coll_han_dynamic_file.h \ coll_han_bcast.c \ coll_han_reduce.c \ coll_han_scatter.c \ @@ -21,6 +23,10 @@ coll_han_allgather.c \ coll_han_component.c \ coll_han_module.c \ coll_han_trigger.c \ +coll_han_dynamic.c \ +coll_han_dynamic_file.c \ +coll_han_topo.c \ +coll_han_subcomms.c \ coll_han_utils.c # Make the output library in this directory, and name it either diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 307fa52044..1af75ffec3 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -16,19 +17,23 @@ #include "mpi.h" #include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/communicator/communicator.h" -#include "ompi/include/mpi.h" -#include "ompi/mca/coll/base/coll_base_functions.h" -#include "opal/util/info.h" -#include "ompi/op/op.h" -#include "opal/runtime/opal_progress.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" -BEGIN_C_DECLS typedef struct { +BEGIN_C_DECLS + +/* + * Today; + * . only 2 modules available for intranode (low) level + * . only 2 modules available for internode (up) level + */ + +#define COLL_HAN_LOW_MODULES 2 +#define COLL_HAN_UP_MODULES 2 + +typedef struct { uint32_t umod; uint32_t lmod; uint32_t fs; @@ -200,6 +205,10 @@ typedef struct mca_coll_han_component_t { uint32_t han_scatter_low_module; /* whether enable auto tune */ uint32_t han_auto_tune; + /* whether we need reproducible results + * (but disables topological optimisations) + */ + uint32_t han_reproducible; /* create a 3D array * num_processes (n): 2 4 8 16 32 64 (6) * num_core (c): 2 4 8 12 (4) @@ -209,8 +218,42 @@ typedef struct mca_coll_han_component_t { uint32_t han_auto_tune_c; uint32_t han_auto_tune_m; selection *han_auto_tuned; + bool use_simple_algorithm[COLLCOUNT]; + + /* Dynamic configuration rules */ + bool use_dynamic_file_rules; + bool dump_dynamic_rules; + char* dynamic_rules_filename; + /* Dynamic rules from file */ + mca_coll_han_dynamic_rules_t dynamic_rules; + /* Dynamic rules from mca parameter */ + COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; + int topo_level; + + /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ + int max_dynamic_errors; } mca_coll_han_component_t; +typedef void (*previous_dummy_fn_t) (void); + +/* + * Structure used to store what is 
necessary for the collective operations + * routines in case of fallback. + */ +typedef struct collective_fallback_t { + union { + mca_coll_base_module_allgather_fn_t allgather; + mca_coll_base_module_allgatherv_fn_t allgatherv; + mca_coll_base_module_allreduce_fn_t allreduce; + mca_coll_base_module_bcast_fn_t bcast; + mca_coll_base_module_gather_fn_t gather; + mca_coll_base_module_reduce_fn_t reduce; + mca_coll_base_module_scatter_fn_t scatter; + previous_dummy_fn_t dummy; + } previous_routine; + mca_coll_base_module_t *previous_module; +} collective_fallback_t; + /** Coll han module */ typedef struct mca_coll_han_module_t { /** Base module */ @@ -225,9 +268,56 @@ typedef struct mca_coll_han_module_t { int *cached_vranks; int *cached_topo; bool is_mapbycore; + bool are_ppn_imbalanced; + + /* To be able to fallback when the cases are not supported */ + struct collective_fallback_t previous_routines[COLLCOUNT]; + + /* To be able to fallback on reproducible algorithm */ + mca_coll_base_module_reduce_fn_t reproducible_reduce; + mca_coll_base_module_t *reproducible_reduce_module; + mca_coll_base_module_allreduce_fn_t reproducible_allreduce; + mca_coll_base_module_t *reproducible_allreduce_module; + + /* Topological level of this communicator */ + int topologic_level; + + /* Collective module storage for module choice */ + mca_coll_han_collective_modules_storage_t modules_storage; + bool storage_initialized; + + /* + * Number of dynamic errors encountered + * The first mca_coll_han_component.max_dynamic_errors + * of rank 0 are printed with verbosity = 0 + */ + int dynamic_errors; + + /* Sub-communicator */ + struct ompi_communicator_t *sub_comm[NB_TOPO_LVL]; } mca_coll_han_module_t; OBJ_CLASS_DECLARATION(mca_coll_han_module_t); +/* + * Some defines to stick to the naming used in the other components in terms of + * fallback routines + */ +#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather +#define previous_allgatherv 
previous_routines[ALLGATHERV].previous_routine.allgatherv +#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce +#define previous_bcast previous_routines[BCAST].previous_routine.bcast +#define previous_gather previous_routines[GATHER].previous_routine.gather +#define previous_reduce previous_routines[REDUCE].previous_routine.reduce +#define previous_scatter previous_routines[SCATTER].previous_routine.scatter + +#define previous_allgather_module previous_routines[ALLGATHER].previous_module +#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module +#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module +#define previous_bcast_module previous_routines[BCAST].previous_module +#define previous_gather_module previous_routines[GATHER].previous_module +#define previous_reduce_module previous_routines[REDUCE].previous_module +#define previous_scatter_module previous_routines[SCATTER].previous_module + /** * Global component instance */ @@ -244,17 +334,10 @@ int han_request_free(ompi_request_t ** request); /* Subcommunicator creation */ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); - +void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); /* Gather topology information */ -int mca_coll_han_pow10_int(int pow_value); -int mca_coll_han_hostname_to_number(char *hostname, int size); -void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level); -void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level); -bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t *comm, - int num_topo_level); int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); -void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level); /* Utils */ 
void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, @@ -263,8 +346,47 @@ uint32_t han_auto_tuned_get_n(uint32_t n); uint32_t han_auto_tuned_get_c(uint32_t c); uint32_t han_auto_tuned_get_m(uint32_t m); +const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll); +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); + +/** Dynamic component choice */ +/* + * Get all the collective modules initialized on this communicator + * This function must be call at the start of every selector implementation + */ +int +mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module); + +int +mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allgatherv_intra_dynamic(ALLGATHERV_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_intra_dynamic(ALLREDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_bcast_intra_dynamic(BCAST_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_gather_intra_dynamic(GATHER_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_intra_dynamic(REDUCE_BASE_ARGS, + mca_coll_base_module_t *module); +int +mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, + mca_coll_base_module_t *module); /* Bcast */ +int mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, int seg_count, struct ompi_datatype_t *dtype, int root_up_rank, int root_low_rank, @@ -278,6 +400,30 @@ int mca_coll_han_bcast_t0_task(void *task_argu); int mca_coll_han_bcast_t1_task(void *task_argu); /* Reduce */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + + + void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, int seg_count, struct ompi_datatype_t *dtype, @@ -301,6 +447,26 @@ int mca_coll_han_reduce_t0_task(void *task_argu); int mca_coll_han_reduce_t1_task(void *task_argu); /* Allreduce */ +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, @@ -355,7 +521,7 @@ void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, struct ompi_communicator_t *low_comm, int w_rank, bool noop, ompi_request_t * req); -/* Gatter */ +/* Gather */ int mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, @@ -380,6 +546,23 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, int w_rank, bool noop, ompi_request_t * req); +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct 
ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +/* reordering after gather, for unordered ranks */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); + + /* Allgather */ int @@ -405,6 +588,13 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, struct ompi_communicator_t *low_comm, int w_rank, bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); END_C_DECLS #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index 2f0e3c45bd..50702d28ff 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +29,10 @@ void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, int w_rank, - bool noop, bool is_mapbycore, int *topo, ompi_request_t * req) + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) { argu->cur_task = cur_task; argu->sbuf = sbuf; @@ -53,18 +57,17 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { int w_rank; w_rank = ompi_comm_rank(comm); /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_allgather_low_module]; - ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_allgather_up_module]; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); ompi_request_t *temp_request = NULL; @@ -160,7 +163,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu) } else { ptrdiff_t rsize, rgap = 0; rsize = - opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size * up_size, + opal_datatype_span(&t->rdtype->super, + (int64_t) t->rcount * low_size * up_size, &rgap); reorder_buf = (char *) malloc(rsize); reorder_rbuf = reorder_buf - rgap; @@ -190,7 +194,8 @@ int mca_coll_han_allgather_uag_task(void *task_argu) "[%d]: HAN Allgather copy from %d %d\n", t->w_rank, (i * low_size + j) * 2 + 1, t->topo[(i * low_size + j) * 2 + 1])); - 
ompi_datatype_copy_content_same_ddt(t->rdtype, (ptrdiff_t) t->rcount, + ompi_datatype_copy_content_same_ddt(t->rdtype, + (ptrdiff_t) t->rcount, (char *) t->rbuf + rextent * (ptrdiff_t) t->topo[(i * low_size + j) * 2 + @@ -238,3 +243,108 @@ int mca_coll_han_allgather_lb_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ + + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + /* discovery topology */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, + comm, han_module->previous_allgather_module); + } + + /* setup up/low coordinates */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); + int up_size = ompi_comm_size(up_comm); + int root_low_rank = 0; // node leader will be 0 on each rank + + /* allocate the intermediary buffer + * to gather on leaders on the low sub communicator */ + char *tmp_buf = NULL; + char *tmp_buf_start = NULL; + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + /* Compute the size to receive all the local data, including datatypes empty gaps */ + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); + // intermediary buffer on node leaders to gather on low comm + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + } + + /* 1. low gather on node leaders into tmp_buf */ + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + /* 2. 
allgather between node leaders, from tmp_buf to reorder_buf */ + if (low_rank == root_low_rank) { + /* allocate buffer to store unordered result on node leaders + * * if the processes are mapped-by core, no need to reorder: + * * distribution of ranks on core first and node next, + * * in a increasing order for both patterns */ + char *reorder_buf = NULL; + char *reorder_buf_start = NULL; + if (han_module->is_mapbycore) { + reorder_buf_start = rbuf; + } else { + if (0 == low_rank && 0 == up_rank) { // first rank displays message + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Allgather needs reordering: ", w_rank)); + } + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); + reorder_buf = (char *) malloc(rsize); + reorder_buf_start = reorder_buf - rgap; + } + + /* 2a. inter node allgather */ + up_comm->c_coll->coll_allgather(tmp_buf_start, scount*low_size, sdtype, + reorder_buf_start, rcount*low_size, rdtype, + up_comm, up_comm->c_coll->coll_allgather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + + /* 2b. reorder the node leader's into rbuf. + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (!han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + reorder_buf = NULL; + } + + } + + /* 3. 
up broadcast: leaders broadcast on their nodes */ + low_comm->c_coll->coll_bcast(rbuf, rcount*low_size*up_size, rdtype, + root_low_rank, low_comm, + low_comm->c_coll->coll_bcast_module); + + + return OMPI_SUCCESS; + } diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 629b93a1c9..6a4fd6038f 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -2,6 +2,8 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -78,6 +80,17 @@ mca_coll_han_allreduce_intra(const void *sbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { + // Fallback to another component if the op cannot commute + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + if (! ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator." 
+ "It need to fall back on another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); + } + + ptrdiff_t extent, lb; ompi_datatype_get_extent(dtype, &lb, &extent); int w_rank; @@ -87,7 +100,6 @@ mca_coll_han_allreduce_intra(const void *sbuf, ompi_datatype_type_size(dtype, &typelng); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -393,3 +405,145 @@ int mca_coll_han_allreduce_t3_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_allreduce_intra_simple(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + ompi_communicator_t *low_comm; + ompi_communicator_t *up_comm; + int root_low_rank = 0; + int low_rank; + int ret; + mca_coll_han_component_t *cs = &mca_coll_han_component; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + OPAL_OUTPUT_VERBOSE((10, cs->han_output, + "[OMPI][han] in mca_coll_han_reduce_intra_simple\n")); + + // Fallback to another component if the op cannot commute + if (! ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "han cannot handle allreduce with this operation." 
+ "It need to fall back on another component\n")); + goto prev_allreduce; + } + + mca_coll_han_comm_create_new(comm, han_module); + + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + low_rank = ompi_comm_rank(low_comm); + + /* Low_comm reduce */ + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm reduce failed. " + "Falling back to another component\n")); + goto prev_allreduce; + } + + /* Local roots perform a allreduce on the upper comm */ + if (low_rank == root_low_rank) { + ret = up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, rbuf, count, dtype, op, + up_comm, up_comm->c_coll->coll_allreduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: up comm allreduce failed. \n")); + /* + * Do not fallback in such a case: only root_low_ranks follow this + * path, the other ranks are in another collective. + * ==> Falling back would potentially lead to a hang. + * Simply return the error + */ + return ret; + } + } + + /* Low_comm bcast */ + ret = low_comm->c_coll->coll_bcast(rbuf, count, dtype, + root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((30, cs->han_output, + "HAN/ALLREDUCE: low comm bcast failed. 
" + "Falling back to another component\n")); + goto prev_allreduce; + } + + return OMPI_SUCCESS; + +prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, + han_module->previous_allreduce_module); +} + +/* Find a fallback on reproducible algorithm + * use tuned, or if impossible whatever available + */ +int +mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules*/ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i=0; i<fallbacks_len; i++) { + int fallback = fallbacks[i]; + mca_coll_base_module_t *fallback_module = han_module->modules_storage + .modules[fallback] + .module_handler; + if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible: " + "fallback on %s\n", + components_name[fallback]); + } + han_module->reproducible_allreduce_module = fallback_module; + han_module->reproducible_allreduce = fallback_module->coll_allreduce; + return OMPI_SUCCESS; + } + } + /* fallback of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:allreduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_allreduce_module = + han_module->previous_allreduce_module; + han_module->reproducible_allreduce = han_module->previous_allreduce; + return OMPI_SUCCESS; +} + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_allreduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = 
(mca_coll_han_module_t *)module; + return han_module->reproducible_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->reproducible_allreduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 35c0a461f9..6eebc3b7d3 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,10 +64,22 @@ mca_coll_han_bcast_intra(void *buff, w_rank = ompi_comm_rank(comm); int seg_count = count; size_t typelng; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, + comm, han_module->previous_bcast_module); + } + ompi_datatype_type_size(dtype, &typelng); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -220,3 +233,60 @@ int mca_coll_han_bcast_t1_task(void *task_argu) return OMPI_SUCCESS; } + +int +mca_coll_han_bcast_intra_simple(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank; + w_rank = ompi_comm_rank(comm); + + /* create the subcommunicators */ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int root_low_rank; + int root_up_rank; + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, + comm, han_module->previous_bcast_module); + } else { + OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, + "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); + } + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: root_low_rank %d root_up_rank %d\n", + w_rank, root_low_rank, root_up_rank)); + + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); + + /* To remove when han has better sub-module selection. + For now switching to ibcast enables to make runs with libnbc. */ + //ompi_request_t req; + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //ompi_request_wait(&req, MPI_STATUS_IGNORE); + + } + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 2aa5bbd7c2..cfb40c7da0 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -22,6 +23,8 @@ #include "ompi/constants.h" #include "ompi/mca/coll/coll.h" #include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" /* * Public string showing the coll ompi_han component version number @@ -84,6 +87,7 @@ mca_coll_han_component_t mca_coll_han_component = { */ static int han_open(void) { + int param; mca_coll_han_component_t *cs = &mca_coll_han_component; if (cs->han_auto_tune) { cs->han_auto_tuned = @@ -95,7 +99,16 @@ static int han_open(void) 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file); fclose(file); } - return OMPI_SUCCESS; + + /* + * Get the global coll verbosity: it will be ours + */ + cs->han_output = ompi_coll_base_framework.framework_output; + opal_output_verbose(1, cs->han_output, + "coll:han:component_open: done!"); + + cs->topo_level = GLOBAL_COMMUNICATOR; + return mca_coll_han_init_dynamic_rules(); } @@ -109,9 +122,89 @@ static int han_close(void) free(cs->han_auto_tuned); cs->han_auto_tuned = NULL; } + mca_coll_han_free_dynamic_rules(); return OMPI_SUCCESS; } +static bool is_simple_implemented(COLLTYPE_T coll) +{ + switch(coll) { + case ALLGATHER: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + return true; + default: + return false; + } +} + +const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) +{ + switch(topo_lvl) { + case INTRA_NODE: + return "intra_node"; + case INTER_NODE: + return "inter_node"; + case GLOBAL_COMMUNICATOR: + return "global_communicator"; + case NB_TOPO_LVL: + default: + return "invalid topologic level"; + } +} +const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll) +{ + switch(coll) { + case ALLGATHER: + return "allgather"; + case ALLGATHERV: + return "allgatherv"; + case ALLREDUCE: + return "allreduce"; + case ALLTOALL: + return "alltoall"; + case ALLTOALLV: + return "alltoallv"; + case ALLTOALLW: + return "alltoallw"; + case BARRIER: + return "barrier"; + case BCAST: + 
return "bcast"; + case EXSCAN: + return "exscan"; + case GATHER: + return "gather"; + case GATHERV: + return "gatherv"; + case REDUCE: + return "reduce"; + case REDUCESCATTER: + return "reduce_scatter"; + case REDUCESCATTERBLOCK: + return "reduce_scatter_block"; + case SCAN: + return "scan"; + case SCATTER: + return "scatter"; + case SCATTERV: + return "scatterv"; + case NEIGHBOR_ALLGATHER: + return "neighbor_allgather"; + case NEIGHBOR_ALLGATHERV: + return "neighbor_allgatherv"; + case NEIGHBOR_ALLTOALL: + return "neighbor_alltoall"; + case NEIGHBOR_ALLTOALLV: + return "neighbor_alltoallv"; + case NEIGHBOR_ALLTOALLW: + return "neighbor_alltoallw"; + default: + return ""; + } +} /* * Register MCA params @@ -121,21 +214,20 @@ static int han_register(void) mca_base_component_t *c = &mca_coll_han_component.super.collm_version; mca_coll_han_component_t *cs = &mca_coll_han_component; - cs->han_priority = 50; + /* Generated parameters name and description */ + char param_name[100] = ""; + char param_desc[300] = ""; + int param_desc_size; + COLLTYPE_T coll; + TOPO_LVL_T topo_lvl; + COMPONENT_T component; + + cs->han_priority = 0; (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); - int coll_han_verbose = 0; - (void) mca_base_component_var_register(c, "verbose", - "Verbose level", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &coll_han_verbose); - cs->han_output = opal_output_open(NULL); - opal_output_set_verbosity(cs->han_output, coll_han_verbose); - cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", @@ -254,6 +346,93 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + cs->han_reproducible = 0; + (void) mca_base_component_var_register(c, "reproducible", + "whether we need 
reproducible results " + "(enabling this disables optimisations using topology)" + "0 disable 1 enable, default 0", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_reproducible); + + /* Simple algorithms MCA parameters */ + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + cs->use_simple_algorithm[coll] = false; + if(is_simple_implemented(coll)) { + snprintf(param_name, 100, "use_simple_%s", + mca_coll_han_colltype_to_str(coll)); + snprintf(param_desc, 300, "whether to enable simple algo for %s", + mca_coll_han_colltype_to_str(coll)); + mca_base_component_var_register(c, param_name, + param_desc, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_simple_algorithm[coll])); + } + } + + /* Dynamic rules MCA parameters */ + /* TODO: Find a way to avoid unused entried */ + memset(cs->mca_rules, 0, + COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); + for(coll = 0 ; coll < COLLCOUNT ; coll++) { + if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { + continue; + } + /* + * Default values + * Do not avoid to set correct default parameters + */ + cs->mca_rules[coll][INTRA_NODE] = TUNED; + cs->mca_rules[coll][INTER_NODE] = BASIC; + cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; + + for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { + + snprintf(param_name, 100, "%s_dynamic_%s_module", + mca_coll_han_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + + param_desc_size = snprintf(param_desc, 300, + "Collective module to use for " + "collective %s on %s topological level: ", + mca_coll_han_colltype_to_str(coll), + mca_coll_han_topo_lvl_to_str(topo_lvl)); + /* + * Exhaustive description: + * 0 = self; 1 = basic; 2 = libnbc; ... 
+ * FIXME: Do not print component not providing this collective + */ + for(component = 0 ; component < COMPONENTS_COUNT ; component++) { + if(HAN == component && GLOBAL_COMMUNICATOR != topo_lvl) { + /* Han can only be used on the global communicator */ + continue; + } + param_desc_size += snprintf(param_desc+param_desc_size, 300, + "%d = %s; ", + component, + components_name[component]); + } + + mca_base_component_var_register(c, param_name, param_desc, + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->mca_rules[coll][topo_lvl])); + } + } + + /* + * TODO: remove the following lines when auto-tune is added back to the code + */ + cs->han_auto_tune = 0; + + cs->han_auto_tune_n = 5; + cs->han_auto_tune_c = 3; + cs->han_auto_tune_m = 21; +#if 0 cs->han_auto_tune_n = 5; (void) mca_base_component_var_register(c, "auto_tune_n", "auto tune n", @@ -273,7 +452,65 @@ static int han_register(void) "auto tune n", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_m); + MCA_BASE_VAR_SCOPE_READONLY, + &cs->han_auto_tune_m); +#endif + + /* Dynamic rules */ + cs->use_dynamic_file_rules = false; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "use_dynamic_file_rules", + "Switch used to decide if we use " + "dynamic module choice rules " + "defines by file", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->use_dynamic_file_rules)); + + cs->dynamic_rules_filename = NULL; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dynamic_rules_filename", + "Filename of configuration file that " + "contains the dynamic module choice rules", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dynamic_rules_filename)); + + cs->dump_dynamic_rules = false; + (void) 
mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "dump_dynamic_rules", + "Switch used to decide if we dump " + "dynamic rules provided by " + "configuration file", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->dump_dynamic_rules)); + + if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) + && !cs->use_dynamic_file_rules) { + opal_output_verbose(0, cs->han_output, + "coll:han:han_register " + "you asked for dynamic rules " + "but they are not activated. " + "Check coll_han_use_dynamic_file_rules " + "MCA parameter"); + } + + cs->max_dynamic_errors = 10; + (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, + "max_dynamic_errors", + "Number of dynamic rules module/function " + "errors printed on rank 0 " + "with a 0 verbosity." + "Useless if coll_base_verbose is 30 or more.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &(cs->max_dynamic_errors)); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c new file mode 100644 index 0000000000..2cda40e34b --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -0,0 +1,1338 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/class/opal_list.h" +#include "ompi/mca/coll/han/coll_han.h" +#include "ompi/mca/coll/han/coll_han_dynamic.h" +#include "ompi/mca/coll/base/coll_base_util.h" + +/* + * Tests if a dynamic collective is implemented + * Usefull for file reading warnings and MCA parameter generation + * When a new dynamic collective is implemented, this function must + * return true for it + */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) +{ + switch (coll_id){ + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; + } +} + +static COMPONENT_T +component_name_to_id(const char* name) +{ + int i; + + if(NULL == name) { + return -1; + } + + for(i=SELF ; itopologic_level; + mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + /* If the modules are get yet, return success */ + if(han_module->storage_initialized) { + return OMPI_SUCCESS; + } + /* This list is populated at communicator creation */ + OPAL_LIST_FOREACH(item, + comm->c_coll->module_list, + mca_coll_base_avail_coll_t) { + mca_coll_base_module_t *module = item->ac_module; + const char *name = item->ac_component_name; + int id = component_name_to_id(name); + + if(id >= 0 && NULL != module && module != han_base_module) { + /* + * The identifier is correct + * Store the module + */ + han_module->modules_storage.modules[id].module_handler = module; + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules " + "Han found module %s with id %d " + "for topological level %d (%s) " + "for communicator (%d/%s)\n", + name, + id, + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + nb_modules++; + } + } + + /* + * Add han_module on global communicator only + * to prevent any recursive call + */ + 
if(GLOBAL_COMMUNICATOR == han_module->topologic_level) { + han_module->modules_storage.modules[HAN].module_handler = han_base_module; + nb_modules++; + } + + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_all_coll_modules " + "Han sub-communicator modules storage " + "for topological level %d (%s) " + "gets %d modules " + "for communicator (%d/%s)\n", + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, + comm->c_contextid, + comm->c_name); + + assert(0 != nb_modules); + + /* The modules are get */ + han_module->storage_initialized = true; + return OMPI_SUCCESS; +} + +/* + * Find the correct rule in the dynamic rules + * Assume rules are sorted by increasing value + */ +static const msg_size_rule_t* +get_dynamic_rule(COLLTYPE_T collective, + int msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + /* Indexes of the rule */ + int coll_idx; + int topo_idx; + int conf_idx; + int msg_size_idx; + + /* Aliases */ + const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; + const collective_rule_t *coll_rule = NULL; + const topologic_rule_t *topo_rule = NULL; + const configuration_rule_t *conf_rule = NULL; + const msg_size_rule_t *msg_size_rule = NULL; + + const TOPO_LVL_T topo_lvl = han_module->topologic_level; + const int comm_size = ompi_comm_size(comm); + + COMPONENT_T component; + + /* Find the collective rule */ + dynamic_rules = &(mca_coll_han_component.dynamic_rules); + for(coll_idx = dynamic_rules->nb_collectives-1 ; + coll_idx >= 0 ; coll_idx--) { + if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { + coll_rule = &(dynamic_rules->collective_rules[coll_idx]); + break; + } + } + if(coll_idx < 0) { + /* + * No dynamic rules for this collective + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched for collective %d (%s) " + "but did not find any rule for this collective\n", + collective, + 
mca_coll_han_colltype_to_str(collective)); + return NULL; + } + + /* Find the topologic level rule */ + for(topo_idx = coll_rule->nb_topologic_levels-1 ; + topo_idx >= 0 ; topo_idx--) { + if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { + topo_rule = &(coll_rule->topologic_rules[topo_idx]); + break; + } + } + if(topo_idx < 0) { + /* + * No topologic level rules for this collective + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched for topologic level %d (%s) rule " + "for collective %d (%s) but did not find any rule\n", + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, + mca_coll_han_colltype_to_str(collective)); + return NULL; + } + + /* Find the configuration rule */ + for(conf_idx = topo_rule->nb_rules-1 ; + conf_idx >= 0 ; conf_idx--) { + if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { + conf_rule = &(topo_rule->configuration_rules[conf_idx]); + break; + } + } + if(conf_idx < 0) { + /* + * No corresponding configuration + * Should not happen with a correct file + */ + + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "but did not manage to find anything. 
" + "This is the result of an invalid configuration file: " + "the first configuration size of each collective must be 1\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size); + return NULL; + } + + /* Find the message size rule */ + for(msg_size_idx = conf_rule->nb_msg_size-1 ; + msg_size_idx >= 0 ; msg_size_idx--) { + if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { + msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); + break; + } + } + if(msg_size_idx < 0) { + /* + * No corresponding message size + * Should not happen with a correct file + */ + opal_output_verbose(60, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message " + "but did not manage to find anything. " + "This is the result of an invalid configuration file: " + "the first message size of each configuration must be 0\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, + msg_size); + + return NULL; + } + + component = msg_size_rule->component; + /* + * We have the final rule to use + * Module correctness is checked outside + */ + opal_output_verbose(80, mca_coll_han_component.han_output, + "coll:han:get_dynamic_rule " + "Han searched a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message. 
" + "Found a rule for collective %d (%s) " + "on topological level %d (%s) " + "for a %d configuration size " + "for a %d sized message : component %d (%s)\n", + collective, + mca_coll_han_colltype_to_str(collective), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, + msg_size, + msg_size_rule->collective_id, + mca_coll_han_colltype_to_str(msg_size_rule->collective_id), + msg_size_rule->topologic_level, + mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), + msg_size_rule->configuration_size, + msg_size_rule->msg_size, + component, + components_name[component]); + + return msg_size_rule; +} + +/* + * Return the module to use for the collective coll_id + * for a msg_size sized message on the comm communicator + * following the dynamic rules + */ +mca_coll_base_module_t * +get_module(COLLTYPE_T coll_id, + int msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + const msg_size_rule_t *dynamic_rule; + mca_coll_base_module_t *sub_module = NULL; + TOPO_LVL_T topo_lvl; + COMPONENT_T mca_rule_component; + + topo_lvl = han_module->topologic_level; + mca_rule_component = mca_coll_han_component.mca_rules[coll_id][topo_lvl]; + + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* Find the correct dynamic rule to check */ + dynamic_rule = get_dynamic_rule(coll_id, + msg_size, + comm, + han_module); + if(NULL != dynamic_rule) { + /* Use dynamic rule from file */ + sub_module = han_module->modules_storage + .modules[dynamic_rule->component] + .module_handler; + } else { + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { + /* + * Invalid MCA parameter value + * Warn the user and return NULL + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module " + "Invalid MCA parameter value %d " + "for collective %d (%s) " + "on topologic level %d (%s)\n", + mca_rule_component, + coll_id, + 
mca_coll_han_colltype_to_str(coll_id), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; + } + sub_module = han_module->modules_storage + .modules[mca_rule_component] + .module_handler; + } + + return sub_module; +} + + +/* + * Allgather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(sdtype, &dtype_size); + msg_size = dtype_size * scount; + + sub_module = get_module(ALLGATHER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). 
" + "Please check dynamic file/mca parameters\n", + ALLGATHER, + mca_coll_han_colltype_to_str(ALLGATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + han_module + ->previous_allgather_module); + } else if (NULL == sub_module->coll_allgather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgather_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + ALLGATHER, + mca_coll_han_colltype_to_str(ALLGATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHER: the module found for the sub-" + "communicator cannot handle the ALLGATHER operation. 
" + "Falling back to another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + han_module + ->previous_allgather_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgather is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_allgather_fn_t allgather; + if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { + allgather = mca_coll_han_allgather_intra_simple; + } else { + allgather = mca_coll_han_allgather_intra; + } + + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); +} + + +/* + * Allgatherv selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + * The allgatherv size is the size of the biggest segment + */ +int +mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size, msg_size; + int rank; + int verbosity; + int comm_size; + int i; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for 
dynamic rules */ + comm_size = ompi_comm_size(comm); + ompi_datatype_type_size(rdtype, &dtype_size); + + msg_size = 0; + for(i = 0 ; i < comm_size ; i++) { + if(dtype_size * rcounts[i] > msg_size) { + msg_size = dtype_size * rcounts[i]; + } + } + + sub_module = get_module(ALLGATHERV, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } else if (NULL == sub_module->coll_allgatherv) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. 
" + "Please check dynamic file/mca parameters\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLGATHERV: the module found for the sub-" + "communicator cannot handle the ALLGATHERV operation. " + "Falling back to another component\n")); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allgatherv is valid and point to this function + * Call han topological collective algorithm + */ + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allgatherv_intra_dynamic " + "Han used for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective on this topologic level\n", + ALLGATHERV, + mca_coll_han_colltype_to_str(ALLGATHERV), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + return han_module->previous_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + han_module + ->previous_allgatherv_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); +} + + +/* + * Allreduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if 
fallback mechanism is activated + */ +int +mca_coll_han_allreduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(ALLREDUCE, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, + mca_coll_han_colltype_to_str(ALLREDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: No module found for the sub-" + "communicator. 
" + "Falling back to another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->previous_allreduce_module); + } else if (NULL == sub_module->coll_allreduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_allreduce_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + ALLREDUCE, + mca_coll_han_colltype_to_str(ALLREDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/ALLREDUCE: the module found for the sub-" + "communicator cannot handle the ALLREDUCE operation. " + "Falling back to another component\n")); + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, + op, comm, + han_module + ->previous_allreduce_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fallback on reproducible algo */ + if (mca_coll_han_component.han_reproducible) { + return mca_coll_han_allreduce_reproducible(sbuf, rbuf, count, dtype, op, + comm, module); + } + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_allreduce_fn_t allreduce; + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } + return allreduce(sbuf, rbuf, count, dtype, + op, comm, module); + } + + /* + * If we get here: + * sub_module is valid + * 
sub_module->coll_allreduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); +} + + +/* + * Bcast selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_bcast_intra_dynamic(void *buff, + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(BCAST, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). 
" + "Please check dynamic file/mca parameters\n", + BCAST, + mca_coll_han_colltype_to_str(BCAST), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, comm, + han_module->previous_bcast_module); + } else if (NULL == sub_module->coll_bcast) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_bcast_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + BCAST, + mca_coll_han_colltype_to_str(BCAST), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/BCAST: the module found for the sub-" + "communicator cannot handle the BCAST operation. 
" + "Falling back to another component\n")); + return han_module->previous_bcast(buff, count, dtype, root, comm, + han_module->previous_bcast_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_bcast is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_bcast_fn_t bcast; + if(mca_coll_han_component.use_simple_algorithm[BCAST]) { + bcast = mca_coll_han_bcast_intra_simple; + } else { + bcast = mca_coll_han_bcast_intra; + } + return bcast(buff, + count, + dtype, + root, + comm, + module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_bcast(buff, + count, + dtype, + root, + comm, + sub_module); +} + + +/* + * Gather selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(sdtype, &dtype_size); + msg_size = dtype_size * scount; + + sub_module = get_module(GATHER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ 
+ rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + GATHER, + mca_coll_han_colltype_to_str(GATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_gather_module); + } else if (NULL == sub_module->coll_gather) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_gather_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + GATHER, + mca_coll_han_colltype_to_str(GATHER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/GATHER: the module found for the sub-" + "communicator cannot handle the GATHER operation. 
" + "Falling back to another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_gather_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_gather is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_gather_fn_t gather; + if(mca_coll_han_component.use_simple_algorithm[GATHER]) { + gather = mca_coll_han_gather_intra_simple; + } else { + gather = mca_coll_han_gather_intra; + } + + + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} + + +/* + * Reduce selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_reduce_intra_dynamic(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration information for dynamic rules */ + ompi_datatype_type_size(dtype, &dtype_size); + msg_size = dtype_size * count; + + sub_module = get_module(REDUCE, + msg_size, + 
comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + REDUCE, + mca_coll_han_colltype_to_str(REDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->previous_reduce_module); + } else if (NULL == sub_module->coll_reduce) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_reduce_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. " + "Please check dynamic file/mca parameters\n", + REDUCE, + mca_coll_han_colltype_to_str(REDUCE), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: the module found for the sub-" + "communicator cannot handle the REDUCE operation. 
" + "Falling back to another component\n")); + return han_module->previous_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->previous_reduce_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* Reproducibility: fallback on reproducible algo */ + if (mca_coll_han_component.han_reproducible) { + return mca_coll_han_reduce_reproducible(sbuf, rbuf, count, dtype, op, + root, comm, module); + } + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_reduce_fn_t reduce; + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } + return reduce(sbuf, rbuf, count, dtype, + op, root, comm, module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_reduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_reduce(sbuf, rbuf, count, dtype, + op, root, comm, sub_module); +} + + +/* + * Scatter selector: + * On a sub-communicator, checks the stored rules to find the module to use + * On the global communicator, calls the han collective implementation, or + * calls the correct module if fallback mechanism is activated + */ +int +mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + size_t dtype_size; + int msg_size; + int rank; + int verbosity; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + mca_coll_base_module_t *sub_module; + TOPO_LVL_T topo_lvl; + + topo_lvl = han_module->topologic_level; + + /* Compute configuration 
information for dynamic rules */ + ompi_datatype_type_size(rdtype, &dtype_size); + msg_size = dtype_size * rcount; + + sub_module = get_module(SCATTER, + msg_size, + comm, + han_module); + + /* First errors are always printed by rank 0 */ + rank = ompi_comm_rank(comm); + if(0 == rank + && han_module->dynamic_errors + < mca_coll_han_component.max_dynamic_errors) { + verbosity = 0; + } else { + verbosity = 30; + } + + if(NULL == sub_module) { + /* + * No valid collective module from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "Han did not find any valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s). " + "Please check dynamic file/mca parameters\n", + SCATTER, + mca_coll_han_colltype_to_str(SCATTER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: No module found for the sub-" + "communicator. " + "Falling back to another component\n")); + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_scatter_module); + } else if (NULL == sub_module->coll_scatter) { + /* + * No valid collective from dynamic rules + * nor from mca parameter + */ + han_module->dynamic_errors++; + opal_output_verbose(verbosity, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_scatter_intra_dynamic " + "Han found valid module for " + "collective %d (%s) " + "with topological level %d (%s) " + "on communicator (%d/%s) " + "but this module cannot handle " + "this collective. 
" + "Please check dynamic file/mca parameters\n", + SCATTER, + mca_coll_han_colltype_to_str(SCATTER), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, + comm->c_name); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/SCATTER: the module found for the sub-" + "communicator cannot handle the SCATTER operation. " + "Falling back to another component\n")); + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module + ->previous_scatter_module); + } + + if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_scatter is valid and point to this function + * Call han topological collective algorithm + */ + mca_coll_base_module_scatter_fn_t scatter; + scatter = mca_coll_han_scatter_intra; + /* + * TODO: Uncomment when scatter simple is merged + * if(mca_coll_han_component.use_simple_algorithm[SCATTER]) { + * scatter = mca_coll_han_scatter_intra_simple; + * } else { + * scatter = mca_coll_han_scatter_intra; + * } + */ + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); + } + + /* + * If we get here: + * sub_module is valid + * sub_module->coll_scatter is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + return sub_module->coll_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); +} + + diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h new file mode 100644 index 0000000000..979b292ba0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -0,0 +1,214 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_HAN_DYNAMIC_H +#define MCA_COLL_HAN_DYNAMIC_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/han/coll_han.h" + + +/* + * ################################################# + * # Dynamic rules global architecture description # + * ################################################# + * + * Han dynamic rules allow the user to define the collective + * module to call depending on the topological configuration of the + * sub-communicators and the collective parameters. This mechanism + * can also be used to fallback the main collective on another module. + * The interface is described in coll_han_dynamic_file.h. + * + * ############################# + * # Collective module storage # + * ############################# + * To be able to switch between multiple collective modules, han + * directly accesses the module on the communicator. This information is + * stored in the collective structure of the communicator during the collective + * module choice at the communicator initialization. When han needs this + * information for the first time, it identifies the modules by their name and + * stores them in its module structure. + * Then, the modules are identified by their identifier. + * + * ######################### + * # Dynamic rules storage # + * ######################### + * There are two types of dynamic rules: + * - MCA parameter defined rules + * - File defined rules + * + * MCA parameter defined rules are stored in mca_coll_han_component.mca_rules. + * This is a double indexed table. The first index is the corresponding collective + * communication and the second index is the topological level aimed by the rule. 
+ * These parameters define the collective component to use for a specific + * collective communication on a specific topologic level. + * + * File defined rules are stored in mca_coll_han_component.dynamic_rules. + * These structures are defined below. The rule storage is directly deduced + * from the rule file format. + * + * File defined rules precede MCA parameter defined rules. + * + * ####################### + * # Dynamic rules usage # + * ####################### + * To choose which collective module to use on a specific configuration, han + * adds an indirection on the collective call: dynamic choice functions. These + * functions do not implement any collective. First, they try to find a dynamic + * rule from file for the given collective. If there is not any rule for the + * given configuration, MCA parameter defined rules are used. Once the module + * to use is found, the correct collective implementation is called. + * + * This indirection is also used on the global communicator. This allows han + * to provide a fallback mechanism considering the collective parameters. + * + * ############################## + * # Dynamic rules choice logic # + * ############################## + * Dynamic rules choice is made with a stack logic. Each new rule precedes + * already defined rules. MCA parameters rules are the stack base. When + * a rule is needed, rules are read as a stack and the first corresponding rule + * encountered is chosen. 
+ * + * Consequences: + * - If a collective identifier appears multiple times, only the last + * will be considered + * - If a topological level appears multiple times for a collective, + * only the last will be considered + * - If configuration rules or message size rules are not stored + * by increasing value, some of them will not be considered + */ + +BEGIN_C_DECLS + +/* Dynamic rules support */ +typedef enum COMPONENTS { + SELF=0, + BASIC, + LIBNBC, + TUNED, + SM, + SHARED, + ADAPT, + HAN, + COMPONENTS_COUNT +} COMPONENT_T; + +static const char *components_name[]={"self", + "basic", + "libnbc", + "tuned", + "sm", + "shared", + "adapt", + "han"}; + +/* Topologic levels */ +typedef enum TOPO_LVL { + INTRA_NODE=0, + INTER_NODE, + /* Identifies the global communicator as a topologic level */ + GLOBAL_COMMUNICATOR, + NB_TOPO_LVL +} TOPO_LVL_T; + +/* Rule for a specific msg size + * in a specific configuration + * for a specific collective + * in a specific topologic level */ +typedef struct msg_size_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + int configuration_size; + + /* Message size of the rule */ + int msg_size; + + /* Component to use on this specific configuration + * and message size */ + COMPONENT_T component; +} msg_size_rule_t; + +/* Rule for a specific configuration + * considering a specific collective + * in a specific topologic level */ +typedef struct configuration_rule_s { + COLLTYPE_T collective_id; + TOPO_LVL_T topologic_level; + + /* Number of elements of the actual topologic level + * per element of the upper topologic level */ + int configuration_size; + + /* Number of message size rules for this configuration */ + int nb_msg_size; + + /* Table of message size rules for this configuration */ + msg_size_rule_t *msg_size_rules; +} configuration_rule_t; + +/* Set of dynamic rules for a specific collective + * in a specific topologic level */ +typedef struct topologic_rule_s { + /* Collective identifier */ + COLLTYPE_T 
collective_id; + + /* Topologic level of the rule */ + TOPO_LVL_T topologic_level; + + /* Rule number */ + int nb_rules; + + /* Table of configuration rules + * for this collective on this topologic level */ + configuration_rule_t *configuration_rules; +} topologic_rule_t; + +/* Set of dynamic rules for a collective */ +typedef struct collective_rule_s { + COLLTYPE_T collective_id; + + /* Number of topologic level for this collective */ + int nb_topologic_levels; + + /* Table of topologic level rules + * for this collective */ + topologic_rule_t *topologic_rules; +} collective_rule_t; + +/* Global dynamic rules structure */ +typedef struct mca_coll_han_dynamic_rule_s { + int nb_collectives; + collective_rule_t *collective_rules; +} mca_coll_han_dynamic_rules_t; + +/* Module storage */ +typedef struct collective_module_storage_s { + /* Module */ + mca_coll_base_module_t *module_handler; +} collective_module_storage_t; + +/* Table of module storage */ +typedef struct mca_coll_han_collective_modules_storage_s { + /* */ + collective_module_storage_t modules[COMPONENTS_COUNT]; +} mca_coll_han_collective_modules_storage_t; + +/* Tests if a dynamic collective is implemented */ +bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); + +END_C_DECLS +#endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c new file mode 100644 index 0000000000..d163071edc --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -0,0 +1,690 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_STDIO_H +#include +#endif + +#include "ompi_config.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" +#include "coll_han_dynamic_file.h" + +#include "ompi/mca/coll/base/coll_base_util.h" + +static void check_dynamic_rules(void); + +/* Current file line for verbose message */ +static int fileline = 1; +#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) + +int +mca_coll_han_init_dynamic_rules(void) +{ + /* File management */ + const char *fname; + FILE *fptr = NULL; + int nb_entries = 0; + + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + /* If the dynamic rules are not used, do not even read the file */ + if(!mca_coll_han_component.use_dynamic_file_rules) { + nb_coll = 0; + return OMPI_SUCCESS; + } + + fname = mca_coll_han_component.dynamic_rules_filename; + + if(NULL == fname) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "coll_han_use_dynamic_file_rules is true but " + "coll_han_dynamic_rules_filename is not set: " + "coll han will use dynamic rules from mca " + "parameters and their default value\n"); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + fptr = fopen(fname, "r"); + + if(NULL == fptr) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "cannot open dynamic file provided by " + 
"coll_han_dynamic_rules_filename=%s " + "please provide it with full path and " + "check file permissions. Rules from " + "MCA parameters will be used instead\n", + fname); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + return OMPI_SUCCESS; + } + + /* The first information of the file is the collective count */ + nb_coll = getnext(fptr); + + if(nb_coll <= 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for collective count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_coll); + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto file_reading_error; + } + + mca_coll_han_component.dynamic_rules.nb_collectives = nb_coll; + + /* Allocate collective rules */ + coll_rules = malloc(nb_coll * sizeof(collective_rule_t)); + mca_coll_han_component.dynamic_rules.collective_rules = coll_rules; + if(NULL == coll_rules) { + mca_coll_han_component.dynamic_rules.nb_collectives = 0; + goto cannot_allocate; + } + + /* Iterates on collective rules */ + for(i=0 ; i= COLLCOUNT) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "invalid collective id %d at line %d: the collective " + "must be at least %d and less than %d\n", + coll_id, + fileline, + ALLGATHER, + COLLCOUNT); + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "read collective id %d at line %d " + "but this collective is not implemented yet. 
" + "This is not an error but this set of rules " + "will not be used\n", + fname, + coll_id, + fileline); + } + + /* + * The first information of a collective rule + * is the number of topologic rules + */ + nb_topo = getnext(fptr); + if(nb_topo < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for topo level count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_topo); + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store the collective rule informations */ + coll_rules[i].collective_id = coll_id; + coll_rules[i].nb_topologic_levels = nb_topo; + + if(0 == nb_topo) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for topo level count\n", + fname, + fileline, + nb_topo); + continue; + } + + /* Allocate topologic rules */ + topo_rules = malloc(nb_topo * sizeof(topologic_rule_t)); + coll_rules[i].topologic_rules = topo_rules; + if(NULL == topo_rules) { + coll_rules[i].nb_topologic_levels = 0; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterates on topologic rules */ + for(j=0 ; j= NB_TOPO_LVL) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid topo level %d is given " + "or the reader encountered an unexpected EOF. 
" + "Topologic level must be at least %d and " + "less than %d\n", + fname, + fileline, + topo_lvl, + INTRA_NODE, + NB_TOPO_LVL); + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* + * The first information of a topologic rule + * is the number of configurations + */ + nb_rules = getnext(fptr); + + if(nb_rules < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for rules count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_rules); + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store the topologic rule informations */ + topo_rules[j].collective_id = coll_id; + topo_rules[j].topologic_level = topo_lvl; + topo_rules[j].nb_rules = nb_rules; + + if(0 == nb_rules) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for configuration rules count\n", + fname, + fileline, + nb_rules); + continue; + } + + /* Allocate configuration rules */ + conf_rules = malloc(nb_rules * sizeof(configuration_rule_t)); + topo_rules[j].configuration_rules = conf_rules; + if(NULL == conf_rules) { + topo_rules[j].nb_rules = 0; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterate on configuration rules */ + for(k=0 ; k 1)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "invalid configuration size %d at line %d " + "or the reader encountered an unexpected EOF " + "the 
configuration size must be at least %d " + "and the first configuration size " + "of a topologic level must be %d\n", + conf_size, + fileline, + 1, + 1); + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* + * The first information of a configuration rule + * is the number of message size rules + */ + nb_msg_size = getnext(fptr); + if(nb_msg_size < 0) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for message size rules count " + "or the reader encountered an unexpected EOF\n", + fname, + fileline, + nb_msg_size); + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store configuration rule information */ + conf_rules[k].collective_id = coll_id; + conf_rules[k].topologic_level = topo_lvl; + conf_rules[k].configuration_size = conf_size; + conf_rules[k].nb_msg_size = nb_msg_size; + + if(0 == nb_msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on dynamic rules file %s " + "at line %d: an invalid value %d is given " + "for message size rules count\n", + fname, + fileline, + nb_msg_size); + continue; + } + + /* Allocate message size rules */ + msg_size_rules = malloc(nb_msg_size * sizeof(msg_size_rule_t)); + conf_rules[k].msg_size_rules = msg_size_rules; + if(NULL == msg_size_rules) { + conf_rules[k].nb_msg_size = 0; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto cannot_allocate; + } + + /* Iterate on message size rules */ + for(l=0 ; l 1)) { 
+ opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid value %d " + "is given for message size " + "or the reader encountered " + "an unexpected EOF. " + "The first message size rule of " + "a configuration must be 0\n", + fname, + fileline, + msg_size); + conf_rules[k].nb_msg_size = l+1; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Get the component identifier for this message size rule */ + component = getnext(fptr); + if(component < SELF || component >= COMPONENTS_COUNT) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "found an error on dynamic rules file %s " + "at line %d: an invalid collective " + "component id %d is given or the " + "reader encountered an unexpected EOF. " + "Collective component id must be at " + "least %d and less than %d\n", + fname, + fileline, + component, + SELF, + COMPONENTS_COUNT); + conf_rules[k].nb_msg_size = l+1; + topo_rules[j].nb_rules = k+1; + coll_rules[i].nb_topologic_levels = j+1; + mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + goto file_reading_error; + } + + /* Store message size rule informations */ + msg_size_rules[l].collective_id = coll_id; + msg_size_rules[l].topologic_level = topo_lvl; + msg_size_rules[l].configuration_size = conf_size; + msg_size_rules[l].msg_size = msg_size; + msg_size_rules[l].component = component; + + nb_entries++; + } + } + } + } + + if(MYEOF != getnext(fptr)) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "Warning on file %s at line %d: " + "rule reading is over but reader does not seem " + "to have reached the end of the file\n", + fname, + fileline); + } + + opal_output_verbose(5, 
mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "read %d rules from %s\n", + nb_entries, + fname); + + if(mca_coll_han_component.dump_dynamic_rules) { + mca_coll_han_dump_dynamic_rules(); + } + + fclose(fptr); + + check_dynamic_rules(); + return OMPI_SUCCESS; + +cannot_allocate: + /* The dynamic rules allocation failed + * Free the already allocated rules and return a failure + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "cannot allocate dynamic rules\n"); + /* Do not check free_dynamic_rules + * because we are returning OMPI_ERROR anyway */ + mca_coll_han_free_dynamic_rules(); + return OMPI_ERROR; + +file_reading_error: + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "could not fully read dynamic rules file. " + "Will use mca parameters defined rules. " + "To see error detail, please set " + "collective verbosity level over 5\n"); + if(fptr) { + fclose (fptr); + } + mca_coll_han_free_dynamic_rules(); + return OMPI_SUCCESS; +} + +void +mca_coll_han_free_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k; + + /* Loop ranges */ + int nb_coll, nb_topo, nb_conf; + + /* Aliases */ + collective_rule_t *coll_rules; + topologic_rule_t *topo_rules; + configuration_rule_t *conf_rules; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i 0) { + free(conf_rules[k].msg_size_rules); + } + } + + if(nb_conf > 0) { + free(conf_rules); + } + } + + if(nb_topo > 0) { + free(topo_rules); + } + } + + if(nb_coll > 0) { + free(coll_rules); + } + + mca_coll_han_component.dynamic_rules.nb_collectives = 0; +} + +/* + * Try to find any logical issue in dynamic rules + */ +static void check_dynamic_rules(void) +{ + /* Loop counters */ + int i, j, k, l; + + /* Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + 
collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i=1 && conf_rules[k-1].configuration_size > conf_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d: " + "configuration sizes %d and %d are " + "not sorted by increasing value\n", + coll_id, + topo_lvl, + conf_rules[k-1].configuration_size, + conf_size); + } + + for(l=0 ; l=1 && msg_size_rules[l-1].msg_size > msg_size) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d " + "with configuration size %d: " + "message sizes %d and %d are " + "not sorted by increasing value\n", + coll_id, + topo_lvl, + conf_size, + msg_size_rules[l-1].msg_size, + msg_size); + } + + if(HAN == component + && GLOBAL_COMMUNICATOR != topo_lvl) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:check_dynamic_rules " + "Han found an issue on dynamic rules " + "for collective %d " + "on topological level %d " + "with configuration size %d " + "for message size %d: " + "han collective component %d " + "can only be activated for " + "topology level %d\n", + coll_id, + topo_lvl, + conf_size, + msg_size, + HAN, + GLOBAL_COMMUNICATOR); + } + } + } + } + } +} + +void mca_coll_han_dump_dynamic_rules(void) +{ + int nb_entries = 0; + + /* Loop counters */ + int i, j, k, l; + + /* 
Collective informations */ + int nb_coll; + COLLTYPE_T coll_id; + collective_rule_t *coll_rules; + + /* Topo informations */ + int nb_topo; + TOPO_LVL_T topo_lvl; + topologic_rule_t *topo_rules; + + /* Configuration informations */ + int nb_rules, conf_size; + configuration_rule_t *conf_rules; + + /* Message size informations */ + int nb_msg_size, msg_size; + msg_size_rule_t *msg_size_rules; + + /* Component informations */ + COMPONENT_T component; + + nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; + coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; + + for(i=0 ; i collective component %d (%s)\n", + nb_entries, + coll_id, + mca_coll_han_colltype_to_str(coll_id), + topo_lvl, + mca_coll_han_topo_lvl_to_str(topo_lvl), + conf_size, + msg_size, + component, + components_name[component]); + + nb_entries++; + } + } + } + } +} diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h new file mode 100644 index 0000000000..846b9b74cc --- /dev/null +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -0,0 +1,111 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef MCA_COLL_HAN_DYNAMIC_FILE_H +#define MCA_COLL_HAN_DYNAMIC_FILE_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/mca/mca.h" +#include "opal/util/output.h" + +/* + * ############################ + * # MCA parameters interface # + * ############################ + * An MCA parameter defined rule allows the user to choose which collective + * module will be used for a specific collective communication on a specific + * topological level. 
The standard name for these parameters is: + * [collective]_dynamic_[topologic_level]_module + * + * ####################### + * # Dynamic file format # + * ####################### + * File defined rules take precedence over MCA parameter defined rules. + * To activate the file reader, the MCA parameter use_dynamic_file_rules must + * be set to true. The path to the dynamic file is given by the MCA + * parameter dynamic_rules_filename. If there is any issue reading the file, + * the file is considered as invalid and only MCA parameter defined rules are + * used. If a potential logical issue is identified in the file, a + * warning is printed but the file is not considered as invalid. + * + * The file is built recursively. + * A set of rules of a type is built as follows: + * Number of rules of the set + * Rule1 + * Rule2 + * ... + * + * A rule of the level i is built as follows (excluding message size rule): + * Rule property + * Set of rules of level i+1 + * + * A message size rule is built as follows: + * Message_size Component + * + * Rule properties are (by increasing level): + * - Collective identifier: + * Defined in ompi/mca/coll/base/coll_base_functions.h. + * - Topologic level: + * Defined in coll_han_dynamic.h. It defines the communicator + * topology level. This is GLOBAL_COMMUNICATOR for the user + * communicator and the corresponding level for sub-communicators + * created by han. + * - Configuration size: + * The configuration size is the number of elements of the current + * topology level in the upper topology level. For example, if + * topology levels are intra-node and inter-node, it can be the + * number of MPI ranks per node or the number of nodes in the global + * communicator. For the GLOBAL_COMMUNICATOR topologic level, + * the configuration size is the communicator size. + * - Message_size Component: + * This is the message size, in bytes, of the message. 
Component is + * the component identifier to use for this collective on this + * communicator with this message size. Component identifiers are + * defined in coll_han_dynamic.h + * + * Here is an example of a dynamic rules file: + * 2 # Collective count + * 7 # Collective identifier 1 (defined in ompi/mca/coll/base/coll_base_functions.h) + * 2 # Topologic level count + * 0 # Topologic level identifier 1 + * 1 # Configuration count + * 1 # Configuration size 1 + * 2 # Message size rules count + * 0 3 # Message size 1 and component identifier + * 128 1 # Message size 2 and component identifier + * 1 # Topologic level identifier 2 + * 1 # Configuration count + * 1 # Configuration size 1 + * 1 # Message size rules count + * 0 1 # Message size 1 and component identifier + * 3 # Collective identifier 2 + * # Set of topological rules + * + * Note that configuration size and message size rules define minimal + * values and each new rule takes precedence over every other rule. This property + * implies that these types of rules must be sorted by increasing value. + * If they are not, some rules won't be used. + * + * The counts define a stack. If the count is set to x, the reader will + * attempt to read x rules of the corresponding type. If a set of rules + * has an invalid count, this is an error and it might not be detected by + * the reader. + */ + +BEGIN_C_DECLS + +int mca_coll_han_init_dynamic_rules(void); +void mca_coll_han_free_dynamic_rules(void); +void mca_coll_han_dump_dynamic_rules(void); + +END_C_DECLS +#endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 5188d2aca6..2cbd6d976c 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -53,29 +54,39 @@ void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, int mca_coll_han_gather_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t * module) { - int i, j; - int w_rank, w_size; + int i; + int w_rank, w_size; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + char *reorder_buf = NULL, *reorder_rbuf = NULL; + ptrdiff_t rsize, rgap = 0, rextent; + int *vranks, low_rank, low_size; + int * topo; + + ompi_request_t *temp_request = NULL; + w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; - ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; - int *vranks = han_module->cached_vranks; - int low_rank = ompi_comm_rank(low_comm); - int low_size = ompi_comm_size(low_comm); - int up_size = ompi_comm_size(up_comm); + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + topo = mca_coll_han_topo_init(comm, han_module, 2); + + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, han_module->previous_gather_module); + } - ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); OMPI_REQUEST_INIT(temp_request, false); @@ -88,27 +99,44 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, temp_request->req_status._cancelled = 0; temp_request->req_status._ucount = 0; - int root_low_rank; - int root_up_rank; - mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather root %d root_low_rank %d root_up_rank %d\n", w_rank, - root, root_low_rank, root_up_rank)); + /* create the subcommunicators */ + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + + /* Get the 'virtual ranks' mapping correspondong to the communicators */ + vranks = han_module->cached_vranks; + /* information about sub-communicators */ + low_rank = ompi_comm_rank(low_comm); + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); - char *reorder_buf = NULL; - char *reorder_rbuf = NULL; - ptrdiff_t rsize, rgap = 0, rextent; ompi_datatype_type_extent(rdtype, &rextent); - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* Allocate reorder buffers */ if (w_rank == root) { - /* If the processes are mapped-by core, no need to reorder */ + /* if the processes are mapped-by core, no need to 
reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns */ if (han_module->is_mapbycore) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather is_bycore: ", w_rank)); - reorder_rbuf = (char *) rbuf; + "[%d]: Han Gather is_bycore: ", w_rank)); + reorder_rbuf = (char *)rbuf; + } else { - rsize = opal_datatype_span(&rdtype->super, (int64_t) rcount * w_size, &rgap); - reorder_buf = (char *) malloc(rsize); //TODO:free + /* Need a buffer to store unordered final result */ + rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); //TODO:free + /* rgap is the size of unused space at the start of the datatype */ reorder_rbuf = reorder_buf - rgap; } } @@ -128,27 +156,29 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); - /* Reorder rbuf based on rank. - * Suppose, message is 0 1 2 3 4 5 6 7, - * and the processes are mapped on 2 nodes (the processes on the node 0 is 0 2 4 6 and the processes on the node 1 is 1 3 5 7), - * so the message needs to be reordered to 0 2 4 6 1 3 5 7 + /* Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. 
In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] */ + /* reorder rbuf based on rank */ if (w_rank == root && !han_module->is_mapbycore) { - for (i = 0; i < up_size; i++) { - for (j = 0; j < low_size; j++) { + for (i=0; iw_rank)); OBJ_RELEASE(t->cur_task); @@ -168,16 +198,29 @@ int mca_coll_han_gather_lg_task(void *task_argu) char *tmp_buf = NULL; char *tmp_rbuf = NULL; if (!t->noop) { + /* if the process is one of the node leader, allocate the intermediary + * buffer to gather on the low sub communicator */ int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + rsize = opal_datatype_span(&t->rdtype->super, + (int64_t)t->rcount * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; } - /* Shared memory gather */ - t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, - t->rdtype, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_gather_module); + + /* shared memory node gather */ + t->low_comm->c_coll->coll_gather((char *)t->sbuf, + t->scount, + t->sdtype, + tmp_rbuf, + t->rcount, + t->rdtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); + + /* Prepare up comm gather */ t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; @@ -201,24 +244,192 @@ int mca_coll_han_gather_ug_task(void *task_argu) if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug noop\n", t->w_rank)); + "[%d] Han Gather: ug noop\n", t->w_rank)); } else { int low_size = ompi_comm_size(t->low_comm); - /* Inter node gather */ - t->up_comm->c_coll->coll_gather((char *) t->sbuf, t->scount * low_size, t->sdtype, - (char *) t->rbuf, t->rcount * low_size, t->rdtype, - t->root_up_rank, t->up_comm, - t->up_comm->c_coll->coll_gather_module); + /* inter node gather */ 
+ t->up_comm->c_coll->coll_gather((char *)t->sbuf, + t->scount*low_size, + t->sdtype, + (char *)t->rbuf, + t->rcount*low_size, + t->rdtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); if (t->sbuf_inter_free != NULL) { free(t->sbuf_inter_free); t->sbuf_inter_free = NULL; } OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug gather finish\n", t->w_rank)); + "[%d] Han Gather: ug gather finish\n", t->w_rank)); } ompi_request_t *temp_req = t->req; free(t); ompi_request_complete(temp_req, 1); return OMPI_SUCCESS; } + +/* only work with regular situation (each node has equal number of processes) */ +int +mca_coll_han_gather_intra_simple(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* Here root needs to reach all nodes on up_comm. + * But in case of unbalance some up_comms are smaller, + * as the comm_split is made on the base of low_rank */ + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, han_module->previous_gather_module); + } + + /* create the subcommunicators */ + mca_coll_han_comm_create_new(comm, han_module); + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + int *vranks = han_module->cached_vranks; + /* information about sub-communicators */ + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + /* allocate buffer to store unordered result on root + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns */ + char *reorder_buf = NULL; // allocated memory + char *reorder_buf_start = NULL; // start of the data + if (w_rank == root) { + if (han_module->is_mapbycore) { + reorder_buf_start = (char *)rbuf; + } else { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d]: Future Gather needs reordering: ", w_rank)); + ptrdiff_t rgap = 0; + ptrdiff_t rsize = opal_datatype_span(&rdtype->super, + (int64_t)rcount * w_size, + &rgap); + reorder_buf = (char *)malloc(rsize); + /* rgap is the size of unused space at the start of the datatype */ + reorder_buf_start = reorder_buf - rgap; + } + + } + + /* allocate the intermediary buffer + * * to gather on leaders on the low sub communicator */ + char *tmp_buf = NULL; // allocated memory + char *tmp_buf_start = NULL; // start of the data + if (low_rank == root_low_rank) { + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&rdtype->super, + 
(int64_t)rcount * low_size, + &rgap); + tmp_buf = (char *) malloc(rsize); + tmp_buf_start = tmp_buf - rgap; + } + + /* 1. low gather on nodes leaders */ + low_comm->c_coll->coll_gather((char *)sbuf, + scount, + sdtype, + tmp_buf_start, + rcount, + rdtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); + + /* 2. upper gather (inter-node) between node leaders */ + if (low_rank == root_low_rank) { + up_comm->c_coll->coll_gather((char *)tmp_buf_start, + scount*low_size, + sdtype, + (char *)reorder_buf_start, + rcount*low_size, + rdtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); + + if (tmp_buf != NULL) { + free(tmp_buf); + tmp_buf = NULL; + tmp_buf_start = NULL; + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] Future Gather: ug gather finish\n", t->w_rank)); + } + + /* 3. reorder data on root into rbuf + * if ranks are not mapped in topological order, data needs to be reordered + * (see reorder_gather) + */ + if (w_rank == root && !han_module->is_mapbycore) { + ompi_coll_han_reorder_gather(reorder_buf_start, + rbuf, rcount, rdtype, + comm, topo); + free(reorder_buf); + } + + return OMPI_SUCCESS; +} + +/* Reorder after gather operation, for unordered ranks + * + * Suppose, the expected message is 0 1 2 3 4 5 6 7 but the processes are + * mapped on 2 nodes, for example |0 2 4 6| |1 3 5 7|. The messages from + * low gather will be 0 2 4 6 and 1 3 5 7. + * So the upper gather result is 0 2 4 6 1 3 5 7 which must be reordered. + * The 3rd element (4) must be recopied at the 4th place. 
In general, the + * i-th element must be recopied at the place given by the i-th entry of the + * topology, which is topo[i*topolevel +1] + */ +void +ompi_coll_han_reorder_gather(const void *sbuf, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo) { + int i; + int topolevel = 2; // always 2 levels in topo + int w_rank = ompi_comm_rank(comm); + int w_size = ompi_comm_size(comm); + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + for (i=0; i -#include -#ifdef HAVE_SCHED_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif /* HAVE_SYS_MMAN_H */ -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ - #include "mpi.h" -#include "opal_stdint.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/util/os_path.h" - -#include "ompi/communicator/communicator.h" -#include "ompi/group/group.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/coll/base/base.h" -#include "ompi/proc/proc.h" #include "coll_han.h" - -#include "ompi/mca/coll/base/coll_tags.h" -#include "ompi/mca/pml/pml.h" -#include -#include +#include "coll_han_dynamic.h" /* * Local functions */ -static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); +static int han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm); static int mca_coll_han_module_disable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); /* * Module constructor */ +static void han_module_clear(mca_coll_han_module_t *han_module) +{ + int i; + + for (i = 0; i < COLLCOUNT; i++) { + /* + * Since the previous routines function pointers are declared as + * a union, initializing the dummy routineis enough + */ + han_module->previous_routines[i].previous_routine.dummy = NULL; + han_module->previous_routines[i].previous_module = NULL; + } + han_module->reproducible_reduce = NULL; + han_module->reproducible_reduce_module = NULL; + 
han_module->reproducible_allreduce = NULL; + han_module->reproducible_allreduce_module = NULL; +} + static void mca_coll_han_module_construct(mca_coll_han_module_t * module) { + int i; + module->enabled = false; module->super.coll_module_disable = mca_coll_han_module_disable; module->cached_comm = NULL; @@ -62,27 +58,47 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) module->cached_vranks = NULL; module->cached_topo = NULL; module->is_mapbycore = false; + module->storage_initialized = false; + for (i = 0 ; i < NB_TOPO_LVL ; i++) { + module->sub_comm[i] = NULL; + } + for (i=SELF ; imodules_storage.modules[i].module_handler = NULL; + } + + module->dynamic_errors = 0; + + han_module_clear(module); } + +#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ + if (NULL != (obj)) { \ + OBJ_RELEASE(obj); \ + } \ +} while (0) + /* * Module destructor */ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) { + int i; + module->enabled = false; if (module->cached_low_comms != NULL) { - ompi_comm_free(&(module->cached_low_comms[0])); - ompi_comm_free(&(module->cached_low_comms[1])); - module->cached_low_comms[0] = NULL; - module->cached_low_comms[1] = NULL; + for (i = 0; i < COLL_HAN_LOW_MODULES; i++) { + ompi_comm_free(&(module->cached_low_comms[i])); + module->cached_low_comms[i] = NULL; + } free(module->cached_low_comms); module->cached_low_comms = NULL; } if (module->cached_up_comms != NULL) { - ompi_comm_free(&(module->cached_up_comms[0])); - ompi_comm_free(&(module->cached_up_comms[1])); - module->cached_up_comms[0] = NULL; - module->cached_up_comms[1] = NULL; + for (i = 0; i < COLL_HAN_UP_MODULES; i++) { + ompi_comm_free(&(module->cached_up_comms[i])); + module->cached_up_comms[i] = NULL; + } free(module->cached_up_comms); module->cached_up_comms = NULL; } @@ -94,21 +110,27 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) free(module->cached_topo); module->cached_topo = NULL; } -} + for(i=0 ; 
isub_comm[i]) { + ompi_comm_free(&(module->sub_comm[i])); + } + } -/* - * Module disable - */ -static int mca_coll_han_module_disable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) -{ - return OMPI_SUCCESS; + OBJ_RELEASE_IF_NOT_NULL(module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(module->previous_scatter_module); + + han_module_clear(module); } OBJ_CLASS_INSTANCE(mca_coll_han_module_t, mca_coll_base_module_t, - mca_coll_han_module_construct, mca_coll_han_module_destruct); + mca_coll_han_module_construct, + mca_coll_han_module_destruct); /* * Initial query function that is invoked during MPI_INIT, allowing @@ -116,7 +138,8 @@ OBJ_CLASS_INSTANCE(mca_coll_han_module_t, * required level of thread support. This function is invoked exactly * once. */ -int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_threads) +int mca_coll_han_init_query(bool enable_progress_threads, + bool enable_mpi_threads) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:init_query: pick me! pick me!"); @@ -129,16 +152,23 @@ int mca_coll_han_init_query(bool enable_progress_threads, bool enable_mpi_thread * Look at the communicator and decide which set of functions and * priority we want to return. 
*/ -mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) +mca_coll_base_module_t * +mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) { mca_coll_han_module_t *han_module; - /* If we're intercomm, or if there's only one process in the - communicator */ - if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) - || !ompi_group_have_remote_peers(comm->c_local_group)) { + /* + * If we're intercomm, or if there's only one process in the communicator + */ + if (OMPI_COMM_IS_INTER(comm)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "coll:han:comm_query (%d/%s): intercomm, comm is too small, only on one node; disqualifying myself", + "coll:han:comm_query (%d/%s): intercomm; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } + if (1 == ompi_comm_size(comm)) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm is too small; disqualifying myself", comm->c_contextid, comm->c_name); return NULL; } @@ -159,24 +189,53 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com } /* All is good -- return a module */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = NULL; //mca_coll_han_allgather_intra; - han_module->super.coll_allgatherv = NULL; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra; - han_module->super.coll_reduce_scatter = NULL; - 
han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra; - han_module->super.coll_scatterv = NULL; + han_module->topologic_level = mca_coll_han_component.topo_level; + + /* + * TODO: When the selector is fully implemented, + * this if will be meaningless + */ + if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { + /* We are on the global communicator, return topological algorithms */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + han_module->super.coll_allgatherv = NULL; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_scatterv = NULL; + } else { + /* We are on a topologic sub-communicator, return only the selector */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_bcast = 
mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_scatterv = NULL; + } opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:comm_query (%d/%s): pick me! pick me!", @@ -185,14 +244,71 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t * com } +/* + * In this macro, the following variables are supposed to have been declared + * in the caller: + * . ompi_communicator_t *comm + * . mca_coll_han_module_t *han_module + */ +#define HAN_SAVE_PREV_COLL_API(__api) do { \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + return OMPI_ERROR; \ + } \ + /* TODO add a OBJ_RELEASE at module disabling */ \ + /* + FIXME find why releasing generates memory corruption */ \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + } while(0) + /* * Init module on the communicator */ -static int han_module_enable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm) +static int han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { + mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; + + HAN_SAVE_PREV_COLL_API(allgather); + HAN_SAVE_PREV_COLL_API(allgatherv); + HAN_SAVE_PREV_COLL_API(allreduce); + HAN_SAVE_PREV_COLL_API(bcast); + 
HAN_SAVE_PREV_COLL_API(gather); + HAN_SAVE_PREV_COLL_API(reduce); + HAN_SAVE_PREV_COLL_API(scatter); + + /* set reproducible algos */ + mca_coll_han_reduce_reproducible_decision(comm, module); + mca_coll_han_allreduce_reproducible_decision(comm, module); + return OMPI_SUCCESS; } +/* + * Module disable + */ +static int mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) +{ + mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; + + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + han_module_clear(han_module); + + return OMPI_SUCCESS; +} + + /* * Free the han request */ @@ -203,266 +319,3 @@ int han_request_free(ompi_request_t ** request) *request = MPI_REQUEST_NULL; return OMPI_SUCCESS; } - -/* Create the communicators used in the HAN module */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module) -{ - /* Use cached communicators if possible */ - if (han_module->cached_comm == comm && han_module->cached_low_comms != NULL - && han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) { - return; - } - /* Create communicators if there is no cached communicator */ - else { - int low_rank, low_size; - int up_rank; - int w_rank = ompi_comm_rank(comm); - int w_size = ompi_comm_size(comm); - ompi_communicator_t **low_comms = - (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); - ompi_communicator_t **up_comms = - (struct ompi_communicator_t **) malloc(sizeof(struct ompi_communicator_t *) * 2); - /* Create low_comms which 
contain all the process on a node */ - const int *origin_priority = NULL; - /* Lower the priority of HAN module */ - int han_var_id; - int tmp_han_priority = 0; - int tmp_han_origin = 0; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - tmp_han_origin = *origin_priority; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, - NULL); - comm->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling; - comm->c_coll->coll_allgather = ompi_coll_base_allgather_intra_bruck; - - int var_id; - int tmp_priority = 100; - int tmp_origin = 0; - /* Set up low_comms[0] with sm module */ - mca_base_var_find_by_name("coll_sm_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] sm_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), - &(low_comms[0])); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - low_size = ompi_comm_size(low_comms[0]); - low_rank = ompi_comm_rank(low_comms[0]); - - /* Set up low_comms[1] with solo module */ - mca_base_var_find_by_name("coll_solo_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] solo_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, 
sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, (opal_info_t *) (&ompi_mpi_info_null), - &(low_comms[1])); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Create up_comms[0] with libnbc which contain one process per node (across nodes) */ - mca_base_var_find_by_name("coll_libnbc_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] libnbc_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split(comm, low_rank, w_rank, &(up_comms[0]), false); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - up_rank = ompi_comm_rank(up_comms[0]); - - /* Create up_comms[1] with adapt which contain one process per node (across nodes) */ - mca_base_var_find_by_name("coll_adapt_priority", &var_id); - mca_base_var_get_value(var_id, &origin_priority, NULL, NULL); - tmp_origin = *origin_priority; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] adapt_priority origin %d %d\n", w_rank, *origin_priority, - tmp_origin)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - ompi_comm_split(comm, low_rank, w_rank, &(up_comms[1]), false); - mca_base_var_set_value(var_id, &tmp_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - - int *vranks = malloc(sizeof(int) * w_size); - /* Do allgather to gather vrank from each process so every process knows other processes' vrank */ - int vrank = low_size * up_rank + low_rank; - ompi_coll_base_allgather_intra_bruck(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, - 
comm->c_coll->coll_allgather_module); - han_module->cached_comm = comm; - han_module->cached_low_comms = low_comms; - han_module->cached_up_comms = up_comms; - han_module->cached_vranks = vranks; - - mca_base_var_set_value(han_var_id, &tmp_han_origin, sizeof(int), MCA_BASE_VAR_SOURCE_SET, - NULL); - comm->c_coll->coll_allreduce = mca_coll_han_allreduce_intra; - comm->c_coll->coll_allgather = mca_coll_han_allgather_intra; - } -} - -int mca_coll_han_pow10_int(int pow_value) -{ - int i, result = 1; - for (i = 0; i < pow_value; i++) { - result *= 10; - } - return result; -} - -int mca_coll_han_hostname_to_number(char *hostname, int size) -{ - int i = 0, j = 0; - char *number_array = (char *) malloc(sizeof(char) * size); - while (hostname[i] != '\0') { - if (hostname[i] >= '0' && hostname[i] <= '9') { - number_array[j++] = hostname[i]; - } - i++; - } - int number = 0; - for (i = 0; i < j; i++) { - number += (number_array[i] - '0') * mca_coll_han_pow10_int(j - 1 - i); - } - free(number_array); - return number; -} - -void mca_coll_han_topo_get(int *topo, struct ompi_communicator_t *comm, int num_topo_level) -{ - int *self_topo = (int *) malloc(sizeof(int) * num_topo_level); - /* Set daemon vpid */ - char hostname[1024]; - gethostname(hostname, 1024); - self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); - /* Set core id */ - self_topo[1] = ompi_comm_rank(comm); - - /* Allgather all the topology information */ - ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, topo, num_topo_level, - MPI_INT, comm, comm->c_coll->coll_allgather_module); - free(self_topo); - return; -} - -void mca_coll_han_topo_sort(int *topo, int start, int end, int size, int level, int num_topo_level) -{ - if (level > num_topo_level - 1 || start >= end) { - return; - } - int i, j; - int min = INT_MAX; - int min_loc = -1; - for (i = start; i <= end; i++) { - /* Find min */ - for (j = i; j <= end; j++) { - if (topo[j * num_topo_level + level] < min) { - min = topo[j * 
num_topo_level + level]; - min_loc = j; - - } - } - /* Swap i and min_loc */ - int temp; - for (j = 0; j < num_topo_level; j++) { - temp = topo[i * num_topo_level + j]; - topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; - topo[min_loc * num_topo_level + j] = temp; - } - min = INT_MAX; - min_loc = -1; - } - int last = 0; - int new_start = 0; - int new_end = 0; - for (i = start; i <= end; i++) { - if (i == start) { - last = topo[i * num_topo_level + level]; - new_start = start; - } else if (i == end) { - new_end = end; - mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); - } else if (last != topo[i * num_topo_level + level]) { - new_end = i - 1; - mca_coll_han_topo_sort(topo, new_start, new_end, size, level + 1, num_topo_level); - new_start = i; - last = topo[i * num_topo_level + level]; - } - } - return; -} - -/* Check if the current processes are mapped by core */ -bool mca_coll_han_topo_is_mapbycore(int *topo, struct ompi_communicator_t * comm, - int num_topo_level) -{ - int i; - int size = ompi_comm_size(comm); - for (i = 1; i < size; i++) { - if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level] - || topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { - return false; - - } - } - return true; -} - -int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, - int num_topo_level) -{ - int size; - size = ompi_comm_size(comm); - int *topo; - if ((han_module->cached_topo != NULL) && (han_module->cached_comm == comm)) { - topo = han_module->cached_topo; - } - else { - if (han_module->cached_topo != NULL) { - free(han_module->cached_topo); - han_module->cached_topo = NULL; - } - topo = (int *) malloc(sizeof(int) * size * num_topo_level); - /* Get topo infomation */ - mca_coll_han_topo_get(topo, comm, num_topo_level); - mca_coll_han_topo_print(topo, comm, num_topo_level); - - /* Check if the processes are mapped by core */ - han_module->is_mapbycore = 
mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); - /* Sort the topo such that each group is contiguous */ - if (!han_module->is_mapbycore) { - mca_coll_han_topo_sort(topo, 0, size - 1, size, 0, num_topo_level); - } - han_module->cached_topo = topo; - han_module->cached_comm = comm; - } - - mca_coll_han_topo_print(topo, comm, num_topo_level); - return topo; -} - -/* Print out the topology info, for debugging purpose */ -void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - if (rank == 0) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: HAN topo: ", rank)); - int i; - for (i = 0; i < size * num_topo_level; i++) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); - } - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); - - } -} diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index f6137a8cd0..d0dc337ce8 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -2,6 +2,7 @@ * Copyright (c) 2018-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -69,8 +70,24 @@ mca_coll_han_reduce_intra(const void *sbuf, size_t typelng; ompi_datatype_type_size(dtype, &typelng); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + /* Do not initialize topology if the operation cannot commute */ + if(!ompi_op_is_commute(op)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. 
It needs to fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -133,6 +150,11 @@ mca_coll_han_reduce_intra(const void *sbuf, free(t); return OMPI_SUCCESS; + +prev_reduce_intra: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + han_module->previous_reduce_module); } /* t0 task: issue and wait for the low level reduce of segment 0 */ @@ -189,4 +211,178 @@ int mca_coll_han_reduce_t1_task(void *task_argu) { } return OMPI_SUCCESS; -} \ No newline at end of file +} + +/* In case of non regular situation (imbalanced number of processes per nodes), + * a fallback is made on the next component that provides a reduce in priority order */ +int +mca_coll_han_reduce_intra_simple(const void *sbuf, + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank; /* information about the global communicator */ + int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ + int ret; + int *vranks, low_rank, low_size; + ptrdiff_t rsize, rgap = 0; + void * tmp_buf; + + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* Do not initialize topology if the operation cannot commute */ + if(!ompi_op_is_commute(op)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this operation. 
It needs to fall back on another component\n")); + goto prev_reduce_intra_simple; + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); + goto prev_reduce_intra_simple; + } + + mca_coll_han_comm_create(comm, han_module); + ompi_communicator_t *low_comm = + han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; + ompi_communicator_t *up_comm = + han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; + + /* Get the 'virtual ranks' mapping corresponding to the communicators */ + vranks = han_module->cached_vranks; + w_rank = ompi_comm_rank(comm); + low_rank = ompi_comm_rank(low_comm); + + low_size = ompi_comm_size(low_comm); + /* Get root ranks for low and up comms */ + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); + + if (root_low_rank == low_rank && w_rank != root) { + rsize = opal_datatype_span(&dtype->super, (int64_t)count, &rgap); + tmp_buf = malloc(rsize); + if (NULL == tmp_buf) { + return OMPI_ERROR; + } + /* opal_datatype_span() reports in rgap the offset of the first byte + * really used by the datatype: shift the pointer so that this byte + * lands at the start of the allocation. Every free() below undoes + * the shift. */ + tmp_buf = (char *)tmp_buf - rgap; + } else { + /* global root rbuf is valid, local non-root do not need buffers */ + tmp_buf = rbuf; + } + /* No need to handle MPI_IN_PLACE: only the global root may ask for it and + * it is ok to use it for intermediary reduces since it is also a local root*/ + + /* Low_comm reduce */ + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)tmp_buf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + if (root_low_rank == low_rank && w_rank != root){ + free((char *)tmp_buf + rgap); + } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: low comm reduce failed. 
" + "Falling back to another component\n")); + goto prev_reduce_intra_simple; + } + + /* Up_comm reduce */ + if (root_low_rank == low_rank ){ + if(w_rank != root){ + ret = up_comm->c_coll->coll_reduce((char *)tmp_buf, NULL, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + free((char *)tmp_buf + rgap); + } else { + /* Take advantage of any optimisation made for IN_PLACE + * communications */ + ret = up_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)tmp_buf, + count, dtype, op, root_up_rank, + up_comm, up_comm->c_coll->coll_reduce_module); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "HAN/REDUCE: up comm reduce failed.\n")); + return ret; + } + + } + return OMPI_SUCCESS; + +prev_reduce_intra_simple: + return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, + comm, + han_module->previous_reduce_module); +} + + +/* Find a fallback on reproducible algorithm + * use tuned or basic or if impossible whatever available + */ +int +mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + + /* populate previous modules_storage*/ + mca_coll_han_get_all_coll_modules(comm, han_module); + + /* try availability of reproducible modules */ + int fallbacks[] = {TUNED, BASIC}; + int fallbacks_len = sizeof(fallbacks) / sizeof(*fallbacks); + int i; + for (i=0; i<fallbacks_len; i++) { + int fallback = fallbacks[i]; + mca_coll_base_module_t *fallback_module = han_module->modules_storage + .modules[fallback] + .module_handler; + if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { + if (0 == w_rank) { + opal_output_verbose(30, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible: " + "fallback on %s\n", + components_name[fallback]); + } + han_module->reproducible_reduce_module = fallback_module; + han_module->reproducible_reduce = fallback_module->coll_reduce; + return OMPI_SUCCESS; + } + } + /* 
fallback of the fallback */ + if (0 == w_rank) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:reduce_reproducible_decision: " + "no reproducible fallback\n"); + } + han_module->reproducible_reduce_module = + han_module->previous_reduce_module; + han_module->reproducible_reduce = han_module->previous_reduce; + return OMPI_SUCCESS; +} + + +/* Fallback on reproducible algorithm */ +int +mca_coll_han_reduce_reproducible(const void *sbuf, + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + return han_module->reproducible_reduce(sbuf, rbuf, count, dtype, + op, root, comm, + han_module + ->reproducible_reduce_module); +} diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index 90d92659e1..b2a8752938 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -66,13 +66,23 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator. 
It needs to fall back on another component\n")); + goto prev_scatter_intra; + } + + /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -93,6 +103,8 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, int root_low_rank; int root_up_rank; + + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, @@ -105,7 +117,6 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, */ char *reorder_buf = NULL; char *reorder_sbuf = NULL; - int *topo = mca_coll_han_topo_init(comm, han_module, 2); if (w_rank == root) { /* If the processes are mapped-by core, no need to reorder */ @@ -154,6 +165,11 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; +prev_scatter_intra: + return han_module->previous_scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + han_module->previous_scatter_module); } /* us: upper level (intra-node) scatter task */ diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c new file mode 100644 index 0000000000..e99f3e614b --- /dev/null +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "coll_han.h" +#include "coll_han_dynamic.h" + + +/* + * Local functions + */ +static void create_intranode_comm_new(ompi_communicator_t *, + ompi_communicator_t **); +static void create_internode_comm_new(ompi_communicator_t *, + int, int, + ompi_communicator_t **); +static void create_intranode_comm(ompi_communicator_t *, + const char *, + int, + ompi_communicator_t **); +static void create_internode_comm(ompi_communicator_t *, + const char *, + int, int, + ompi_communicator_t **); + +/** + * Create a sub-communicator containing the ranks that share my node. + * + * @param comm (IN) original communicator for the collective + * target module priority + * @param sub_comm (OUT) created sub-communicator + */ +static void create_intranode_comm_new(ompi_communicator_t *comm, + ompi_communicator_t **sub_comm) +{ + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *)(&ompi_mpi_info_null), sub_comm); + return; +} + +/** + * Create a sub-communicator containing one rank per node. 
+ * + * @param comm (IN) original communicator for the collective + * @param my_rank (IN) my rank in comm + * @param intra_rank (IN) local rank in the intra-node sub-communicator + * @param sub_comm (OUT) created sub-communicator + */ +static void create_internode_comm_new(ompi_communicator_t *comm, + int my_rank, + int intra_rank, + ompi_communicator_t **sub_comm) +{ + ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); + return; +} + +/* + * Routine that creates the local hierarchical sub-communicators + * Called each time a collective is called. + * comm: input communicator of the collective + */ +void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size; + int up_rank; + int w_rank; + int w_size; + ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); + ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); + const int *origin_priority; + int han_var_id; + int tmp_han_priority; + int vrank, *vranks; + + mca_coll_base_module_allreduce_fn_t old_allreduce; + mca_coll_base_module_t *old_allreduce_module; + + mca_coll_base_module_allgather_fn_t old_allgather; + mca_coll_base_module_t *old_allgather_module; + + mca_coll_base_module_bcast_fn_t old_bcast; + mca_coll_base_module_t *old_bcast_module; + + mca_coll_base_module_gather_fn_t old_gather; + mca_coll_base_module_t *old_gather_module; + + mca_coll_base_module_reduce_fn_t old_reduce; + mca_coll_base_module_t *old_reduce_module; + + /* The sub communicators have already been created */ + if (NULL != han_module->sub_comm[INTRA_NODE] + && NULL != han_module->sub_comm[INTER_NODE] + && NULL != han_module->cached_vranks) { + return; + } + + /* + * We cannot use han allreduce and allgather without sub-communicators + * Temporary set previous ones + * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the 
allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + old_allreduce = comm->c_coll->coll_allreduce; + old_allreduce_module = comm->c_coll->coll_allreduce_module; + + old_allgather = comm->c_coll->coll_allgather; + old_allgather_module = comm->c_coll->coll_allgather_module; + + old_reduce = comm->c_coll->coll_reduce; + old_reduce_module = comm->c_coll->coll_reduce_module; + + old_bcast = comm->c_coll->coll_bcast; + old_bcast_module = comm->c_coll->coll_bcast_module; + + old_gather = comm->c_coll->coll_gather; + old_gather_module = comm->c_coll->coll_gather_module; + + comm->c_coll->coll_allreduce = han_module->previous_allreduce; + comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; + + comm->c_coll->coll_allgather = han_module->previous_allgather; + comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + + comm->c_coll->coll_reduce = han_module->previous_reduce; + comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; + + comm->c_coll->coll_bcast = han_module->previous_bcast; + comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; + + comm->c_coll->coll_gather = han_module->previous_gather; + comm->c_coll->coll_gather_module = han_module->previous_gather_module; + + /* Create topological sub-communicators */ + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + + origin_priority = NULL; + mca_base_var_find_by_name("coll_han_priority", &han_var_id); + mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); + /* + * mca_base_var_get_value() returns a pointer to the variable's own + * storage: copy the current value now, because the set_value below + * overwrites that storage and restoring through the pointer would be a + * no-op. + */ + int saved_han_priority = (NULL != origin_priority) ? *origin_priority : 0; + + /* + * Maximum priority for selector on sub-communicators + */ + tmp_han_priority = 100; + mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * This sub-communicator contains the ranks that share my node. 
+ */ + mca_coll_han_component.topo_level = INTRA_NODE; + create_intranode_comm_new(comm, low_comm); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(*low_comm); + low_rank = ompi_comm_rank(*low_comm); + + /* + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + mca_coll_han_component.topo_level = INTER_NODE; + create_internode_comm_new(comm, w_rank, low_rank, up_comm); + + up_rank = ompi_comm_rank(*up_comm); + + /* + * Set my virtual rank number. + * my rank # = (intra-node comm size) * (inter-node rank number) + * + (intra-node rank number) + * WARNING: this formula works only if the ranks are perfectly spread over + * the nodes + * TODO: find a better way of doing + */ + vrank = low_size * up_rank + low_rank; + vranks = (int *)malloc(sizeof(int) * w_size); + /* + * gather vrank from each process so every process will know other processes + * vrank + */ + comm->c_coll->coll_allgather(&vrank, + 1, + MPI_INT, + vranks, + 1, + MPI_INT, + comm, + comm->c_coll->coll_allgather_module); + + /* + * Set the cached info + */ + han_module->cached_vranks = vranks; + + /* + * Come back to the original han module priority (from the saved copy) + */ + mca_base_var_set_value(han_var_id, &saved_han_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* Put allreduce, allgather, reduce, bcast and gather back */ + comm->c_coll->coll_allreduce = old_allreduce; + comm->c_coll->coll_allreduce_module = old_allreduce_module; + + comm->c_coll->coll_allgather = old_allgather; + comm->c_coll->coll_allgather_module = old_allgather_module; + + comm->c_coll->coll_reduce = old_reduce; + comm->c_coll->coll_reduce_module = old_reduce_module; + + comm->c_coll->coll_bcast = old_bcast; + comm->c_coll->coll_bcast_module = old_bcast_module; + + comm->c_coll->coll_gather = old_gather; + comm->c_coll->coll_gather_module = old_gather_module; + + mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; +} + +/** + * Create a sub-communicator containing the ranks that share my node. 
+ * Associate this sub-communicator a given collective module. + * module can be one of: + * . sm + * . shared + * + * @param comm (IN) original communicator for the collective + * @param prio_string (IN) string containing the mca variable associated to + * target module priority + * @param my_rank (IN) my rank in comm + * @param sub_comm (OUT) created sub-communicator + */ +static void create_intranode_comm(ompi_communicator_t *comm, + const char *prio_string, + int my_rank, + ompi_communicator_t **sub_comm) +{ + int var_id; + const int *sav_priority; + int tmp_priority = 100; + + /* + * Upgrade the target module priority to make the resulting sub-communicator + * use that collective module + */ + mca_base_var_find_by_name(prio_string, &var_id); + mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] %s origin %d\n", + my_rank, prio_string, *sav_priority)); + + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + /* + * Create the sub-communicator + * Since the target priority has been set to the highest value, this + * sub-communicator will inherit it as a collective module. + */ + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + (opal_info_t *)(&ompi_mpi_info_null), sub_comm); + /* + * Come back to the target module's original priority + */ + mca_base_var_set_value(var_id, sav_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + return; +} + +/** + * Create a sub-communicator containing one rank per node. + * Associate this sub-communicator a given collective module. + * module can be one of: + * . libnbc + * . 
adapt + * + * @param comm (IN) original communicator for the collective + * @param prio_string (IN) string containing the mca variable associated to + * target module priority + * @param my_rank (IN) my rank in comm + * @param intra_rank (IN) local rank in the intra-node sub-communicator + * @param sub_comm (OUT) created sub-communicator + */ +static void create_internode_comm(ompi_communicator_t *comm, + const char *prio_string, + int my_rank, + int intra_rank, + ompi_communicator_t **sub_comm) +{ + int var_id; + const int *sav_priority; + int tmp_priority = 100; + + /* + * Upgrade the target module priority to make the resulting sub-communicator + * use that collective module + */ + mca_base_var_find_by_name(prio_string, &var_id); + mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "[%d] %s origin %d\n", my_rank, prio_string, + *sav_priority)); + mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * Create the sub-communicator + * Since the target priority has been set to the highest value, this + * sub-communicator will inherit it as a collective module. + */ + ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); + mca_base_var_set_value(var_id, sav_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + return; +} + + +/* + * Routine that creates the local hierarchical sub-communicators + * Called each time a collective is called. 
+ * comm: input communicator of the collective + */ +void mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) +{ + int low_rank, low_size; + int up_rank; + int w_rank; + int w_size; + ompi_communicator_t **low_comms; + ompi_communicator_t **up_comms; + const int *origin_priority; + int han_var_id; + int tmp_han_priority; + int vrank, *vranks; + + mca_coll_base_module_allreduce_fn_t old_allreduce; + mca_coll_base_module_t *old_allreduce_module; + mca_coll_base_module_allgather_fn_t old_allgather; + mca_coll_base_module_t *old_allgather_module; + + /* use cached communicators if possible */ + if (han_module->cached_comm == comm && + han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { + return; + } + + /* We cannot use han allreduce and allgather without sub-communicators + * Temporary set previous ones */ + old_allreduce = comm->c_coll->coll_allreduce; + old_allreduce_module = comm->c_coll->coll_allreduce_module; + + old_allgather = comm->c_coll->coll_allgather; + old_allgather_module = comm->c_coll->coll_allgather_module; + + comm->c_coll->coll_allreduce = han_module->previous_allreduce; + comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; + + comm->c_coll->coll_allgather = han_module->previous_allgather; + comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + + /* create communicators if there is no cached communicator */ + + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * + sizeof(struct ompi_communicator_t *)); + up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * + sizeof(struct ompi_communicator_t *)); + origin_priority = NULL; + mca_base_var_find_by_name("coll_han_priority", &han_var_id); + mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); + + /* + * Lower down our current 
priority + */ + tmp_han_priority = 0; + mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); + mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* + * Upgrade sm module priority to set up low_comms[0] with sm module + * This sub-communicator contains the ranks that share my node. + */ + create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); + + /* + * Get my local rank and the local size + */ + low_size = ompi_comm_size(low_comms[0]); + low_rank = ompi_comm_rank(low_comms[0]); + + /* + * Upgrade shared module priority to set up low_comms[1] with shared module + * This sub-communicator contains the ranks that share my node. + */ + create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); + + /* + * Upgrade libnbc module priority to set up up_comms[0] with libnbc module + * This sub-communicator contains one process per node: processes with the + * same intra-node rank id share such a sub-communicator + */ + create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, + &(up_comms[0])); + + up_rank = ompi_comm_rank(up_comms[0]); + + /* + * Upgrade adapt module priority to set up up_comms[0] with adapt module + * This sub-communicator contains one process per node. + */ + create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, + &(up_comms[1])); + + /* + * Set my virtual rank number. 
+ * my rank # = * + * + + * WARNING: this formula works only if the ranks are perfectly spread over + * the nodes + * TODO: find a better way of doing + */ + vrank = low_size * up_rank + low_rank; + vranks = (int *)malloc(sizeof(int) * w_size); + /* + * gather vrank from each process so every process will know other processes + * vrank + */ + comm->c_coll->coll_allgather(&vrank, 1, MPI_INT, vranks, 1, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + + /* + * Set the cached info + */ + han_module->cached_comm = comm; + han_module->cached_low_comms = low_comms; + han_module->cached_up_comms = up_comms; + han_module->cached_vranks = vranks; + + /* + * Come back to the original han module priority + */ + mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), + MCA_BASE_VAR_SOURCE_SET, NULL); + + /* Put allreduce and allgather back */ + comm->c_coll->coll_allreduce = old_allreduce; + comm->c_coll->coll_allreduce_module = old_allreduce_module; + + comm->c_coll->coll_allgather = old_allgather; + comm->c_coll->coll_allgather_module = old_allgather_module; +} + + diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c new file mode 100644 index 0000000000..cbcfd698d0 --- /dev/null +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Warning: this is not for the faint of heart -- don't even bother + * reading this source code if you don't have a strong understanding + * of nested data structures and pointer math (remember that + * associativity and order of C operations is *critical* in terms of + * pointer math!). 
+ */ + +#include "ompi_config.h" + +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#ifdef HAVE_STDLIB_H +#include +#endif /* HAVE_STDLIB_H */ + + +#include "mpi.h" +#include "coll_han.h" + + +/* + * Local functions + */ + +static int mca_coll_han_hostname_to_number(char* hostname, int size); +static void mca_coll_han_topo_get(int *topo, + struct ompi_communicator_t* comm, + int num_topo_level); +static void mca_coll_han_topo_sort(int *topo, int start, int end, + int level, int num_topo_level); +static bool mca_coll_han_topo_is_mapbycore(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level); +static void mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level); + + +/* + * takes the number part of a host: hhh2031 -->2031 + */ +static int mca_coll_han_hostname_to_number(char* hostname, int size) +{ + int i, j; + char *number_array = (char *)malloc(sizeof(char) * size); + int number = 0; + + for (i = 0, j = 0; hostname[i] != '\0'; i++) { + if ('0' <= hostname[i] && '9' >= hostname[i]) { + number_array[j++] = hostname[i]; + } + } + number_array[j] = '\0'; + number = atoi(number_array); + free(number_array); + return number; +} + +/* + * Set the virtual topo id. It is made of num_topo_level ints (2 today): + * . the integer part of the host id + * . 
the rank in the main communicator + * Gather the virtual topoid from each process so every process will know other + * processes virtual topids + */ +static void mca_coll_han_topo_get(int *topo, + struct ompi_communicator_t* comm, + int num_topo_level) +{ + int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); + char hostname[1024]; + + gethostname(hostname, 1024); + self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); + self_topo[1] = ompi_comm_rank(comm); + + ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, + topo, num_topo_level, MPI_INT, comm, + comm->c_coll->coll_allgather_module); + free(self_topo); + + return; +} + +/* + * Sort the topology array in order to have ranks sharing the same node + * contiguous in the topology array. + * Called from topo_init whenever the processes are not mapped by core. + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 + * This is to have the gather result in the right order + * + * @param topo (IN/OUT) topology description array (sorted in out) + * @param start (IN) where to begin the processing + * The index in topo will actually be: + * start * num_topo_level + level + * topo contains num_topo_level ids per rank. + * @param end (IN) where to stop the processing + * The index in topo will actually be: + * end * num_topo_level + level + * topo contains num_topo_level ids per rank. 
+ * @param level (IN) level number we are currently processing + * @param num_topo_level (IN) number of topological levels + * + */ +static void mca_coll_han_topo_sort(int *topo, int start, int end, + int level, int num_topo_level) +{ + int i, j; + int min, min_loc; + int last, new_start, new_end; + + if (level > num_topo_level-1 || start >= end) { + return; + } + + min = INT_MAX; + min_loc = -1; + for (i = start; i <= end; i++) { + int temp; + /* get the min value for current level and its location */ + for (j = i; j <= end; j++) { + /* topo contains num_topo_level ids per rank. */ + if (topo[j * num_topo_level + level] < min) { + min = topo[j*num_topo_level+level]; + min_loc = j; + + } + } + /* + * swap i and min_loc + * We have num_topo_level ids to swap + */ + for (j = 0; j < num_topo_level; j++) { + temp = topo[i * num_topo_level + j]; + topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; + topo[min_loc * num_topo_level + j] = temp; + } + min = INT_MAX; + min_loc = -1; + } + + /* Process next level */ + last = 0; + new_start = 0; + new_end = 0; + for (i = start; i <= end; i++) { + if (i == start) { + last = topo[i * num_topo_level + level]; + new_start = start; + } else if (i == end) { + new_end = end; + mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, + num_topo_level); + } else if (last != topo[i * num_topo_level + level]) { + new_end = i - 1; + mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, + num_topo_level); + new_start = i; + last = topo[i * num_topo_level + level]; + } + } + return; +} + +/* + * Check whether the ranks in the communicator given as input are mapped by core + * Mapped by core: each node is first filled with as many ranks as needed before + * moving to the next one + * This is checked as follows: + * . 
2 contiguous ranks should be either on the same node or on node ids in + * ascending order + * The topology is actually an array of ints: + * +----------+-------+----------+-------+------+----------+-------+-----+ + * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... | + * +----------+-------+----------+-------+------+----------+-------+-----+ + */ +static bool mca_coll_han_topo_is_mapbycore(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ + int i; + int size = ompi_comm_size(comm); + + for (i = 1; i < size; i++) { + /* + * The host id for a given rank should be < host id for the next rank + */ + if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) { + return false; + } + /* + * For the same host id, consecutive ranks should be sorted in + * ascending order. + */ + if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { + return false; + } + } + return true; +} + +/* The topo is supposed sorted by host */ +static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level){ + int i; + int size = ompi_comm_size(comm); + if (size < 2){ + return false; + } + int ppn; + int last_host = topo[0]; + + /* Find the ppn for the first node */ + for (i = 1; i < size; i++) { + if (topo[i * num_topo_level] != last_host){ + break; + } + } + ppn = i; + + /* All on one node */ + if ( size == ppn){ + return false; + } + /* Trivial case */ + if (size % ppn != 0){ + return true; + } + + last_host = topo[ppn * num_topo_level]; + /* Check that the 2nd and next hosts also this ppn. 
Since the topo is sorted + * one just need to jump ppn ranks to check the supposed switch of host */ + for (i = 2 * ppn; i < size; i += ppn ){ + /* the list of ranks for the last known host have ended before */ + if (topo[(i-1) * num_topo_level] != last_host){ + return true; + } + /* the list of ranks for the last known host are bigger than excpected */ + if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){ + return true; + } + last_host = topo[i * num_topo_level]; + } + /* Check the last host */ + if (topo[(size-1) * num_topo_level] != last_host){ + return true; + } + + return false; +} + + +/** + * Topology initialization phase + * Called each time a collective that needs buffer reordering is called + * + * @param num_topo_level (IN) Number of the topological levels + */ +int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) +{ + int size; + int *topo; + + size = ompi_comm_size(comm); + + if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) { + if (han_module->cached_topo) { + free(han_module->cached_topo); + han_module->cached_topo = NULL; + } + + topo = (int *)malloc(sizeof(int) * size * num_topo_level); + + /* get topo infomation */ + mca_coll_han_topo_get(topo, comm, num_topo_level); + mca_coll_han_topo_print(topo, comm, num_topo_level); + + /* + * All the ranks now have the topo information + */ + + /* check if the processes are mapped by core */ + han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); + + /* + * If not, sort the topo such that each group of ids is sorted by rank + * i.e. ids for rank i are contiguous to ids for rank i+1. 
+ * This will be needed for the operations that are order sensitive + * (like gather) + */ + if (!han_module->is_mapbycore) { + mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level); + } + han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level); + han_module->cached_topo = topo; + han_module->cached_comm = comm; + } else { + topo = han_module->cached_topo; + } + + mca_coll_han_topo_print(topo, comm, num_topo_level); + return topo; +} + +static void mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); + + if (rank == 0) { + int i; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", rank)); + for (i=0; i Date: Thu, 14 May 2020 00:07:50 -0400 Subject: [PATCH 3/4] A complete overhaul of the HAN code. Among many other things: - Fix an imbalance bug in MPI_allgather - Accept more human readable configuration files. We can now specify the collective by name instead of a magic number, and the component we want to use also by name. - Add the capability to have optional arguments in the collective communication configuration file. Right now the capability exists for segment lengths, but is yet to be connected with the algorithms. - Redo the initialization of all HAN collectives. Cleanup the fallback collective support. - In case the module is unable to deliver the expected result, it will fallback executing the collective operation on another collective component. This change make the support for this fallback simpler to use. - Implement a fallback allowing a HAN module to remove itself as potential active collective module, and instead fallback to the next module in line. - Completely disable the HAN modules on error. From the moment an error is encountered they remove themselves from the communicator, and in case some other modules calls them simply behave as a pass-through. 
Communicator: provide ompi_comm_split_with_info to split and provide info at the same time Add ompi_comm_coll_preference info key to control collective component selection COLL HAN: use info keys instead of component-level variable to communicate topology level between abstraction layers - The info value is a comma-separated list of entries, which are chosen with decreasing priorities. This overrides the priority of the component, unless the component has disqualified itself. An entry prefixed with ^ starts the ignore-list. Any entry following this character will be ignored during the collective component selection for the communicator. Example: "sm,libnbc,^han,adapt" gives sm the highest preference, followed by libnbc. The components han and adapt are ignored in the selection process. - Allocate a temporary buffer for all lower-level leaders (length 2 segments) - Fix the handling of MPI_IN_PLACE for gather and scatter. COLL HAN: Fix topology handling - HAN should not rely on node names to determine the ordering of ranks. Instead, use the node leaders as identifiers and short-cut if the node-leaders agree that ranks are consecutive. Also, error out if the rank distribution is imbalanced for now.
Signed-off-by: Xi Luo Signed-off-by: Joseph Schuchart Signed-off-by: George Bosilca --- ompi/communicator/comm.c | 24 +- ompi/communicator/communicator.h | 15 + ompi/group/group.c | 28 + ompi/group/group.h | 8 + ompi/mca/coll/adapt/coll_adapt_ibcast.c | 4 +- ompi/mca/coll/base/coll_base_comm_select.c | 99 +- ompi/mca/coll/base/coll_base_util.c | 257 +++- ompi/mca/coll/base/coll_base_util.h | 15 +- ompi/mca/coll/han/Makefile.am | 3 +- ompi/mca/coll/han/coll_han.h | 429 +++---- ompi/mca/coll/han/coll_han_allgather.c | 274 ++-- ompi/mca/coll/han/coll_han_allreduce.c | 275 ++-- ompi/mca/coll/han/coll_han_bcast.c | 251 ++-- ompi/mca/coll/han/coll_han_component.c | 250 +--- ompi/mca/coll/han/coll_han_dynamic.c | 1125 +++++++---------- ompi/mca/coll/han/coll_han_dynamic.h | 36 +- ompi/mca/coll/han/coll_han_dynamic_file.c | 466 +++---- ompi/mca/coll/han/coll_han_dynamic_file.h | 13 +- ompi/mca/coll/han/coll_han_gather.c | 428 ++++--- ompi/mca/coll/han/coll_han_module.c | 174 +-- ompi/mca/coll/han/coll_han_reduce.c | 218 ++-- ompi/mca/coll/han/coll_han_scatter.c | 184 +-- ompi/mca/coll/han/coll_han_subcomms.c | 459 +++---- ompi/mca/coll/han/coll_han_topo.c | 428 ++----- ompi/mca/coll/han/coll_han_trigger.c | 19 +- ompi/mca/coll/han/coll_han_trigger.h | 31 +- ompi/mca/coll/han/coll_han_utils.c | 58 - ompi/mca/coll/sm/coll_sm_module.c | 2 +- .../coll/tuned/coll_tuned_decision_fixed.c | 3 +- ompi/mca/coll/tuned/coll_tuned_dynamic_file.c | 64 +- ompi/request/request.c | 2 +- 31 files changed, 2668 insertions(+), 2974 deletions(-) delete mode 100644 ompi/mca/coll/han/coll_han_utils.c diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 4c6a7a7b4f..649979746d 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, /**********************************************************************/ /**********************************************************************/ 
/**********************************************************************/ -/* -** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). -*/ -int ompi_comm_split( ompi_communicator_t* comm, int color, int key, - ompi_communicator_t **newcomm, bool pass_on_topo ) + +int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ) { int myinfo[2]; int size, my_size; @@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d", newcomp->c_contextid, comm->c_contextid ); - + /* Copy info if there is one */ + if (info) { + newcomp->super.s_info = OBJ_NEW(opal_info_t); + opal_info_dup(info, &(newcomp->super.s_info)); + } /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } +/* +** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). 
+*/ +int ompi_comm_split( ompi_communicator_t* comm, int color, int key, + ompi_communicator_t **newcomm, bool pass_on_topo ) +{ + return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo); +} + /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 8936b7f1df..01c0261488 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm, OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key, ompi_communicator_t** newcomm, bool pass_on_topo); +/** + * split a communicator based on color and key. Parameters + * are identical to the MPI-counterpart of the function. + * Similar to \see ompi_comm_split with an additional info parameter. + * + * @param comm: input communicator + * @param color + * @param key + * + * @ + */ +OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ); + /** * split a communicator based on type and key. Parameters * are identical to the MPI-counterpart of the function. diff --git a/ompi/group/group.c b/ompi/group/group.c index f5cc88be98..9e368c96da 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group) return false; } + +/** + * Count the number of processes on this group that share the same node as + * this process. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group) +{ + int local_peers = 0; + for (int i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + proc = ompi_group_get_proc_ptr_raw (group, i); + if (ompi_proc_is_sentinel (proc)) { + /* the proc must be stored in the group or cached in the proc + * hash table if the process resides in the local node + * (see ompi_proc_complete_init) */ + continue; + } +#endif + if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + local_peers++; + } + } + + return local_peers; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 661666246e..d1cf7d99ae 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t return ompi_group_get_proc_ptr (group, peer_id, false); } +/** + * Return true if all processes in the group are not on the local node. + */ bool ompi_group_have_remote_peers (ompi_group_t *group); +/** + * Count the number of processes on the local node. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index b22982c011..605d626230 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } @@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req) && num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index 405bd6b388..8c6023d411 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -38,6 +38,7 @@ #include "mpi.h" #include "ompi/communicator/communicator.h" #include "opal/util/output.h" +#include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/class/opal_list.h" #include "opal/class/opal_object.h" @@ -312,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a, return 0; } +static inline int +component_in_argv(char **argv, const char* component_name) +{ + if( NULL != argv ) { + while( NULL != *argv ) { + if( 0 == 
strcmp(component_name, *argv) ) { + return 1; + } + argv++; /* move to the next argument */ + } + } + return 0; +} + /* * For each module in the list, check and see if it wants to run, and * do the resulting priority comparison. Make a list of modules to be @@ -321,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a, static opal_list_t *check_components(opal_list_t * components, ompi_communicator_t * comm) { - int priority; + int priority, flag; const mca_base_component_t *component; mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; mca_coll_base_avail_coll_t *avail; + char info_val[OPAL_MAX_INFO_VAL+1]; + char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL; + /* Check if this communicator comes with restrictions on the collective modules + * it wants to use. The restrictions are consistent with the MCA parameter + * to limit the collective components loaded, but it applies for each + * communicator and is provided as an info key during the communicator + * creation. Unlike the MCA param, this info key is used not to select + * components but either to prevent components from being used or to + * force a change in the component priority. 
+ */ + if( NULL != comm->super.s_info) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_preference", + sizeof(info_val), info_val, &flag); + if( !flag ) { + goto proceed_to_select; + } + coll_argv = opal_argv_split(info_val, ','); + if(NULL == coll_argv) { + goto proceed_to_select; + } + int idx2, count_include = opal_argv_count(coll_argv); + /* Allocate the coll_include argv */ + coll_include = (char**)malloc((count_include + 1) * sizeof(char*)); + coll_include[count_include] = NULL; /* NULL terminated array */ + /* Dispatch the include/exclude in the corresponding arrays */ + for( int idx = 0; NULL != coll_argv[idx]; idx++ ) { + if( '^' == coll_argv[idx][0] ) { + coll_include[idx] = NULL; /* NULL terminated array */ + + /* Allocate the coll_exclude argv */ + coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*)); + /* save the exclude components */ + for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) { + coll_exclude[idx2 - idx] = coll_argv[idx2]; + } + coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */ + coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */ + count_include = idx; + break; + } + coll_include[idx] = coll_argv[idx]; + } + /* Reverse the order of the coll_inclide argv to faciliate the ordering of + * the selected components reverse. 
+ */ + for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) { + char* temp = coll_include[idx2]; + coll_include[idx2] = coll_include[count_include - 1]; + coll_include[count_include - 1] = temp; + count_include--; + } + } + proceed_to_select: /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -335,6 +403,13 @@ static opal_list_t *check_components(opal_list_t * components, OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) { component = cli->cli_component; + /* dont bother is we have this component in the exclusion list */ + if( component_in_argv(coll_exclude, component->mca_component_name) ) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: component disqualified: %s (due to communicator info key)", + component->mca_component_name ); + continue; + } priority = check_one_component(comm, component, &module); if (priority >= 0) { /* We have a component that indicated that it wants to run @@ -370,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components, /* Put this list in priority order */ opal_list_sort(selectable, avail_coll_compare); + /* For all valid component reorder them not on their provided priorities but on + * the order requested in the info key. As at this point the coll_include is + * already ordered backward we can simply prepend the components. 
+ */ + mca_coll_base_avail_coll_t *item, *item_next; + OPAL_LIST_FOREACH_SAFE(item, item_next, + selectable, mca_coll_base_avail_coll_t) { + if( component_in_argv(coll_include, item->ac_component_name) ) { + opal_list_remove_item(selectable, &item->super); + opal_list_prepend(selectable, &item->super); + } + } + + opal_argv_free(coll_argv); + if( NULL != coll_exclude ) { + free(coll_exclude); + } + if( NULL != coll_include ) { + free(coll_include); + } + /* All done */ return selectable; } @@ -403,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm, return priority; } - /************************************************************************** * Query functions **************************************************************************/ diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 29b4a70cac..99c8b516a2 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -29,6 +29,8 @@ #include "ompi/mca/topo/base/base.h" #include "ompi/mca/pml/pml.h" #include "coll_base_util.h" +#include "coll_base_functions.h" +#include <ctype.h> int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, @@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req, } else { scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm); } - + for (int i=0; icb.req_complete_cb = NULL; req->req_complete_cb_data = NULL; req->data.objs.objs[0] = NULL; @@ -309,35 +312,249 @@ OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, N /* File reading functions */ static void skiptonewline (FILE *fptr, int *fileline) { - do { - char val; - int rc; + char val; + int rc; + do { rc = fread(&val, 1, 1, fptr); - if (0 == rc) return; - if ((1 == rc)&&('\n' == val)) { + if (0 == rc) { + return; + } + if ('\n' == val) { (*fileline)++; return; - } + } } while (1); } -long ompi_coll_base_file_getnext (FILE *fptr, int *fileline) +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val) /* 0: value stored in *val, -1: end of file */ { - do { - long val; - int rc; - char trash; + char trash; + int rc; - rc = fscanf(fptr, "%li", &val); - if (rc == EOF) return MYEOF; - if (1 == rc) return val; - /* in all other cases, skip to the end */ + do { + rc = fscanf(fptr, "%li", val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ rc = fread(&trash, sizeof(char), 1, fptr); - if (rc == EOF) return MYEOF; + if (0 == rc) { /* fread reports end-of-file/error as 0 items read, never as EOF */ + return -1; + } if ('\n' == trash) (*fileline)++; if ('#' == trash) { skiptonewline (fptr, fileline); - } + } } while (1); } + +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val) /* 0: *val is a malloc'ed token (caller frees), -1: end of file or allocation failure */ +{ + char trash, token[32]; + int rc; + + *val = NULL; /* security in case we fail */ + do { + rc = fscanf(fptr, "%31s", token); /* the width excludes the NUL: 31 chars + terminator fit in token[32] */ + if 
(rc == EOF) { + return -1; + } + if (1 == rc) { + if( '#' == token[0] ) { + skiptonewline(fptr, fileline); + continue; + } + *val = (char*)malloc(strlen(token) + 1); + if( NULL != *val ) strcpy(*val, token); /* never copy into a failed allocation */ + return (NULL != *val) ? 0 : -1; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) { /* fread reports end-of-file/error as 0 items read, never as EOF */ + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val) /* 0: value stored in *val, -1: end of file */ +{ + char trash; + int rc; + + do { + rc = fscanf(fptr, "%" PRIsize_t, val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) { /* fread reports end-of-file/error as 0 items read, never as EOF */ + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected) /* 1: matched and consumed, 0: no match (char pushed back), -1: end of file or seek failure */ +{ + char trash; + int rc; + + do { + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) { /* hit the end of the file */ + return -1; + } + if ('\n' == trash) { + (*fileline)++; + continue; + } + if ('#' == trash) { + skiptonewline (fptr, fileline); + continue; + } + if( trash == expected ) + return 1; /* return true and eat the char */ + if( isblank((unsigned char)trash) ) /* skip all spaces if that's not what we were looking for; ctype functions need an unsigned char value */ + continue; + if( 0 != fseek(fptr, -1, SEEK_CUR) ) + return -1; + return 0; + } while (1); +} + +/** + * There are certainly simpler implementations of this function when performance + * is not a critical point. But, as this function is used during the collective + * configuration, and we can do this configuration once for each communicator, + * I would rather have a more complex but faster implementation. + * The approach here is to search for the largest common denominators, to create + * something similar to a dichotomic search. 
+ */ +int mca_coll_base_name_to_colltype(const char* name) /* returns the COLLTYPE_T value matching name exactly, or -1 when the name is unknown */ +{ + if( 'n' == name[0] ) { + if( 0 == strncmp(name, "neighbor_all", 12) ) { + if( 't' != name[12] ) { + if( 0 == strncmp(name+12, "gather", 6) ) { + if('\0' == name[18]) return NEIGHBOR_ALLGATHER; + if( 0 == strcmp(name+18, "v") ) return NEIGHBOR_ALLGATHERV; /* exact suffix: reject trailing garbage after the 'v' */ + } + } else { + if( 0 == strncmp(name+12, "toall", 5) ) { + if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL; + if( 0 == strcmp(name+17, "v") ) return NEIGHBOR_ALLTOALLV; + if( 0 == strcmp(name+17, "w") ) return NEIGHBOR_ALLTOALLW; + } + } + } + return -1; + } + if( 'a' == name[0] ) { + if( 0 != strncmp(name, "all", 3) ) { + return -1; + } + if( 't' != name[3] ) { + if( 'r' == name[3] ) { + if( 0 == strcmp(name+3, "reduce") ) + return ALLREDUCE; + } else { + if( 0 == strncmp(name+3, "gather", 6) ) { + if( '\0' == name[9] ) return ALLGATHER; + if( 0 == strcmp(name+9, "v") ) return ALLGATHERV; /* "allgathervX" is not a collective name */ + } + } + } else { + if( 0 == strncmp(name+3, "toall", 5) ) { + if( '\0' == name[8] ) return ALLTOALL; + if( 0 == strcmp(name+8, "v") ) return ALLTOALLV; + if( 0 == strcmp(name+8, "w") ) return ALLTOALLW; + } + } + return -1; + } + if( 'r' > name[0] ) { + if( 'b' == name[0] ) { + if( 0 == strcmp(name, "barrier") ) + return BARRIER; + if( 0 == strcmp(name, "bcast") ) + return BCAST; + } else if( 'g'== name[0] ) { + if( 0 == strncmp(name, "gather", 6) ) { + if( '\0' == name[6] ) return GATHER; + if( 0 == strcmp(name+6, "v") ) return GATHERV; + } + } + if( 0 == strcmp(name, "exscan") ) + return EXSCAN; + return -1; + } + if( 's' > name[0] ) { + if( 0 == strncmp(name, "reduce", 6) ) { + if( '\0' == name[6] ) return REDUCE; + if( '_' == name[6] ) { + if( 0 == strncmp(name+7, "scatter", 7) ) { + if( '\0' == name[14] ) return REDUCESCATTER; + if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK; + } + } + } + return -1; + } + if( 0 == strcmp(name, "scan") ) + return SCAN; + if( 0 == strcmp(name, "scatterv") ) + return SCATTERV; + if( 0 == strcmp(name, "scatter") ) + return SCATTER; + return -1; +} + +/* conversion table for all 
COLLTYPE_T values defined in ompi/mca/coll/base/coll_base_functions.h */ +static const char* colltype_translation_table[] = { + [ALLGATHER] = "allgather", + [ALLGATHERV] = "allgatherv", + [ALLREDUCE] = "allreduce", + [ALLTOALL] = "alltoall", + [ALLTOALLV] = "alltoallv", + [ALLTOALLW] = "alltoallw", + [BARRIER] = "barrier", + [BCAST] = "bcast", + [EXSCAN] = "exscan", + [GATHER] = "gather", + [GATHERV] = "gatherv", + [REDUCE] = "reduce", + [REDUCESCATTER] = "reduce_scatter", + [REDUCESCATTERBLOCK] = "reduce_scatter_block", + [SCAN] = "scan", + [SCATTER] = "scatter", + [SCATTERV] = "scatterv", + [NEIGHBOR_ALLGATHER] = "neighbor_allgather", + [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv", + [NEIGHBOR_ALLTOALL] = "neighbor_alltoall", + [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv", + [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw", + [COLLCOUNT] = NULL +}; + +char* mca_coll_base_colltype_to_str(int collid) /* returns a strdup'ed name (caller frees) or NULL for an unknown id */ +{ + if( (collid < 0) || (collid >= COLLCOUNT) || (NULL == colltype_translation_table[collid]) ) { /* a designated-initializer table leaves unlisted slots NULL; never strdup(NULL) */ + return NULL; + } + return strdup(colltype_translation_table[collid]); +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 239322b022..ee649fa63f 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -178,8 +178,17 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const rtypes[]); /* File reading function */ -#define MYEOF -999 -long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val); +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val); +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); +/* peek at the next valid token to see if it begins with the expected value. If yes + * eat the value, otherwise put it back into the file. + */ +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); + +/* Miscelaneous function */ +char* mca_coll_base_colltype_to_str(int collid); +int mca_coll_base_name_to_colltype(const char* name); END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index 55892512e3..61b40d97c5 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -26,8 +26,7 @@ coll_han_trigger.c \ coll_han_dynamic.c \ coll_han_dynamic_file.c \ coll_han_topo.c \ -coll_han_subcomms.c \ -coll_han_utils.c +coll_han_subcomms.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 1af75ffec3..16efcbe8e5 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -20,9 +20,7 @@ #include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" -#include "ompi/mca/coll/han/coll_han_dynamic.h" - -BEGIN_C_DECLS +#include "ompi/mca/coll/han/coll_han_dynamic.h" /* * Today; @@ -33,131 +31,125 @@ BEGIN_C_DECLS #define COLL_HAN_LOW_MODULES 2 #define COLL_HAN_UP_MODULES 2 -typedef struct { - uint32_t umod; - uint32_t lmod; 
- uint32_t fs; - uint32_t ualg; - uint32_t us; -} selection; - -struct mca_bcast_argu_s { +struct mca_coll_han_bcast_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *buff; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; }; -typedef struct mca_bcast_argu_s mca_bcast_argu_t; +typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t; -struct mca_reduce_argu_s { +struct mca_coll_han_reduce_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *sbuf; void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; + bool is_tmp_rbuf; }; -typedef struct mca_reduce_argu_s mca_reduce_argu_t; +typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t; -struct mca_allreduce_argu_s { +struct mca_coll_han_allreduce_args_s { mca_coll_task_t *cur_task; - void *sbuf; - void *rbuf; - int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; - int root_up_rank; - int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; - int num_segments; - int cur_seg; - int w_rank; - int last_seg_count; - bool noop; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; ompi_request_t *req; + void *sbuf; + void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; + int seg_count; + int root_up_rank; + int root_low_rank; + int num_segments; + int cur_seg; + int w_rank; + int last_seg_count; + bool noop; int *completed; }; -typedef struct 
mca_allreduce_argu_s mca_allreduce_argu_t; +typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t; -struct mca_scatter_argu_s { +struct mca_coll_han_scatter_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; void *sbuf_reorder_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; }; -typedef struct mca_scatter_argu_s mca_scatter_argu_t; +typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t; -struct mca_gather_argu_s { +struct mca_coll_han_gather_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; + bool is_mapbycore; }; -typedef struct mca_gather_argu_s mca_gather_argu_t; +typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t; -struct mca_allgather_argu_s { +struct mca_coll_han_allgather_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root_low_rank; - struct ompi_communicator_t *up_comm; - 
struct ompi_communicator_t *low_comm; int w_rank; bool noop; bool is_mapbycore; int *topo; - ompi_request_t *req; }; -typedef struct mca_allgather_argu_s mca_allgather_argu_t; +typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t; /** * Structure to hold the han coll component. First it holds the @@ -184,7 +176,7 @@ typedef struct mca_coll_han_component_t { /* up level module for reduce */ uint32_t han_reduce_up_module; /* low level module for reduce */ - uint32_t han_reduce_low_module; + uint32_t han_reduce_low_module; /* segment size for allreduce */ uint32_t han_allreduce_segsize; /* up level module for allreduce */ @@ -203,21 +195,10 @@ typedef struct mca_coll_han_component_t { uint32_t han_scatter_up_module; /* low level module for scatter */ uint32_t han_scatter_low_module; - /* whether enable auto tune */ - uint32_t han_auto_tune; /* whether we need reproducible results * (but disables topological optimisations) */ uint32_t han_reproducible; - /* create a 3D array - * num_processes (n): 2 4 8 16 32 64 (6) - * num_core (c): 2 4 8 12 (4) - * message size (m): 1 - 4194304 (23) - */ - uint32_t han_auto_tune_n; - uint32_t han_auto_tune_c; - uint32_t han_auto_tune_m; - selection *han_auto_tuned; bool use_simple_algorithm[COLLCOUNT]; /* Dynamic configuration rules */ @@ -228,7 +209,6 @@ typedef struct mca_coll_han_component_t { mca_coll_han_dynamic_rules_t dynamic_rules; /* Dynamic rules from mca parameter */ COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; - int topo_level; /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ int max_dynamic_errors; @@ -240,7 +220,7 @@ typedef void (*previous_dummy_fn_t) (void); * Structure used to store what is necessary for the collective operations * routines in case of fallback. 
*/ -typedef struct collective_fallback_t { +typedef struct mca_coll_han_single_collective_fallback_s { union { mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_allgatherv_fn_t allgatherv; @@ -250,9 +230,24 @@ typedef struct collective_fallback_t { mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_scatter_fn_t scatter; previous_dummy_fn_t dummy; - } previous_routine; - mca_coll_base_module_t *previous_module; -} collective_fallback_t; + }; + mca_coll_base_module_t* module; +} mca_coll_han_single_collective_fallback_t; + +/* + * The structure containing a replacement for all collective supported + * by HAN. This structure is used as a fallback during subcommunicator + * creation. + */ +typedef struct mca_coll_han_collectives_fallback_s { + mca_coll_han_single_collective_fallback_t allgather; + mca_coll_han_single_collective_fallback_t allgatherv; + mca_coll_han_single_collective_fallback_t allreduce; + mca_coll_han_single_collective_fallback_t bcast; + mca_coll_han_single_collective_fallback_t reduce; + mca_coll_han_single_collective_fallback_t gather; + mca_coll_han_single_collective_fallback_t scatter; +} mca_coll_han_collectives_fallback_t; /** Coll han module */ typedef struct mca_coll_han_module_t { @@ -262,7 +257,6 @@ typedef struct mca_coll_han_module_t { /* Whether this module has been lazily initialized or not yet */ bool enabled; - struct ompi_communicator_t *cached_comm; struct ompi_communicator_t **cached_low_comms; struct ompi_communicator_t **cached_up_comms; int *cached_vranks; @@ -271,7 +265,7 @@ typedef struct mca_coll_han_module_t { bool are_ppn_imbalanced; /* To be able to fallback when the cases are not supported */ - struct collective_fallback_t previous_routines[COLLCOUNT]; + struct mca_coll_han_collectives_fallback_s fallback; /* To be able to fallback on reproducible algorithm */ mca_coll_base_module_reduce_fn_t reproducible_reduce; @@ -280,7 +274,7 @@ typedef struct mca_coll_han_module_t { mca_coll_base_module_t 
*reproducible_allreduce_module; /* Topological level of this communicator */ - int topologic_level; + TOPO_LVL_T topologic_level; /* Collective module storage for module choice */ mca_coll_han_collective_modules_storage_t modules_storage; @@ -302,21 +296,53 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t); * Some defines to stick to the naming used in the other components in terms of * fallback routines */ -#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather -#define previous_allgatherv previous_routines[ALLGATHERV].previous_routine.allgatherv -#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce -#define previous_bcast previous_routines[BCAST].previous_routine.bcast -#define previous_gather previous_routines[GATHER].previous_routine.gather -#define previous_reduce previous_routines[REDUCE].previous_routine.reduce -#define previous_scatter previous_routines[SCATTER].previous_routine.scatter +#define previous_allgather fallback.allgather.allgather +#define previous_allgather_module fallback.allgather.module + +#define previous_allgatherv fallback.allgatherv.allgatherv +#define previous_allgatherv_module fallback.allgatherv.module + +#define previous_allreduce fallback.allreduce.allreduce +#define previous_allreduce_module fallback.allreduce.module + +#define previous_bcast fallback.bcast.bcast +#define previous_bcast_module fallback.bcast.module + +#define previous_reduce fallback.reduce.reduce +#define previous_reduce_module fallback.reduce.module + +#define previous_gather fallback.gather.gather +#define previous_gather_module fallback.gather.module + +#define previous_scatter fallback.scatter.scatter +#define previous_scatter_module fallback.scatter.module + + +/* macro to correctly load a fallback collective module */ +#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \ + do { \ + if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \ + (COMM)->c_coll->coll_ ## 
COLL = (HANM)->fallback.COLL.COLL; \ + mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \ + OBJ_RELEASE(coll_module); \ + } \ + } while(0) + +/* macro to correctly load /all/ fallback collectives */ +#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM) \ + do { \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, bcast); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv); \ + (HANM)->enabled = false; /* entire module set to pass-through from now on; use the macro argument, not a caller-scope variable name */ \ + } while(0) -#define previous_allgather_module previous_routines[ALLGATHER].previous_module -#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module -#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module -#define previous_bcast_module previous_routines[BCAST].previous_module -#define previous_gather_module previous_routines[GATHER].previous_module -#define previous_reduce_module previous_routines[REDUCE].previous_module -#define previous_scatter_module previous_routines[SCATTER].previous_module /** * Global component instance @@ -333,20 +359,30 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm int han_request_free(ompi_request_t ** request); /* Subcommunicator creation */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); -/* Gather topology information */ +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * 
han_module); +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); + +/** + * Gather topology information + * + * Returns a pointer to the (potentially already cached) topology. + * NOTE: if the rank distribution is imbalanced, no effort will be made to gather + * the topology at all ranks and instead NULL is returned and han_module->is_mapbycore + * is set to false. + * If HAN ever learns to deal with imbalanced topologies, this needs fixing! + */ int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); /* Utils */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank); -uint32_t han_auto_tuned_get_n(uint32_t n); -uint32_t han_auto_tuned_get_c(uint32_t c); -uint32_t han_auto_tuned_get_m(uint32_t m); +static inline void +mca_coll_han_get_ranks(int *vranks, int root, int low_size, + int *root_low_rank, int *root_up_rank) +{ + *root_up_rank = vranks[root] / low_size; + *root_low_rank = vranks[root] % low_size; +} -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll); const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); /** Dynamic component choice */ @@ -356,7 +392,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); */ int mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module); + mca_coll_han_module_t *han_module); int mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, @@ -382,22 +418,13 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, /* Bcast */ int mca_coll_han_bcast_intra_simple(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int root_up_rank, int root_low_rank, - struct 
ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_bcast_t0_task(void *task_argu); -int mca_coll_han_bcast_t1_task(void *task_argu); /* Reduce */ int @@ -422,145 +449,75 @@ mca_coll_han_reduce_reproducible(const void *sbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - - -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, int seg_count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); - -int mca_coll_han_reduce_intra(const void *sbuf, +int mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_reduce_t0_task(void *task_argu); -int mca_coll_han_reduce_t1_task(void *task_argu); - /* Allreduce */ int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, mca_coll_base_module_t *module); int 
mca_coll_han_allreduce_reproducible(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t * req, int *completed); int mca_coll_han_allreduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_allreduce_t0_task(void *task_argu); -int mca_coll_han_allreduce_t1_task(void *task_argu); -int mca_coll_han_allreduce_t2_task(void *task_argu); -int mca_coll_han_allreduce_t3_task(void *task_argu); /* Scatter */ int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_scatter_us_task(void *task_argu); -int mca_coll_han_scatter_ls_task(void *task_argu); -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * 
req); - -/* Gather */ -int -mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_gather_lg_task(void *task_argu); -int mca_coll_han_gather_ug_task(void *task_argu); -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); /* reordering after gather, for unordered ranks */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo); + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); @@ -571,30 +528,12 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int 
mca_coll_han_allgather_lg_task(void *task_argu); -int mca_coll_han_allgather_uag_task(void *task_argu); -int mca_coll_han_allgather_lb_task(void *task_argu); -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -END_C_DECLS #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index 50702d28ff..cc7dfaff26 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -16,40 +16,45 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, - bool is_mapbycore, - int *topo, - ompi_request_t * req) +static int mca_coll_han_allgather_lb_task(void *task_args); +static int mca_coll_han_allgather_lg_task(void *task_args); +static int mca_coll_han_allgather_uag_task(void *task_args); + +static inline void 
+mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->is_mapbycore = is_mapbycore; - argu->topo = topo; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->topo = topo; + args->req = req; } int @@ -60,44 +65,52 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); + int w_rank = ompi_comm_rank(comm); + + /* Init topo */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator (imbalance). Fall back on another component\n")); + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; - - /* Init topo */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank = 0; /* Create lg (lower level gather) task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_allgather_argu_t *lg_argu = malloc(sizeof(mca_allgather_argu_t)); - mac_coll_han_set_allgather_argu(lg_argu, lg, (char *) 
sbuf, NULL, scount, sdtype, rbuf, rcount, + mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t)); + mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, rdtype, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, topo, temp_request); - /* Init lg task */ - init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_argu)); - /* Issure lg task */ + /* Init and issue lg task */ + init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args)); issue_task(lg); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); @@ -105,48 +118,70 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, return OMPI_SUCCESS; } -/* lg: lower level (shared memory) gather task */ -int mca_coll_han_allgather_lg_task(void *task_argu) +/* lg: lower level gather task */ +int mca_coll_han_allgather_lg_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + char *tmp_buf = NULL, *tmp_rbuf = NULL; + char *tmp_send = NULL; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", t->w_rank)); - OBJ_RELEASE(t->cur_task); /* If the process is one of the node leader */ - char *tmp_buf = NULL; - char *tmp_rbuf = NULL; + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (t->rdtype, &rlb, &rext); + if (MPI_IN_PLACE == t->sbuf) { + t->sdtype = t->rdtype; + t->scount = t->rcount; + } if (!t->noop) { int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (MPI_IN_PLACE == t->sbuf) { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send); + } } - /* Shared memory gather */ - 
t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, - t->rdtype, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_gather_module); + /* Lower level (shared memory or intra-node) gather */ + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype, + tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype, + NULL, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + } + else { + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + } + t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create uag (upper level all-gather) task */ - mca_coll_task_t *uag = OBJ_NEW(mca_coll_task_t); - /* Setup uag task arguments */ - t->cur_task = uag; - /* Init uag task */ + mca_coll_task_t *uag = t->cur_task; + /* Init and issue uag task */ init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); - /* Issure uag task */ issue_task(uag); return OMPI_SUCCESS; } /* uag: upper level (inter-node) all-gather task */ -int mca_coll_han_allgather_uag_task(void *task_argu) +int mca_coll_han_allgather_uag_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, @@ -213,21 +248,18 @@ int mca_coll_han_allgather_uag_task(void *task_argu) /* Create lb (low level broadcast) task */ - mca_coll_task_t *lb = OBJ_NEW(mca_coll_task_t); - /* Setup lb task arguments */ - t->cur_task = lb; - /* Init lb 
task */ + mca_coll_task_t *lb = t->cur_task; + /* Init and issue lb task */ init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); - /* Issure lb task */ issue_task(lb); return OMPI_SUCCESS; } -/* lb: low level (shared-memory) broadcast task */ -int mca_coll_han_allgather_lb_task(void *task_argu) +/* lb: low level broadcast task */ +int mca_coll_han_allgather_lb_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n", t->w_rank)); OBJ_RELEASE(t->cur_task); @@ -246,30 +278,41 @@ int mca_coll_han_allgather_lb_task(void *task_argu) int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module){ + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } /* discovery topology */ int *topo = mca_coll_han_topo_init(comm, han_module, 2); /* unbalanced case needs algo adaptation */ - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allgather with this communicator. It need to fall back on another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, - comm, han_module->previous_allgather_module); + "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); } + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + int w_rank = ompi_comm_rank(comm); /* setup up/low coordinates */ int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -279,27 +322,54 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, /* allocate the intermediary buffer * to gather on leaders on the low sub communicator */ + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (rdtype, &rlb, &rext); char *tmp_buf = NULL; char *tmp_buf_start = NULL; + char *tmp_send = NULL; + if (MPI_IN_PLACE == sbuf) { + scount = rcount; + sdtype = rdtype; + } if (low_rank == root_low_rank) { ptrdiff_t rsize, rgap = 0; /* Compute the size to receive all the local data, including datatypes empty gaps */ rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); - // intermediary buffer on node leaders to gather on low comm + /* intermediary buffer on node leaders to gather on low comm */ tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; + if (MPI_IN_PLACE == sbuf) { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send); + } } /* 1. 
low gather on node leaders into tmp_buf */ - low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, - tmp_buf_start, rcount, rdtype, root_low_rank, - low_comm, low_comm->c_coll->coll_gather_module); + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype, + NULL, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + } + else { + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } /* 2. allgather between node leaders, from tmp_buf to reorder_buf */ if (low_rank == root_low_rank) { /* allocate buffer to store unordered result on node leaders - * * if the processes are mapped-by core, no need to reorder: - * * distribution of ranks on core first and node next, - * * in a increasing order for both patterns */ + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns. 
+ */ char *reorder_buf = NULL; char *reorder_buf_start = NULL; if (han_module->is_mapbycore) { @@ -307,7 +377,7 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, } else { if (0 == low_rank && 0 == up_rank) { // first rank displays message OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Allgather needs reordering: ", w_rank)); + "[%d]: Future Allgather needs reordering: ", up_rank)); } ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); @@ -332,8 +402,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, */ if (!han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); reorder_buf = NULL; } @@ -347,4 +417,4 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, return OMPI_SUCCESS; - } +} diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 6a4fd6038f..afa0e0a220 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -17,46 +17,52 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_allreduce_t0_task(void *task_args); +static int mca_coll_han_allreduce_t1_task(void *task_args); +static int mca_coll_han_allreduce_t2_task(void *task_args); +static int mca_coll_han_allreduce_t3_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t * 
req, int *completed) +static inline void +mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; - argu->req = req; - argu->completed = completed; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->req = req; + args->completed = completed; } -/* - * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: +/* + * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: * lr: lower level (shared-memory or intra-node) reduce, * ur: upper level (inter-node) reduce, * ub: upper level (inter-node) bcast, @@ -80,72 +86,40 @@ mca_coll_han_allreduce_intra(const void *sbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - // Fallback to another component if the op cannot commute mca_coll_han_module_t 
*han_module = (mca_coll_han_module_t *)module; - if (! ompi_op_is_commute(op)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allreduce with this communicator." - "It need to fall back on another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, - comm, han_module->previous_allreduce_module); - } - - ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); + goto prev_allreduce_intra; + } /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_allreduce_module); + } + + ptrdiff_t extent, lb; + size_t dtype_size; + ompi_datatype_get_extent(dtype, &lb, &extent); + int seg_count = count, w_rank; + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m + - mca_coll_han_component.han_auto_tune_n * mca_coll_han_component.han_auto_tune_c * - mca_coll_han_component.han_auto_tune_m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; - uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) 
(up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ - } else { - low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, typelng, - seg_count); - } + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, dtype_size, + seg_count); /* Determine number of elements sent per task. 
*/ OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, @@ -161,8 +135,8 @@ mca_coll_han_allreduce_intra(const void *sbuf, /* Setup up t0 task arguments */ int *completed = (int *) malloc(sizeof(int)); completed[0] = 0; - mca_allreduce_argu_t *t = malloc(sizeof(mca_allreduce_argu_t)); - mac_coll_han_set_allreduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, + mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t)); + mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank, NULL, completed); @@ -208,35 +182,51 @@ mca_coll_han_allreduce_intra(const void *sbuf, init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); issue_task(t3); } - if (t->completed != NULL) { - free(t->completed); - t->completed = NULL; - } + free(t->completed); + t->completed = NULL; free(t); return OMPI_SUCCESS; + + prev_allreduce_intra: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* t0 task */ -int mca_coll_han_allreduce_t0_task(void *task_argu) +int mca_coll_han_allreduce_t0_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, - t->op, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_reduce_module); + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, 
t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->rbuf, NULL, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } return OMPI_SUCCESS; } /* t1 task */ -int mca_coll_han_allreduce_t1_task(void *task_argu) +int mca_coll_han_allreduce_t1_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -270,16 +260,16 @@ int mca_coll_han_allreduce_t1_task(void *task_argu) } if (!t->noop) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; } /* t2 task */ -int mca_coll_han_allreduce_t2_task(void *task_argu) +int mca_coll_han_allreduce_t2_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -336,9 +326,9 @@ int mca_coll_han_allreduce_t2_task(void *task_argu) } /* t3 task */ -int mca_coll_han_allreduce_t3_task(void *task_argu) +int mca_coll_han_allreduce_t3_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t3 %d r_buf %d\n", 
t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -408,12 +398,12 @@ int mca_coll_han_allreduce_t3_task(void *task_argu) int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -428,22 +418,43 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, // Fallback to another component if the op cannot commute if (! ompi_op_is_commute(op)) { - OPAL_OUTPUT_VERBOSE((30, cs->han_output, - "han cannot handle allreduce with this operation." - "It need to fall back on another component\n")); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); goto prev_allreduce; } - mca_coll_han_comm_create_new(comm, han_module); + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_allreduce_module); + } low_comm = han_module->sub_comm[INTRA_NODE]; up_comm = han_module->sub_comm[INTER_NODE]; low_rank = ompi_comm_rank(low_comm); /* Low_comm reduce */ - ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf, count, dtype, op, root_low_rank, low_comm, low_comm->c_coll->coll_reduce_module); + } + else { + ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + } + else { + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((30, cs->han_output, "HAN/ALLREDUCE: low comm reduce failed. 
" @@ -480,9 +491,9 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, return OMPI_SUCCESS; -prev_allreduce: - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, - han_module->previous_allreduce_module); + prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* Find a fallback on reproducible algorithm @@ -504,15 +515,14 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { if (0 == w_rank) { opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:allreduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_allreduce_module = fallback_module; han_module->reproducible_allreduce = fallback_module->coll_allreduce; @@ -525,8 +535,7 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, "coll:han:allreduce_reproducible_decision: " "no reproducible fallback\n"); } - han_module->reproducible_allreduce_module = - han_module->previous_allreduce_module; + han_module->reproducible_allreduce_module = han_module->previous_allreduce_module; han_module->reproducible_allreduce = han_module->previous_allreduce; return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 6eebc3b7d3..c32ea745b0 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -16,31 +16,35 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int 
root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_bcast_t0_task(void *task_args); +static int mca_coll_han_bcast_t1_task(void *task_args); + +static inline void +mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) { - argu->cur_task = cur_task; - argu->buff = buff; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->buff = buff; + args->seg_count = seg_count; + args->dtype = dtype; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: * ub: upper level (inter-node) bcast * lb: low level (shared-memory or intra-node) bcast. * Hence, in each iteration, there is a combination of collective operations which is called a task. 
@@ -58,82 +62,57 @@ mca_coll_han_bcast_intra(void *buff, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int err, seg_count = count, w_rank = ompi_comm_rank(comm); + ompi_communicator_t *low_comm, *up_comm; + ptrdiff_t extent, lb; + size_t dtype_size; + /* Create the subcommunicators */ + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); } - ompi_datatype_type_size(dtype, &typelng); + ompi_datatype_get_extent(dtype, &lb, &extent); + ompi_datatype_type_size(dtype, &dtype_size); - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm; - ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; - uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ - - } else { - /* If auto tune is disabled, use MCA parameters */ - low_comm = 
han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, typelng, - seg_count); - } + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size, + seg_count); int num_segments = (count + seg_count - 1) / seg_count; OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, - "In HAN seg_count %d count %d num_seg %d\n", + "In HAN seg_count %d count %d num_seg %d\n", seg_count, count, num_segments)); int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; + int root_low_rank, root_up_rank; mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, @@ -142,8 +121,8 @@ mca_coll_han_bcast_intra(void *buff, /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_bcast_argu_t *t = malloc(sizeof(mca_bcast_argu_t)); - mac_coll_han_set_bcast_argu(t, t0, (char *) buff, seg_count, dtype, + mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t)); + mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank); @@ -161,9 +140,7 @@ mca_coll_han_bcast_intra(void *buff, while (t->cur_seg <= t->num_segments - 2) { /* Create t1 task */ - mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); - /* Setup up t1 
task arguments */ - t->cur_task = t1; + t->cur_task = t1 = OBJ_NEW(mca_coll_task_t); t->buff = (char *) t->buff + extent * seg_count; t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ @@ -177,43 +154,40 @@ mca_coll_han_bcast_intra(void *buff, } /* t0 task: issue and wait for the upper level ibcast of segment 0 */ -int mca_coll_han_bcast_t0_task(void *task_argu) +int mca_coll_han_bcast_t0_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); if (t->noop) { return OMPI_SUCCESS; - } else { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req; - t->up_comm->c_coll->coll_ibcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, - t->up_comm, &ibcast_req, t->up_comm->c_coll->coll_ibcast_module); - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); - return OMPI_SUCCESS; } + t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_bcast_module); + return OMPI_SUCCESS; } -/* t1 task: +/* t1 task: * 1. issue the upper level ibcast of segment cur_seg + 1 * 2. issue the low level bcast of segment cur_seg * 3. 
wait for the completion of the ibcast */ -int mca_coll_han_bcast_t1_task(void *task_argu) +int mca_coll_han_bcast_t1_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + ompi_request_t *ibcast_req = NULL; + int tmp_count = t->seg_count; + ptrdiff_t extent, lb; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); - ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { if (t->cur_seg <= t->num_segments - 2 ) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + if (t->cur_seg == t->num_segments - 2) { tmp_count = t->last_seg_count; } t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, @@ -223,12 +197,14 @@ int mca_coll_han_bcast_t1_task(void *task_argu) } } + /* are we the last segment to be pushed downstream ? */ + tmp_count = (t->cur_seg == (t->num_segments - 1)) ? 
t->last_seg_count : t->seg_count; t->low_comm->c_coll->coll_bcast((char *) t->buff, - t->seg_count, t->dtype, t->root_low_rank, t->low_comm, + tmp_count, t->dtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_bcast_module); - if (!t->noop && ibcast_req != NULL) { - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); + if (NULL != ibcast_req) { + ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -242,51 +218,64 @@ mca_coll_han_bcast_intra_simple(void *buff, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + ompi_communicator_t *low_comm, *up_comm; + int err, w_rank = ompi_comm_rank(comm); + + /* Create the subcommunicators */ + err = mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator (imbalance). 
Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } + + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; - - /* Topo must be initialized to know rank distribution which then is used to - * determine if han can be used */ - mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); - } else { - OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, - "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); - } + int root_low_rank, root_up_rank; mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: root_low_rank %d root_up_rank %d\n", + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); if (low_rank == root_low_rank) { - up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, + up_comm, up_comm->c_coll->coll_bcast_module); /* To remove when han has better sub-module selection. For now switching to ibcast enables to make runs with libnbc. 
*/ //ompi_request_t req; - //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, + // up_comm, &req, up_comm->c_coll->coll_ibcast_module); //ompi_request_wait(&req, MPI_STATUS_IGNORE); } - low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, + low_comm, low_comm->c_coll->coll_bcast_module); return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index cfb40c7da0..ef55a6ac99 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -25,13 +25,24 @@ #include "coll_han.h" #include "coll_han_dynamic.h" #include "coll_han_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_util.h" /* * Public string showing the coll ompi_han component version number */ const char *mca_coll_han_component_version_string = - "Open MPI han collective MCA component version " OMPI_VERSION; + "Open MPI HAN collective MCA component version " OMPI_VERSION; +ompi_coll_han_components available_components[COMPONENTS_COUNT] = { + { SELF, "self", NULL }, + { BASIC, "basic", NULL }, + { LIBNBC, "libnbc", NULL }, + { TUNED, "tuned", NULL }, + { SM, "sm", NULL }, + { SHARED, "shared", NULL }, + { ADAPT, "adapt", NULL }, + { HAN, "han", NULL } +}; /* * Local functions @@ -46,35 +57,33 @@ static int han_register(void); */ mca_coll_han_component_t mca_coll_han_component = { - /* First, fill in the super */ - { - /* First, the mca_component_t struct containing meta - information about the component itself */ + /* First, the mca_component_t struct containing meta + information about the component itself */ - .collm_version = { - MCA_COLL_BASE_VERSION_2_0_0, + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, - /* Component name and version */ - 
.mca_component_name = "han", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), + /* Component name and version */ + .mca_component_name = "han", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), - /* Component functions */ - .mca_open_component = han_open, - .mca_close_component = han_close, - .mca_register_component_params = han_register, - }, - .collm_data = { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE}, + /* Component functions */ + .mca_open_component = han_open, + .mca_close_component = han_close, + .mca_register_component_params = han_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE}, - /* Initialization / querying functions */ + /* Initialization / querying functions */ - .collm_init_query = mca_coll_han_init_query, - .collm_comm_query = mca_coll_han_comm_query, - }, + .collm_init_query = mca_coll_han_init_query, + .collm_comm_query = mca_coll_han_comm_query, + }, /* han-component specifc information */ @@ -87,27 +96,9 @@ mca_coll_han_component_t mca_coll_han_component = { */ static int han_open(void) { - int param; - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune) { - cs->han_auto_tuned = - (selection *) malloc(2 * cs->han_auto_tune_n * cs->han_auto_tune_c * - cs->han_auto_tune_m * sizeof(selection)); - char *filename = "/home/dycz0fx/results/auto/auto_tuned.bin"; - FILE *file = fopen(filename, "r"); - fread(cs->han_auto_tuned, sizeof(selection), - 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file); - fclose(file); - } + /* Get the global coll verbosity: it will be ours */ + mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; - /* - * Get the global coll verbosity: it will be ours - */ - cs->han_output = ompi_coll_base_framework.framework_output; - opal_output_verbose(1, 
cs->han_output, - "coll:han:component_open: done!"); - - cs->topo_level = GLOBAL_COMMUNICATOR; return mca_coll_han_init_dynamic_rules(); } @@ -117,11 +108,6 @@ static int han_open(void) */ static int han_close(void) { - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune && cs->han_auto_tuned != NULL) { - free(cs->han_auto_tuned); - cs->han_auto_tuned = NULL; - } mca_coll_han_free_dynamic_rules(); return OMPI_SUCCESS; } @@ -154,57 +140,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) return "invalid topologic level"; } } -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll) -{ - switch(coll) { - case ALLGATHER: - return "allgather"; - case ALLGATHERV: - return "allgatherv"; - case ALLREDUCE: - return "allreduce"; - case ALLTOALL: - return "alltoall"; - case ALLTOALLV: - return "alltoallv"; - case ALLTOALLW: - return "alltoallw"; - case BARRIER: - return "barrier"; - case BCAST: - return "bcast"; - case EXSCAN: - return "exscan"; - case GATHER: - return "gather"; - case GATHERV: - return "gatherv"; - case REDUCE: - return "reduce"; - case REDUCESCATTER: - return "reduce_scatter"; - case REDUCESCATTERBLOCK: - return "reduce_scatter_block"; - case SCAN: - return "scan"; - case SCATTER: - return "scatter"; - case SCATTERV: - return "scatterv"; - case NEIGHBOR_ALLGATHER: - return "neighbor_allgather"; - case NEIGHBOR_ALLGATHERV: - return "neighbor_allgatherv"; - case NEIGHBOR_ALLTOALL: - return "neighbor_alltoall"; - case NEIGHBOR_ALLTOALLV: - return "neighbor_alltoallv"; - case NEIGHBOR_ALLTOALLW: - return "neighbor_alltoallw"; - default: - return ""; - } -} + /* * Register MCA params @@ -215,15 +151,14 @@ static int han_register(void) mca_coll_han_component_t *cs = &mca_coll_han_component; /* Generated parameters name and description */ - char param_name[100] = ""; - char param_desc[300] = ""; + char param_name[128], param_desc[256]; int param_desc_size; COLLTYPE_T coll; TOPO_LVL_T topo_lvl; COMPONENT_T component; 
cs->han_priority = 0; - (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", + (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); @@ -261,16 +196,14 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module); cs->han_reduce_low_module = 0; (void) mca_base_component_var_register(c, "reduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); cs->han_allreduce_segsize = 524288; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", @@ -283,32 +216,28 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module); cs->han_allreduce_low_module = 0; (void) mca_base_component_var_register(c, "allreduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module); cs->han_allgather_up_module = 0; (void) mca_base_component_var_register(c, "allgather_up_module", "up level module for allgather, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module); 
cs->han_allgather_low_module = 0; (void) mca_base_component_var_register(c, "allgather_low_module", "low level module for allgather, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module); cs->han_gather_up_module = 0; (void) mca_base_component_var_register(c, "gather_up_module", @@ -336,15 +265,7 @@ static int han_register(void) "low level module for scatter, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_scatter_low_module); - - cs->han_auto_tune = 0; - (void) mca_base_component_var_register(c, "auto_tune", - "whether enable auto tune, 0 disable, 1 enable, default 0", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module); cs->han_reproducible = 0; (void) mca_base_component_var_register(c, "reproducible", @@ -353,17 +274,15 @@ static int han_register(void) "0 disable 1 enable, default 0", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reproducible); - + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); /* Simple algorithms MCA parameters */ for(coll = 0 ; coll < COLLCOUNT ; coll++) { cs->use_simple_algorithm[coll] = false; if(is_simple_implemented(coll)) { - snprintf(param_name, 100, "use_simple_%s", - mca_coll_han_colltype_to_str(coll)); - snprintf(param_desc, 300, "whether to enable simple algo for %s", - mca_coll_han_colltype_to_str(coll)); + snprintf(param_name, sizeof(param_name), "use_simple_%s", + mca_coll_base_colltype_to_str(coll)); + snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s", + mca_coll_base_colltype_to_str(coll)); mca_base_component_var_register(c, param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, @@ -374,31 +293,28 @@ static int 
han_register(void) } /* Dynamic rules MCA parameters */ - /* TODO: Find a way to avoid unused entried */ memset(cs->mca_rules, 0, COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); - for(coll = 0 ; coll < COLLCOUNT ; coll++) { + for(coll = 0; coll < COLLCOUNT; coll++) { if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { continue; } /* * Default values - * Do not avoid to set correct default parameters */ cs->mca_rules[coll][INTRA_NODE] = TUNED; cs->mca_rules[coll][INTER_NODE] = BASIC; cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; - for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { + for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) { - snprintf(param_name, 100, "%s_dynamic_%s_module", - mca_coll_han_colltype_to_str(coll), + snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); - param_desc_size = snprintf(param_desc, 300, - "Collective module to use for " - "collective %s on %s topological level: ", - mca_coll_han_colltype_to_str(coll), + param_desc_size = snprintf(param_desc, sizeof(param_desc), + "Collective module to use for %s on %s topological level: ", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); /* * Exhaustive description: @@ -410,10 +326,10 @@ static int han_register(void) /* Han can only be used on the global communicator */ continue; } - param_desc_size += snprintf(param_desc+param_desc_size, 300, + param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size, "%d = %s; ", component, - components_name[component]); + available_components[component].component_name); } mca_base_component_var_register(c, param_name, param_desc, @@ -424,45 +340,11 @@ static int han_register(void) } } - /* - * TODO: remove the following lines when auto-tune is added back to the code - */ - cs->han_auto_tune = 0; - - cs->han_auto_tune_n = 5; - cs->han_auto_tune_c = 3; - cs->han_auto_tune_m = 21; 
-#if 0 - cs->han_auto_tune_n = 5; - (void) mca_base_component_var_register(c, "auto_tune_n", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_n); - - cs->han_auto_tune_c = 3; - (void) mca_base_component_var_register(c, "auto_tune_c", - "auto tune c", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_c); - - cs->han_auto_tune_m = 21; - (void) mca_base_component_var_register(c, "auto_tune_m", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_auto_tune_m); -#endif - /* Dynamic rules */ cs->use_dynamic_file_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "use_dynamic_file_rules", - "Switch used to decide if we use " - "dynamic module choice rules " - "defines by file", + "Enable the dynamic selection provided via the dynamic_rules_filename MCA", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -471,8 +353,7 @@ static int han_register(void) cs->dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dynamic_rules_filename", - "Filename of configuration file that " - "contains the dynamic module choice rules", + "Configuration file containing the dynamic selection rules", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -481,9 +362,7 @@ static int han_register(void) cs->dump_dynamic_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dump_dynamic_rules", - "Switch used to decide if we dump " - "dynamic rules provided by " - "configuration file", + "Switch used to decide if we dump dynamic rules provided by configuration file", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -492,11 +371,8 @@ static int 
han_register(void) if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) && !cs->use_dynamic_file_rules) { opal_output_verbose(0, cs->han_output, - "coll:han:han_register " - "you asked for dynamic rules " - "but they are not activated. " - "Check coll_han_use_dynamic_file_rules " - "MCA parameter"); + "HAN: dynamic rules for collectives are not activated. " + "Check coll_han_use_dynamic_file_rules MCA parameter"); } cs->max_dynamic_errors = 10; diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c index 2cda40e34b..d32b12fbcd 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.c +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -22,31 +22,29 @@ */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) { - switch (coll_id){ - case ALLGATHER: - case ALLGATHERV: - case ALLREDUCE: - case BCAST: - case GATHER: - case REDUCE: - case SCATTER: - return true; - default: - return false; + switch (coll_id) { + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; } } -static COMPONENT_T -component_name_to_id(const char* name) +COMPONENT_T +mca_coll_han_component_name_to_id(const char* name) { - int i; - if(NULL == name) { return -1; } - for(i=SELF ; itopologic_level; mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + int nb_modules = 0; + mca_coll_base_avail_coll_t *item; + /* If the modules are get yet, return success */ if(han_module->storage_initialized) { return OMPI_SUCCESS; @@ -76,7 +75,7 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, mca_coll_base_avail_coll_t) { mca_coll_base_module_t *module = item->ac_module; const char *name = item->ac_component_name; - int id = component_name_to_id(name); + int id = mca_coll_han_component_name_to_id(name); if(id >= 0 && NULL != module && module != han_base_module) { /* @@ 
-85,16 +84,10 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ han_module->modules_storage.modules[id].module_handler = module; opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han found module %s with id %d " - "for topological level %d (%s) " - "for communicator (%d/%s)\n", - name, - id, - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + "coll:han:get_all_coll_modules HAN found module %s with id %d " + "for topological level %d (%s) for communicator (%d/%s)\n", + name, id, topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); nb_modules++; } } @@ -109,16 +102,11 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, } opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han sub-communicator modules storage " - "for topological level %d (%s) " - "gets %d modules " + "coll:han:get_all_coll_modules HAN sub-communicator modules storage " + "for topological level %d (%s) gets %d modules " "for communicator (%d/%s)\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - nb_modules, - comm->c_contextid, - comm->c_name); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, comm->c_contextid, comm->c_name); assert(0 != nb_modules); @@ -133,15 +121,13 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ static const msg_size_rule_t* get_dynamic_rule(COLLTYPE_T collective, - int msg_size, - struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { /* Indexes of the rule */ - int coll_idx; - int topo_idx; - int conf_idx; - int msg_size_idx; + int coll_idx, topo_idx; + int conf_idx, msg_size_idx; /* Aliases */ const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; @@ -157,107 +143,78 @@ get_dynamic_rule(COLLTYPE_T collective, 
/* Find the collective rule */ dynamic_rules = &(mca_coll_han_component.dynamic_rules); - for(coll_idx = dynamic_rules->nb_collectives-1 ; - coll_idx >= 0 ; coll_idx--) { + for(coll_idx = dynamic_rules->nb_collectives-1; + coll_idx >= 0; coll_idx--) { if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { coll_rule = &(dynamic_rules->collective_rules[coll_idx]); break; } } - if(coll_idx < 0) { - /* - * No dynamic rules for this collective - */ + if(coll_idx < 0) { /* No dynamic rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for collective %d (%s) " + "coll:han:get_dynamic_rule HAN searched for collective %d (%s) " "but did not find any rule for this collective\n", - collective, - mca_coll_han_colltype_to_str(collective)); + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the topologic level rule */ - for(topo_idx = coll_rule->nb_topologic_levels-1 ; - topo_idx >= 0 ; topo_idx--) { + for(topo_idx = coll_rule->nb_topologic_levels-1; + topo_idx >= 0; topo_idx--) { if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { topo_rule = &(coll_rule->topologic_rules[topo_idx]); break; } } - if(topo_idx < 0) { - /* - * No topologic level rules for this collective - */ + if(topo_idx < 0) { /* No topologic level rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for topologic level %d (%s) rule " + "coll:han:get_dynamic_rule HAN searched for topologic level %d (%s) rule " "for collective %d (%s) but did not find any rule\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - collective, - mca_coll_han_colltype_to_str(collective)); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the configuration rule */ - for(conf_idx = topo_rule->nb_rules-1 ; - conf_idx 
>= 0 ; conf_idx--) { + for(conf_idx = topo_rule->nb_rules-1; + conf_idx >= 0; conf_idx--) { if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { conf_rule = &(topo_rule->configuration_rules[conf_idx]); break; } } if(conf_idx < 0) { - /* - * No corresponding configuration - * Should not happen with a correct file - */ - + /* No corresponding configuration. Should not have happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " "but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first configuration size of each collective must be 1\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), comm_size); return NULL; } /* Find the message size rule */ - for(msg_size_idx = conf_rule->nb_msg_size-1 ; - msg_size_idx >= 0 ; msg_size_idx--) { + for(msg_size_idx = conf_rule->nb_msg_size-1; + msg_size_idx >= 0; msg_size_idx--) { if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); break; } } if(msg_size_idx < 0) { - /* - * No corresponding message size - * Should not happen with a correct file - */ + /* No corresponding message size. Should not happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message " - "but did not manage to find anything. 
" + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first message size of each configuration must be 0\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size); return NULL; } @@ -268,29 +225,19 @@ get_dynamic_rule(COLLTYPE_T collective, * Module correctness is checked outside */ opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message. " - "Found a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message : component %d (%s)\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size, - msg_size_rule->collective_id, - mca_coll_han_colltype_to_str(msg_size_rule->collective_id), + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message. 
Found a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message : component %d (%s)\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size, msg_size_rule->collective_id, + mca_coll_base_colltype_to_str(msg_size_rule->collective_id), msg_size_rule->topologic_level, mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), msg_size_rule->configuration_size, - msg_size_rule->msg_size, - component, - components_name[component]); + msg_size_rule->msg_size, component, available_components[component].component_name); return msg_size_rule; } @@ -300,14 +247,13 @@ get_dynamic_rule(COLLTYPE_T collective, * for a msg_size sized message on the comm communicator * following the dynamic rules */ -mca_coll_base_module_t * +static mca_coll_base_module_t* get_module(COLLTYPE_T coll_id, - int msg_size, + size_t msg_size, struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module) { const msg_size_rule_t *dynamic_rule; - mca_coll_base_module_t *sub_module = NULL; TOPO_LVL_T topo_lvl; COMPONENT_T mca_rule_component; @@ -323,37 +269,26 @@ get_module(COLLTYPE_T coll_id, han_module); if(NULL != dynamic_rule) { /* Use dynamic rule from file */ - sub_module = han_module->modules_storage - .modules[dynamic_rule->component] - .module_handler; - } else { - /* - * No dynamic rule from file - * Use rule from mca parameter - */ - if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { - /* - * Invalid MCA parameter value - * Warn the user and return NULL - */ - opal_output_verbose(0, mca_coll_han_component.han_output, - "coll:han:get_module " - "Invalid MCA parameter value %d " - "for collective %d (%s) " - "on topologic level %d (%s)\n", - mca_rule_component, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl)); - return NULL; - } - sub_module = 
han_module->modules_storage - .modules[mca_rule_component] - .module_handler; + return han_module->modules_storage.modules[dynamic_rule->component].module_handler; } - - return sub_module; + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { + /* + * Invalid MCA parameter value + * Warn the user and return NULL + */ + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module Invalid MCA parameter value %d " + "for collective %d (%s) on topologic level %d (%s)\n", + mca_rule_component, coll_id, + mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; + } + return han_module->modules_storage.modules[mca_rule_component].module_handler; } @@ -365,38 +300,35 @@ get_module(COLLTYPE_T coll_id, */ int mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; - + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * 
rcount; + } sub_module = get_module(ALLGATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -408,26 +340,17 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: No module found for the sub-" - "communicator. " + "HAN/ALLGATHER: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; } else if (NULL == sub_module->coll_allgather) { /* * No valid collective from dynamic rules @@ -435,62 +358,43 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "coll:han:mca_coll_han_allgather_intra_dynamic HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: the module found for the sub-" - "communicator cannot handle the ALLGATHER operation. " - "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + "HAN/ALLGATHER: the module found for the sub-communicator" + " cannot handle the ALLGATHER operation. 
Falling back to another component\n")); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_allgather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_allgather_fn_t allgather; if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { allgather = mca_coll_han_allgather_intra_simple; } else { allgather = mca_coll_han_allgather_intra; } - - return allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgather = sub_module->coll_allgather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); } @@ -503,30 +407,25 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, const int *rcounts, - const int *displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size, msg_size; - int rank; - int 
verbosity; - int comm_size; - int i; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgatherv_fn_t allgatherv; + int rank, verbosity = 0, comm_size, i; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size, msg_size = 0; /* Compute configuration information for dynamic rules */ comm_size = ompi_comm_size(comm); ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = 0; - for(i = 0 ; i < comm_size ; i++) { + for(i = 0; i < comm_size; i++) { if(dtype_size * rcounts[i] > msg_size) { msg_size = dtype_size * rcounts[i]; } @@ -539,11 +438,7 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -555,26 +450,17 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHERV: No module found for the sub-" - "communicator. " + "HAN/ALLGATHERV: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; } else if (NULL == sub_module->coll_allgatherv) { /* * No valid collective from dynamic rules @@ -583,31 +469,24 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLGATHERV: the module found for the sub-" "communicator cannot handle the ALLGATHERV operation. " "Falling back to another component\n")); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + rbuf, rcounts, displs, + rdtype, comm, + han_module->previous_allgatherv_module); + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid @@ -616,36 +495,28 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han used for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " + "HAN used for collective %d (%s) with topological level %d (%s) " + "on communicator (%d/%s) but this module cannot handle " "this collective on this topologic level\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, 
mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgatherv = sub_module->coll_allgatherv; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgatherv is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - sub_module); + return allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); } @@ -657,39 +528,32 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allreduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allreduce_fn_t allreduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(ALLREDUCE, - msg_size, + dtype_size, comm, han_module); /* First 
errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -701,25 +565,17 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLREDUCE: No module found for the sub-" - "communicator. " + "HAN/ALLREDUCE: No module found for the sub-communicator. 
" + "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; } else if (NULL == sub_module->coll_allreduce) { /* * No valid collective from dynamic rules @@ -728,60 +584,49 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLREDUCE: the module found for the sub-" "communicator cannot handle the ALLREDUCE operation. 
" + "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_allreduce_reproducible(sbuf, rbuf, count, dtype, op, - comm, module); - } - /* - * No fallback mechanism activated for this configuration - * sub_module is valid - * sub_module->coll_allreduce is valid and point to this function - * Call han topological collective algorithm - */ - mca_coll_base_module_allreduce_fn_t allreduce; - if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { - allreduce = mca_coll_han_allreduce_intra_simple; + allreduce = mca_coll_han_allreduce_reproducible; } else { - allreduce = mca_coll_han_allreduce_intra; + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and points to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } } - return allreduce(sbuf, rbuf, count, dtype, - op, comm, module); + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allreduce is valid + * They point to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allreduce = sub_module->coll_allreduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allreduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's 
job is done, call the collective - */ - return sub_module->coll_allreduce(sbuf, rbuf, count, dtype, - op, comm, sub_module); + return allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); } @@ -793,38 +638,31 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_bcast_intra_dynamic(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_bcast_fn_t bcast; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(BCAST, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -836,23 +674,17 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/BCAST: No module found for the sub-" - "communicator. " + "HAN/BCAST: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; } else if (NULL == sub_module->coll_bcast) { /* * No valid collective from dynamic rules @@ -861,61 +693,44 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/BCAST: the module found for the sub-" "communicator cannot handle the BCAST operation. 
" "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_bcast is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_bcast_fn_t bcast; if(mca_coll_han_component.use_simple_algorithm[BCAST]) { bcast = mca_coll_han_bcast_intra_simple; } else { bcast = mca_coll_han_bcast_intra; } - return bcast(buff, - count, - dtype, - root, - comm, - module); + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + bcast = sub_module->coll_bcast; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_bcast is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_bcast(buff, - count, - dtype, - root, - comm, - sub_module); + return bcast(buff, count, dtype, + root, comm, sub_module); } @@ -927,39 +742,37 @@ mca_coll_han_bcast_intra_dynamic(void *buff, */ int mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int 
rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_gather_fn_t gather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } sub_module = get_module(GATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -971,26 +784,17 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/GATHER: No module found for the sub-" - "communicator. " + "HAN/GATHER: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; } else if (NULL == sub_module->coll_gather) { /* * No valid collective from dynamic rules @@ -999,62 +803,45 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/GATHER: the module found for the sub-" "communicator cannot handle the GATHER operation. 
" "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_gather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_gather_fn_t gather; if(mca_coll_han_component.use_simple_algorithm[GATHER]) { gather = mca_coll_han_gather_intra_simple; } else { gather = mca_coll_han_gather_intra; } - - - return gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + gather = sub_module->coll_gather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_gather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } @@ -1066,40 +853,33 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_reduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(REDUCE, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1111,25 +891,17 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/REDUCE: No module found for the sub-" - "communicator. " + "HAN/REDUCE: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } else if (NULL == sub_module->coll_reduce) { /* * No valid collective from dynamic rules @@ -1138,60 +910,51 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: the module found for the sub-" "communicator cannot handle the REDUCE operation. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_reduce_reproducible(sbuf, rbuf, count, dtype, op, - root, comm, module); - } - /* - * No fallback mechanism activated for this configuration - * sub_module is valid - * sub_module->coll_reduce is valid and point to this function - * Call han topological collective algorithm - */ - mca_coll_base_module_reduce_fn_t reduce; - if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { - reduce = mca_coll_han_reduce_intra_simple; + reduce = mca_coll_han_reduce_reproducible; } else { - reduce = mca_coll_han_reduce_intra; + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } } - return reduce(sbuf, rbuf, count, dtype, - op, root, comm, module); + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_reduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + reduce = sub_module->coll_reduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_reduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_reduce(sbuf, rbuf, count, dtype, - op, root, comm, sub_module); + return 
reduce(sbuf, rbuf, count, dtype, + op, root, comm, sub_module); } @@ -1203,39 +966,32 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_scatter_fn_t scatter; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = dtype_size * rcount; + dtype_size = dtype_size * rcount; sub_module = get_module(SCATTER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1247,26 +1003,17 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). 
" + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/SCATTER: No module found for the sub-" - "communicator. " + "HAN/SCATTER: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; } else if (NULL == sub_module->coll_scatter) { /* * No valid collective from dynamic rules @@ -1275,38 +1022,26 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. 
" "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/SCATTER: the module found for the sub-" "communicator cannot handle the SCATTER operation. " "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_scatter is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_scatter_fn_t scatter; scatter = mca_coll_han_scatter_intra; /* * TODO: Uncomment when scatter simple is merged @@ -1316,10 +1051,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * scatter = mca_coll_han_scatter_intra; * } */ - return scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + scatter = sub_module->coll_scatter; } /* @@ -1329,10 +1062,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * They points to the collective to use, according to the dynamic rules * Selector's job is done, call the collective */ - return sub_module->coll_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } - - diff --git a/ompi/mca/coll/han/coll_han_dynamic.h 
b/ompi/mca/coll/han/coll_han_dynamic.h index 979b292ba0..0ccecb63ba 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * * $COPYRIGHT$ @@ -27,9 +30,9 @@ * ################################################# * * Han dynamic rules allow the user to define the collective - * module to call depending the topological configuration of the + * module to call depending on the topological configuration of the * sub-communicators and the collective parameters. This mechanism - * can also be used to fallback the main collective on another module. + * can also be used to fallback to the main collective on another module. * The interface is described in coll_han_dynamic_file.h. * * ############################# @@ -39,7 +42,7 @@ * directly accesses the module on the communicator. This information is * stored in the collective structure of the communicator during the collective * module choice at the communicator initialization. When han needs this - * information for the first time, it identifies the modles by their name and + * information for the first time, it identifies the modules by their name and * stores them in its module structure. * Then, the modules are identified by their identifier. * @@ -69,7 +72,7 @@ * adds an indirection on the collective call: dynamic choice functions. These * functions do not implement any collective. First, they try to find a dynamic * rule from file for the given collective. If there is not any rule for the - * fiven configuration, MCA parameter defined rules are used. Once the module + * given configuration, MCA parameter defined rules are used. Once the module * to use is found, the correct collective implementation is called. 
* * This indirection is also used on the global communicator. This allows han @@ -92,11 +95,9 @@ * by increasing value, some of them will not be considered */ -BEGIN_C_DECLS - /* Dynamic rules support */ typedef enum COMPONENTS { - SELF=0, + SELF = 0, BASIC, LIBNBC, TUNED, @@ -107,18 +108,17 @@ typedef enum COMPONENTS { COMPONENTS_COUNT } COMPONENT_T; -static const char *components_name[]={"self", - "basic", - "libnbc", - "tuned", - "sm", - "shared", - "adapt", - "han"}; +typedef struct { + COMPONENT_T id; + char* component_name; + mca_coll_base_component_t* component; +} ompi_coll_han_components; + +extern ompi_coll_han_components available_components[COMPONENTS_COUNT]; /* Topologic levels */ typedef enum TOPO_LVL { - INTRA_NODE=0, + INTRA_NODE = 0, INTER_NODE, /* Identifies the global communicator as a topologic level */ GLOBAL_COMMUNICATOR, @@ -135,7 +135,7 @@ typedef struct msg_size_rule_s { int configuration_size; /* Message size of the rule */ - int msg_size; + size_t msg_size; /* Component to use on this specific configuration * and message size */ @@ -209,6 +209,6 @@ typedef struct mca_coll_han_collective_modules_storage_s { /* Tests if a dynamic collective is implemented */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); +COMPONENT_T mca_coll_han_component_name_to_id(const char* name); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index d163071edc..e6673cf941 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -26,11 +26,14 @@ #include "ompi/mca/coll/base/coll_base_util.h" +#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval) +#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval) + static void check_dynamic_rules(void); /* Current file line for 
verbose message */ static int fileline = 1; -#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) int mca_coll_han_init_dynamic_rules(void) @@ -38,31 +41,31 @@ mca_coll_han_init_dynamic_rules(void) /* File management */ const char *fname; FILE *fptr = NULL; - int nb_entries = 0; + int nb_entries = 0, rc; /* Loop counters */ int i, j, k, l; /* Collective informations */ - int nb_coll; - COLLTYPE_T coll_id; + long nb_coll, coll_id; + char * coll_name = NULL; collective_rule_t *coll_rules; /* Topo informations */ - int nb_topo; - TOPO_LVL_T topo_lvl; + long nb_topo, topo_lvl; topologic_rule_t *topo_rules; /* Configuration informations */ - int nb_rules, conf_size; + long nb_rules, conf_size; configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + long nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ - COMPONENT_T component; + long component; /* If the dynamic rules are not used, do not even read the file */ if(!mca_coll_han_component.use_dynamic_file_rules) { @@ -70,47 +73,31 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_SUCCESS; } - fname = mca_coll_han_component.dynamic_rules_filename; - - if(NULL == fname) { + if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "coll_han_use_dynamic_file_rules is true but " - "coll_han_dynamic_rules_filename is not set: " - "coll han will use dynamic rules from mca " - "parameters and their default value\n"); + "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but " + "coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n"); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } - fptr = fopen(fname, "r"); - - if(NULL == fptr) { + if( NULL == (fptr = fopen(fname, "r")) ) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "cannot open dynamic file provided by " - "coll_han_dynamic_rules_filename=%s " - "please provide it with full path and " - "check file permissions. Rules from " - "MCA parameters will be used instead\n", + "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by " + "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and " + "check file permissions. Rules from MCA parameters will be used instead\n", fname); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } /* The first information of the file is the collective count */ - nb_coll = getnext(fptr); - - if(nb_coll <= 0) { + if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for collective count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for collective count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_coll); + fname, fileline, nb_coll); mca_coll_han_component.dynamic_rules.nb_collectives = 0; goto file_reading_error; } @@ -126,69 +113,65 @@ mca_coll_han_init_dynamic_rules(void) } /* Iterates on collective rules */ - for(i=0 ; i= COLLCOUNT) { + if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid collective id %d at line %d: the collective " - "must be at least %d and less than %d\n", - coll_id, - fileline, - ALLGATHER, - COLLCOUNT); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." 
+ "The rest of the input file will be ignored.\n", + fileline); goto file_reading_error; } + coll_id = mca_coll_base_name_to_colltype(coll_name); + if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) { + /* maybe the file was in the old format and we read the collective index instead of the name. */ + char* endp; + coll_id = strtol(coll_name, &endp, 10); + if( '\0' != *endp ) { /* there is garbage in the input */ + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid collective %s " + "at line %d: the collective must be at least %d and less than %d. " + "The rest of the input file will be ignored.\n", + coll_name, fileline, ALLGATHER, COLLCOUNT); + goto file_reading_error; + } + free(coll_name); + coll_name = mca_coll_base_colltype_to_str(coll_id); + } if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "read collective id %d at line %d " - "but this collective is not implemented yet. " - "This is not an error but this set of rules " - "will not be used\n", - fname, - coll_id, - fileline); + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "read collective id %ld at line %d but this collective is not implemented yet. 
" + "This is not an error but this set of rules will not be used\n", + fname, coll_id, fileline); } /* * The first information of a collective rule * is the number of topologic rules */ - nb_topo = getnext(fptr); - if(nb_topo < 0) { + if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_topo); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_topo); goto file_reading_error; } /* Store the collective rule informations */ - coll_rules[i].collective_id = coll_id; coll_rules[i].nb_topologic_levels = nb_topo; + coll_rules[i].collective_id = (COLLTYPE_T)coll_id; if(0 == nb_topo) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count\n", - fname, - fileline, - nb_topo); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count\n", + fname, fileline, nb_topo); continue; } @@ -197,30 +180,21 @@ mca_coll_han_init_dynamic_rules(void) coll_rules[i].topologic_rules = topo_rules; if(NULL == topo_rules) { coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterates on topologic rules */ - for(j=0 ; j= NB_TOPO_LVL) { + if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid topo level %d is given " - "or the reader encountered an unexpected EOF. " - "Topologic level must be at least %d and " - "less than %d\n", - fname, - fileline, - topo_lvl, - INTRA_NODE, - NB_TOPO_LVL); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. " + "Topologic level must be at least %d and less than %d\n", + fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL); goto file_reading_error; } @@ -228,38 +202,26 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a topologic rule * is the number of configurations */ - nb_rules = getnext(fptr); - - if(nb_rules < 0) { + nb_rules = -1; + if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_rules); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_rules); goto file_reading_error; } /* Store the topologic rule informations */ topo_rules[j].collective_id = coll_id; - topo_rules[j].topologic_level = topo_lvl; + topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl; topo_rules[j].nb_rules = nb_rules; if(0 == nb_rules) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for configuration rules count\n", - fname, - fileline, - nb_rules); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for configuration rules count\n", + fname, fileline, nb_rules); continue; } @@ -268,32 +230,21 @@ mca_coll_han_init_dynamic_rules(void) topo_rules[j].configuration_rules = conf_rules; if(NULL == conf_rules) { topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on configuration rules */ - for(k=0 ; k 1)) { + /* Get the configuration size */ + if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid configuration size %d at line %d " - "or the reader encountered an unexpected EOF " - "the configuration size must be at least %d " - "and the first configuration size " - "of a topologic level must be %d\n", - conf_size, - fileline, - 1, - 1); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d " + "or the reader encountered an unexpected EOF the configuration size must be at least %d " + "and the first configuration size of a topologic level must be %d\n", + conf_size, fileline, 1, 1); goto file_reading_error; } @@ -301,21 +252,12 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a configuration rule * is the number of message size rules */ - nb_msg_size = getnext(fptr); - if(nb_msg_size < 0) { + if( (getnext_long(fptr, &nb_msg_size) < 0) 
|| (nb_msg_size < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_msg_size); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_msg_size); goto file_reading_error; } @@ -327,13 +269,9 @@ mca_coll_han_init_dynamic_rules(void) if(0 == nb_msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for message size rules count\n", - fname, - fileline, - nb_msg_size); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count\n", + fname, fileline, nb_msg_size); continue; } @@ -342,88 +280,99 @@ mca_coll_han_init_dynamic_rules(void) conf_rules[k].msg_size_rules = msg_size_rules; if(NULL == msg_size_rules) { conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on message size rules */ - for(l=0 ; l 1)) { + rc = getnext_size_t(fptr, &msg_size); + if( (rc < 0) || + (0 == l && msg_size > 1)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size " - "or the reader encountered " - 
"an unexpected EOF. " - "The first message size rule of " - "a configuration must be 0\n", - fname, - fileline, - msg_size); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %" PRIsize_t " is given for message size " + "or the reader encountered an unexpected EOF. " + "The first message size rule of a configuration must be 0\n", + fname, fileline, msg_size); goto file_reading_error; } /* Get the component identifier for this message size rule */ - component = getnext(fptr); - if(component < SELF || component >= COMPONENTS_COUNT) { + if( getnext_string(fptr, &target_comp_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid collective " - "component id %d is given or the " - "reader encountered an unexpected EOF. " - "Collective component id must be at " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: cannot read the name of a collective component\n", + fname, fileline); + goto file_reading_error; + } + component = mca_coll_han_component_name_to_id(target_comp_name); + if( (component < SELF) || (component >= COMPONENTS_COUNT) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid collective component name %s was given or the " + "reader encountered an unexpected EOF. 
Collective component id must be at " "least %d and less than %d\n", - fname, - fileline, - component, - SELF, - COMPONENTS_COUNT); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); + free(target_comp_name); goto file_reading_error; } - /* Store message size rule informations */ + /* Store message size rule information */ msg_size_rules[l].collective_id = coll_id; msg_size_rules[l].topologic_level = topo_lvl; msg_size_rules[l].configuration_size = conf_size; msg_size_rules[l].msg_size = msg_size; - msg_size_rules[l].component = component; + msg_size_rules[l].component = (COMPONENT_T)component; nb_entries++; + /* do we have the optional segment length */ + if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n"); + long seglength; + if( 0 != topo_lvl ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found segment lengths for topological collective at level != 0 " + "for collective %s component %s. 
These values will be ignored.\n", + fname, fileline, coll_name, target_comp_name); + } + while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) { + if( getnext_long(fptr, &seglength) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found end of file while reading the optional list " + "of segment lengths for collective %s component %s\n", + fname, fileline, coll_name, target_comp_name); + free(target_comp_name); + goto file_reading_error; + } + } + } + free(target_comp_name); } } } + if( NULL != coll_name ) { + free(coll_name); + coll_name = NULL; + } } - if(MYEOF != getnext(fptr)) { + if( getnext_long(fptr, &nb_coll) > 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on file %s at line %d: " - "rule reading is over but reader does not seem " - "to have reached the end of the file\n", - fname, - fileline); + "coll:han:mca_coll_han_init_dynamic_rules. Warning on file %s at line %d: " + "rule reading is over but reader does not seem to have reached the end of the file\n", + fname, fileline); } opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "read %d rules from %s\n", - nb_entries, - fname); + "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n", + nb_entries, fname); if(mca_coll_han_component.dump_dynamic_rules) { mca_coll_han_dump_dynamic_rules(); @@ -447,6 +396,9 @@ cannot_allocate: return OMPI_ERROR; file_reading_error: + if( NULL != coll_name ) { + free(coll_name); + } opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "could not fully read dynamic rules file. 
" @@ -531,7 +483,8 @@ static void check_dynamic_rules(void) configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + int nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ @@ -540,73 +493,49 @@ static void check_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i=1 && conf_rules[k-1].configuration_size > conf_size) { + if( k >= 1 && conf_rules[k-1].configuration_size > conf_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d: " - "configuration sizes %d and %d are " - "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_rules[k-1].configuration_size, - conf_size); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d: " + "configuration sizes %d and %d are not sorted by increasing value\n", + coll_id, topo_lvl, conf_rules[k-1].configuration_size, conf_size); } - for(l=0 ; l=1 && msg_size_rules[l-1].msg_size > msg_size) { + if( l >= 1 && msg_size_rules[l-1].msg_size > msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d: " - "message sizes %d and %d are " + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d: " + "message sizes %" PRIsize_t " and %" PRIsize_t " are " "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_size, - msg_size_rules[l-1].msg_size, - msg_size); + coll_id, topo_lvl, conf_size, msg_size_rules[l-1].msg_size, msg_size); } - if(HAN == component - && GLOBAL_COMMUNICATOR != 
topo_lvl) { + if( (HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d " - "for message size %d: " - "han collective component %d " - "can only be activated for " - "topology level %d\n", - coll_id, - topo_lvl, - conf_size, - msg_size, - HAN, - GLOBAL_COMMUNICATOR); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d " + "for message size %" PRIsize_t ": han collective component %d " + "can only be activated for topology level %d\n", + coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR); } } } @@ -618,9 +547,6 @@ void mca_coll_han_dump_dynamic_rules(void) { int nb_entries = 0; - /* Loop counters */ - int i, j, k, l; - /* Collective informations */ int nb_coll; COLLTYPE_T coll_id; @@ -645,42 +571,32 @@ void mca_coll_han_dump_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i collective component %d (%s)\n", - nb_entries, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - conf_size, - msg_size, - component, - components_name[component]); + "coll:han:dump_dynamic_rules %d collective %d (%s) " + "topology level %d (%s) configuration size %d " + "mesage size %d -> collective component %d (%s)\n", + nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size, + msg_size, component, available_components[component].component_name); nb_entries++; } diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h index 846b9b74cc..b61ba0c5d8 100644 --- 
a/ompi/mca/coll/han/coll_han_dynamic_file.h +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * * $COPYRIGHT$ @@ -60,10 +63,9 @@ * communicator and the corresponding level for sub-communicators * created by han. * - Configuration size: - * The configuration size is the number of elements of the actual - * topology level in the upper topology level. For example, if - * topology levels are intra-node and inter-node, it can be the - * number of MPI ranks per node or the number of nodes in the global + * The configuration size is the number of elements in a topology level. + * For example, if topology levels are intra-node and inter-node, it can + * be the number of MPI ranks per node or the number of nodes in the global * communicator. For the GLOBAL_COMMUNICATOR topologic level, * the configuration size is the communicator size. * - Message_size Component: @@ -101,11 +103,8 @@ * the reader. 
*/ -BEGIN_C_DECLS - int mca_coll_han_init_dynamic_rules(void); void mca_coll_han_free_dynamic_rules(void); void mca_coll_han_dump_dynamic_rules(void); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 2cbd6d976c..14b87bde92 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -16,40 +16,45 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_gather_lg_task(void *task_args); +static int mca_coll_han_gather_ug_task(void *task_args); + /* only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = 
cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->req = req; } int @@ -61,50 +66,56 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; int w_rank, w_size; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ char *reorder_buf = NULL, *reorder_rbuf = NULL; - ptrdiff_t rsize, rgap = 0, rextent; - int *vranks, low_rank, low_size; - int * topo; - + int i, err, *vranks, low_rank, low_size, *topo; ompi_request_t *temp_request = NULL; - w_rank = ompi_comm_rank(comm); - w_size = ompi_comm_size(comm); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ topo = mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; /* create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; /* Get the 'virtual ranks' mapping correspondong to the communicators */ vranks = han_module->cached_vranks; @@ -115,10 +126,9 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", - w_rank, root, root_low_rank, root_up_rank)); + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); - ompi_datatype_type_extent(rdtype, &rextent); /* Allocate reorder buffers */ if (w_rank == root) { @@ -127,17 +137,30 @@ mca_coll_han_gather_intra(const 
void *sbuf, int scount, * in a increasing order for both patterns */ if (han_module->is_mapbycore) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather is_bycore: ", w_rank)); + "[%d]: Han Gather is_bycore: ", w_rank)); reorder_rbuf = (char *)rbuf; } else { /* Need a buffer to store unordered final result */ + ptrdiff_t rsize, rgap; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * w_size, &rgap); reorder_buf = (char *)malloc(rsize); //TODO:free /* rgap is the size of unused space at the start of the datatype */ reorder_rbuf = reorder_buf - rgap; + + if (MPI_IN_PLACE == sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; + ptrdiff_t src_shift = block_size * w_rank; + ptrdiff_t dest_shift = block_size * w_rank; + ompi_datatype_copy_content_same_ddt(rdtype, + (ptrdiff_t)rcount, + (char *)rbuf + dest_shift, + reorder_rbuf + src_shift); + } } } @@ -145,12 +168,12 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, /* Create lg task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_gather_argu_t *lg_argu = malloc(sizeof(mca_gather_argu_t)); - mac_coll_han_set_gather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, + mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t)); + mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, - low_comm, w_rank, low_rank != root_low_rank, temp_request); + low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request); /* Init lg task */ - init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_argu)); + init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args)); /* Issure lg task */ issue_task(lg); @@ -166,19 +189,21 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, */ /* reorder rbuf based on rank */ 
if (w_rank == root && !han_module->is_mapbycore) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); for (i=0; iw_rank)); - OBJ_RELEASE(t->cur_task); + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } /* If the process is one of the node leader */ char *tmp_buf = NULL; @@ -201,33 +234,45 @@ int mca_coll_han_gather_lg_task(void *task_argu) /* if the process is one of the node leader, allocate the intermediary * buffer to gather on the low sub communicator */ int low_size = ompi_comm_size(t->low_comm); + int low_rank = ompi_comm_rank(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, - (int64_t)t->rcount * low_size, - &rgap); + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (t->w_rank == t->root) { + if (MPI_IN_PLACE == t->sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(dtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)count; + ptrdiff_t src_shift = block_size * t->w_rank; + ptrdiff_t dest_shift = block_size * low_rank; + ompi_datatype_copy_content_same_ddt(dtype, + (ptrdiff_t)count, + tmp_rbuf + dest_shift, + (char *)t->rbuf + src_shift); + } + } } - /* shared memory node gather */ + /* Low level (usually intra-node or shared memory) node gather */ t->low_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount, - t->sdtype, - tmp_rbuf, - t->rcount, - t->rdtype, - t->root_low_rank, - t->low_comm, - t->low_comm->c_coll->coll_gather_module); + count, + dtype, + tmp_rbuf, + count, + dtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); /* Prepare up comm gather */ t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create ug (upper level all-gather) task */ - mca_coll_task_t *ug = OBJ_NEW(mca_coll_task_t); - /* Setup ug task arguments */ - t->cur_task = 
ug; + mca_coll_task_t *ug = t->cur_task; /* Init ug task */ init_task(ug, mca_coll_han_gather_ug_task, (void *) t); /* Issure ug task */ @@ -237,26 +282,37 @@ int mca_coll_han_gather_lg_task(void *task_argu) } /* ug: upper level (intra-node) gather task */ -int mca_coll_han_gather_ug_task(void *task_argu) +int mca_coll_han_gather_ug_task(void *task_args) { - mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; + mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args; OBJ_RELEASE(t->cur_task); if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: ug noop\n", t->w_rank)); } else { + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } + + int low_size = ompi_comm_size(t->low_comm); /* inter node gather */ t->up_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount*low_size, - t->sdtype, - (char *)t->rbuf, - t->rcount*low_size, - t->rdtype, - t->root_up_rank, - t->up_comm, - t->up_comm->c_coll->coll_gather_module); + count*low_size, + dtype, + (char *)t->rbuf, + count*low_size, + dtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); if (t->sbuf_inter_free != NULL) { free(t->sbuf_inter_free); @@ -274,36 +330,56 @@ int mca_coll_han_gather_ug_task(void *task_argu) /* only work with regular situation (each node has equal number of processes) */ int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int 
*topo, w_rank = ompi_comm_rank(comm); int w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - /* Topo must be initialized to know rank distribution which then is used to - * determine if han can be used */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); - - /* Here root needs to reach all nodes on up_comm. - * But in case of unbalance some up_comms are smaller, - * as the comm_split is made on the base of low_rank */ - if (han_module->are_ppn_imbalanced){ + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } - /* create the subcommunicators */ - mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + ompi_datatype_t *dtype; + size_t count; + + if (w_rank == root) { + dtype = rdtype; + count = rcount; + } else { + dtype = sdtype; + count = scount; + } + /* Get the 'virtual ranks' mapping corresponding to the communicators */ int *vranks = han_module->cached_vranks; @@ -325,11 +401,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, reorder_buf_start = (char *)rbuf; } else { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather needs reordering: ", w_rank)); + "[%d]: Future Gather needs reordering: ", w_rank)); ptrdiff_t rgap = 0; ptrdiff_t rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * w_size, - &rgap); + (int64_t)rcount * w_size, + &rgap); reorder_buf = (char *)malloc(rsize); /* rgap is the size of unused space at the start of the datatype */ reorder_buf_start = reorder_buf - rgap; @@ -338,40 +414,40 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, } /* allocate the intermediary buffer - * * to gather on leaders on the low sub communicator */ + * to gather on leaders on the low sub communicator */ char *tmp_buf = NULL; // allocated memory char *tmp_buf_start = NULL; // start of the data if (low_rank == root_low_rank) { - ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * low_size, - &rgap); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; } /* 1. 
low gather on nodes leaders */ low_comm->c_coll->coll_gather((char *)sbuf, - scount, - sdtype, - tmp_buf_start, - rcount, - rdtype, - root_low_rank, - low_comm, - low_comm->c_coll->coll_gather_module); + count, + dtype, + tmp_buf_start, + count, + dtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); /* 2. upper gather (inter-node) between node leaders */ if (low_rank == root_low_rank) { up_comm->c_coll->coll_gather((char *)tmp_buf_start, - scount*low_size, - sdtype, - (char *)reorder_buf_start, - rcount*low_size, - rdtype, - root_up_rank, - up_comm, - up_comm->c_coll->coll_gather_module); + count*low_size, + dtype, + (char *)reorder_buf_start, + count*low_size, + dtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); if (tmp_buf != NULL) { free(tmp_buf); @@ -379,7 +455,7 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, tmp_buf_start = NULL; } OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug gather finish\n", t->w_rank)); + "[%d] Future Gather: ug gather finish\n", w_rank)); } /* 3. 
reorder data on root into rbuf @@ -388,8 +464,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ if (w_rank == root && !han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); } @@ -408,28 +484,28 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo) { - int i; - int topolevel = 2; // always 2 levels in topo - int w_rank = ompi_comm_rank(comm); - int w_size = ompi_comm_size(comm); - ptrdiff_t rextent; - ompi_datatype_type_extent(rdtype, &rextent); - for (i=0; ifallback.NAME.NAME = NULL; \ + (HANDLE)->fallback.NAME.module = NULL; \ + } while (0) + /* * Module constructor */ static void han_module_clear(mca_coll_han_module_t *han_module) { - int i; + CLEAN_PREV_COLL(han_module, allgather); + CLEAN_PREV_COLL(han_module, allgatherv); + CLEAN_PREV_COLL(han_module, allreduce); + CLEAN_PREV_COLL(han_module, bcast); + CLEAN_PREV_COLL(han_module, reduce); + CLEAN_PREV_COLL(han_module, gather); + CLEAN_PREV_COLL(han_module, scatter); - for (i = 0; i < COLLCOUNT; i++) { - /* - * Since the previous routines function pointers are declared as - * a union, initializing the dummy routineis enough - */ - han_module->previous_routines[i].previous_routine.dummy = NULL; - han_module->previous_routines[i].previous_module = NULL; - } han_module->reproducible_reduce = NULL; han_module->reproducible_reduce_module = NULL; han_module->reproducible_allreduce = NULL; @@ -50,19 +54,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) { int i; - module->enabled = false; + module->enabled = true; module->super.coll_module_disable = mca_coll_han_module_disable; - module->cached_comm = NULL; module->cached_low_comms = NULL; module->cached_up_comms = NULL; 
module->cached_vranks = NULL; module->cached_topo = NULL; module->is_mapbycore = false; module->storage_initialized = false; - for (i = 0 ; i < NB_TOPO_LVL ; i++) { + for( i = 0; i < NB_TOPO_LVL; i++ ) { module->sub_comm[i] = NULL; } - for (i=SELF ; imodules_storage.modules[i].module_handler = NULL; } @@ -72,16 +75,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) } -#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ - if (NULL != (obj)) { \ - OBJ_RELEASE(obj); \ - } \ -} while (0) +#define OBJ_RELEASE_IF_NOT_NULL(obj) \ + do { \ + if (NULL != (obj)) { \ + OBJ_RELEASE(obj); \ + } \ + } while (0) /* * Module destructor */ -static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) +static void +mca_coll_han_module_destruct(mca_coll_han_module_t * module) { int i; @@ -126,7 +131,6 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) han_module_clear(module); } - OBJ_CLASS_INSTANCE(mca_coll_han_module_t, mca_coll_base_module_t, mca_coll_han_module_construct, @@ -155,6 +159,8 @@ int mca_coll_han_init_query(bool enable_progress_threads, mca_coll_base_module_t * mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) { + int flag; + char info_val[OPAL_MAX_INFO_VAL+1]; mca_coll_han_module_t *han_module; /* @@ -172,7 +178,13 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) comm->c_contextid, comm->c_name); return NULL; } - + if( !ompi_group_have_remote_peers(comm->c_local_group) ) { + /* The group only contains local processes. Disable HAN for now */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. 
*/ *priority = mca_coll_han_component.han_priority; @@ -189,52 +201,46 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) } /* All is good -- return a module */ - han_module->topologic_level = mca_coll_han_component.topo_level; + han_module->topologic_level = GLOBAL_COMMUNICATOR; + + if (NULL != comm->super.s_info) { + /* Get the info value selecting the HAN topologic level of this communicator */ + opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level", + sizeof(info_val), info_val, &flag); + + if (flag) { + if (0 == strcmp(info_val, "INTER_NODE")) { + han_module->topologic_level = INTER_NODE; + } else { + han_module->topologic_level = INTRA_NODE; + } + } + } + + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatterv = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; - /* - * TODO: When the selector is fully implemented, - * this if will be meaningless - */ if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { /* We are on the global communicator, return topological algorithms */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; 
han_module->super.coll_allgatherv = NULL; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } else { /* We are on a topologic sub-communicator, return only the selector */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } opal_output_verbose(10, ompi_coll_base_framework.framework_output, @@ -247,28 +253,28 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) /* * In 
this macro, the following variables are supposed to have been declared * in the caller: - * . ompi_communicator_t *comm + * . ompi_communicator_t *comm * . mca_coll_han_module_t *han_module - */ -#define HAN_SAVE_PREV_COLL_API(__api) do { \ - han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ - han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ - if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ - opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ - "(%d/%s): no underlying " # __api"; disqualifying myself", \ - comm->c_contextid, comm->c_name); \ - return OMPI_ERROR; \ - } \ - /* TODO add a OBJ_RELEASE at module disabling */ \ - /* + FIXME find why releasing generates memory corruption */ \ - OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + */ +#define HAN_SAVE_PREV_COLL_API(__api) \ + do { \ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + goto handle_error; \ + } \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ } while(0) /* * Init module on the communicator */ -static int han_module_enable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) +static int +han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; @@ -285,13 +291,25 @@ static int han_module_enable(mca_coll_base_module_t * module, mca_coll_han_allreduce_reproducible_decision(comm, module); return OMPI_SUCCESS; + +handle_error: + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + 
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + return OMPI_ERROR; } /* * Module disable */ -static int mca_coll_han_module_disable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) +static int +mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index d0dc337ce8..03968b6f47 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -15,33 +15,38 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, - int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_reduce_t0_task(void *task_args); +static int mca_coll_han_reduce_t1_task(void *task_args); + +static inline void +mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, + int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop, bool is_tmp_rbuf) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = 
rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->is_tmp_rbuf = is_tmp_rbuf; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: +/* + * Each segment of the message needs to go through 2 steps to perform MPI_Reduce: * lb: low level (shared-memory or intra-node) reduce. * ub: upper level (inter-node) reduce * Hence, in each iteration, there is a combination of collective operations which is called a task. 
@@ -53,49 +58,62 @@ void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cu * iter 4 | | | | ur | task: t1, contains ur */ int -mca_coll_han_reduce_intra(const void *sbuf, +mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - /* Do not initialize topology if the operation cannot commute */ - if(!ompi_op_is_commute(op)){ + ptrdiff_t extent, lb; + int seg_count = count, w_rank; + size_t dtype_size; + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); + "han cannot handle reduce with this operation. Fall back on another component\n")); goto prev_reduce_intra; } + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all modules */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); + ompi_datatype_get_extent(dtype, &lb, &extent); + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; /* use MCA parameters for now */ low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, typelng, + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size, seg_count); int num_segments = (count + seg_count - 1) / seg_count; @@ -106,6 +124,7 @@ mca_coll_han_reduce_intra(const void *sbuf, int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); int root_low_rank; int root_up_rank; @@ -114,14 +133,22 @@ mca_coll_han_reduce_intra(const void *sbuf, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); + void *tmp_rbuf = rbuf; + void *tmp_rbuf_to_free = NULL; + if (low_rank == root_low_rank && root_up_rank != up_rank) { + /* allocate 2 segments on node leaders that are not the global root */ + tmp_rbuf = malloc(2*extent*seg_count); + tmp_rbuf_to_free = tmp_rbuf; + } + /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_reduce_argu_t *t = malloc(sizeof(mca_reduce_argu_t)); - mac_coll_han_set_reduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, + mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t)); + mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, 
seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, - low_rank != root_low_rank); + low_rank != root_low_rank, (NULL != tmp_rbuf_to_free)); /* Init the first task */ init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); issue_task(t0); @@ -140,7 +167,9 @@ mca_coll_han_reduce_intra(const void *sbuf, /* Setup up t1 task arguments */ t->cur_task = t1; t->sbuf = (char *) t->sbuf + extent * t->seg_count; - t->rbuf = (char *) t->rbuf + extent * t->seg_count; + if (up_rank == root_up_rank) { + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + } t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); @@ -148,19 +177,20 @@ mca_coll_han_reduce_intra(const void *sbuf, } free(t); + free(tmp_rbuf_to_free); return OMPI_SUCCESS; -prev_reduce_intra: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, comm, han_module->previous_reduce_module); } /* t0 task: issue and wait for the low level reduce of segment 0 */ -int mca_coll_han_reduce_t0_task(void *task_argu) +int mca_coll_han_reduce_t0_task(void *task_args) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); @@ -173,41 +203,55 @@ int mca_coll_han_reduce_t0_task(void *task_argu) } /* t1 task */ -int mca_coll_han_reduce_t1_task(void *task_argu) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; +int mca_coll_han_reduce_t1_task(void *task_args) { + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; + int cur_seg = t->cur_seg; ompi_datatype_get_extent(t->dtype, 
&lb, &extent); ompi_request_t *ireduce_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { + int tmp_count = t->seg_count; + if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } int up_rank = ompi_comm_rank(t->up_comm); /* ur of cur_seg */ if (up_rank == t->root_up_rank) { - t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } else { - t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + /* this is a node leader that is not root so alternate between the two allocated segments */ + char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count); + t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } } /* lr of cur_seg+1 */ - if (t->cur_seg <= t->num_segments - 2) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + int next_seg = cur_seg + 1; + if (next_seg <= t->num_segments - 1) { + int tmp_count = t->seg_count; + char *tmp_rbuf = NULL; + if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { tmp_count = t->last_seg_count; } + if (t->is_tmp_rbuf) { + tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count); + } else if (NULL != t->rbuf) { + tmp_rbuf = (char*)t->rbuf + extent * t->seg_count; + } t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, - (char *) t->rbuf + extent * t->seg_count, tmp_count, + (char *) tmp_rbuf, tmp_count, t->dtype, t->op, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_reduce_module); } if (!t->noop && ireduce_req) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, 
MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -217,13 +261,13 @@ int mca_coll_han_reduce_t1_task(void *task_argu) { * a fallback is made on the next component that provides a reduce in priority order */ int mca_coll_han_reduce_intra_simple(const void *sbuf, - void* rbuf, - int count, - struct ompi_datatype_t *dtype, - ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int w_rank; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ @@ -234,23 +278,37 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - /* Do not initialize topology if the operation cannot commute */ + /* No support for non-commutative operations */ if(!ompi_op_is_commute(op)){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this operation. Fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; ompi_communicator_t *up_comm = @@ -289,7 +347,7 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: low comm reduce failed. 
" "Falling back to another component\n")); - goto prev_reduce_intra_simple; + goto prev_reduce_intra; } /* Up_comm reduce */ @@ -315,10 +373,9 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, } return OMPI_SUCCESS; -prev_reduce_intra_simple: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, - comm, - han_module->previous_reduce_module); + comm, han_module->previous_reduce_module); } @@ -341,15 +398,14 @@ mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { if (0 == w_rank) { opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:reduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_reduce_module = fallback_module; han_module->reproducible_reduce = fallback_module->coll_reduce; diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index b2a8752938..c52cc1911a 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -15,96 +15,105 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_scatter_us_task(void *task_args); +static int mca_coll_han_scatter_ls_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct 
ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->sbuf_reorder_free = sbuf_reorder_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->sbuf_reorder_free = sbuf_reorder_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->req = req; } int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i, j; - int w_rank, w_size; + mca_coll_han_module_t *han_module = 
(mca_coll_han_module_t *) module; + int i, j, w_rank, w_size; w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - int *topo = mca_coll_han_topo_init(comm, han_module, 2); - /* Topo must be initialized to know rank distribution which then is used to - * determine if han can be used */ - mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle scatter with this communicator. It needs to fall back on another component\n")); - goto prev_scatter_intra; + "han cannot handle scatter with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); + } + + /* Topo must be initialized to know rank distribution which then is used to + * determine if han can be used */ + int* topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); int up_size = ompi_comm_size(up_comm); - ompi_request_t *temp_request = NULL; /* Set up request */ - temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); + ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank; int root_up_rank; - mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, @@ -149,42 +158,55 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, } } + + void *dest_buf = rbuf; + int dest_count = rcount; + ompi_datatype_t *dest_dtype = rdtype; + if (MPI_IN_PLACE == rbuf) { + dest_buf = (void*)sbuf; + dest_count = 
scount; + dest_dtype = sdtype; + } + /* Create us task */ mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); /* Setup us task arguments */ - mca_scatter_argu_t *us_argu = malloc(sizeof(mca_scatter_argu_t)); - mac_coll_han_set_scatter_argu(us_argu, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, - (char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, + mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t)); + mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, + (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, temp_request); /* Init us task */ - init_task(us, mca_coll_han_scatter_us_task, (void *) (us_argu)); + init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args)); /* Issure us task */ issue_task(us); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; -prev_scatter_intra: - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module->previous_scatter_module); } /* us: upper level (intra-node) scatter task */ -int mca_coll_han_scatter_us_task(void *task_argu) +int mca_coll_han_scatter_us_task(void *task_args) { - mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", t->w_rank)); } else { + size_t count; + ompi_datatype_t *dtype; + if (t->w_rank == t->root) { + dtype = t->sdtype; + count = t->scount; + } else { + dtype = t->rdtype; + count = t->rcount; + } int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap); char *tmp_buf = (char 
*) malloc(rsize); char *tmp_rbuf = tmp_buf - rgap; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, @@ -202,9 +224,7 @@ int mca_coll_han_scatter_us_task(void *task_argu) t->sbuf_reorder_free = NULL; } /* Create ls tasks for the current union segment */ - mca_coll_task_t *ls = OBJ_NEW(mca_coll_task_t); - /* Setup up ls task arguments */ - t->cur_task = ls; + mca_coll_task_t *ls = t->cur_task; /* Init ls task */ init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); /* Issure ls task */ @@ -213,14 +233,14 @@ int mca_coll_han_scatter_us_task(void *task_argu) return OMPI_SUCCESS; } -/* ls: lower level (shared memory) scatter task */ -int mca_coll_han_scatter_ls_task(void *task_argu) +/* ls: lower level (shared memory or intra-node) scatter task */ +int mca_coll_han_scatter_ls_task(void *task_args) { - mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", t->w_rank)); OBJ_RELEASE(t->cur_task); - /* Shared memory scatter */ + t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, t->rcount, t->rdtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_scatter_module); diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index e99f3e614b..bf5b4df523 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -26,157 +26,100 @@ #include "coll_han.h" #include "coll_han_dynamic.h" +#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \ + (FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + } while(0) -/* - * Local functions - */ -static void 
create_intranode_comm_new(ompi_communicator_t *, - ompi_communicator_t **); -static void create_internode_comm_new(ompi_communicator_t *, - int, int, - ompi_communicator_t **); -static void create_intranode_comm(ompi_communicator_t *, - const char *, - int, - ompi_communicator_t **); -static void create_internode_comm(ompi_communicator_t *, - const char *, - int, int, - ompi_communicator_t **); - -/** - * Create a sub-communicator containing the ranks that share my node. - * - * @param comm (IN) original communicator for the collective - * target module priority - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm_new(ompi_communicator_t *comm, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * - * @param comm (IN) original communicator for the collective - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm_new(ompi_communicator_t *comm, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - return; -} +#define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \ + } while(0) /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. 
* comm: input communicator of the collective */ -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; + int low_rank, low_size, up_rank, w_rank, w_size; ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); - const int *origin_priority; - int han_var_id; - int tmp_han_priority; + mca_coll_han_collectives_fallback_t fallbacks; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; - - mca_coll_base_module_bcast_fn_t old_bcast; - mca_coll_base_module_t *old_bcast_module; - - mca_coll_base_module_gather_fn_t old_gather; - mca_coll_base_module_t *old_gather_module; - - mca_coll_base_module_reduce_fn_t old_reduce; - mca_coll_base_module_t *old_reduce_module; + opal_info_t comm_info; /* The sub communicators have already been created */ - if (NULL != han_module->sub_comm[INTRA_NODE] + if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE] && NULL != han_module->sub_comm[INTER_NODE] && NULL != han_module->cached_vranks) { - return; + return OMPI_SUCCESS; } /* - * We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
* * Allgather is used to compute vranks * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new * Reduce + Bcast may be called by the allreduce implementation * Gather + Bcast may be called by the allgather implementation */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. 
+ */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } - old_reduce = comm->c_coll->coll_reduce; - old_reduce_module = comm->c_coll->coll_reduce_module; - - old_bcast = comm->c_coll->coll_bcast; - old_bcast_module = comm->c_coll->coll_bcast_module; - - old_gather = comm->c_coll->coll_gather; - old_gather_module = comm->c_coll->coll_gather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; - - comm->c_coll->coll_reduce = han_module->previous_reduce; - comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; - - comm->c_coll->coll_bcast = han_module->previous_bcast; - comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; - - comm->c_coll->coll_gather = han_module->previous_gather; - comm->c_coll->coll_gather_module = han_module->previous_gather_module; + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* Create topological sub-communicators */ w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - 
origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - - /* - * Maximum priority for selector on sub-communicators - */ - tmp_han_priority = 100; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* * This sub-communicator contains the ranks that share my node. */ - mca_coll_han_component.topo_level = INTRA_NODE; - create_intranode_comm_new(comm, low_comm); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "han"); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, low_comm); /* * Get my local rank and the local size @@ -188,8 +131,8 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - mca_coll_han_component.topo_level = INTER_NODE; - create_internode_comm_new(comm, w_rank, low_rank, up_comm); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false); up_rank = ompi_comm_rank(*up_comm); @@ -208,216 +151,116 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * vrank */ comm->c_coll->coll_allgather(&vrank, - 1, - MPI_INT, - vranks, - 1, - MPI_INT, - comm, - comm->c_coll->coll_allgather_module); + 1, + MPI_INT, + vranks, + 1, + MPI_INT, + comm, + comm->c_coll->coll_allgather_module); /* * Set the cached info */ han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); + /* Reset the saved collectives to point back to HAN */ + 
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); - /* Put allreduce, allgather, reduce and bcast back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; - - comm->c_coll->coll_reduce = old_reduce; - comm->c_coll->coll_reduce_module = old_reduce_module; - - comm->c_coll->coll_bcast = old_bcast; - comm->c_coll->coll_bcast_module = old_bcast_module; - - comm->c_coll->coll_gather = old_gather; - comm->c_coll->coll_gather_module = old_gather_module; - - mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } -/** - * Create a sub-communicator containing the ranks that share my node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . sm - * . 
shared - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", - my_rank, prio_string, *sav_priority)); - - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this - * sub-communicator will inherit it as a collective module. - */ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - /* - * Come back to the target module's original priority - */ - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . libnbc - * . 
adapt - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", my_rank, prio_string, - *sav_priority)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this - * sub-communicator will inherit it as a collective module. - */ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - - /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. 
* comm: input communicator of the collective */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; + int low_rank, low_size, up_rank, w_rank, w_size; + mca_coll_han_collectives_fallback_t fallbacks; ompi_communicator_t **low_comms; ompi_communicator_t **up_comms; - const int *origin_priority; - int han_var_id; - int tmp_han_priority; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; + opal_info_t comm_info; /* use cached communicators if possible */ - if (han_module->cached_comm == comm && - han_module->cached_low_comms != NULL && - han_module->cached_up_comms != NULL && - han_module->cached_vranks != NULL) { - return; + if (han_module->enabled && han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { + return OMPI_SUCCESS; } - /* We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; + /* + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
+ * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. 
+ */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } /* create communicators if there is no cached communicator */ - w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * sizeof(struct ompi_communicator_t *)); up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * sizeof(struct ompi_communicator_t *)); - origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - /* - * Lower down our current priority - */ - tmp_han_priority = 0; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* * Upgrade sm module priority to set up low_comms[0] with sm module * This sub-communicator contains the ranks that share my node. 
*/ - create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[0])); /* * Get my local rank and the local size @@ -429,15 +272,17 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade shared module priority to set up low_comms[1] with shared module * This sub-communicator contains the ranks that share my node. */ - create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[1])); /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, - &(up_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); up_rank = ompi_comm_rank(up_comms[0]); @@ -445,8 +290,8 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade adapt module priority to set up up_comms[0] with adapt module * This sub-communicator contains one process per node. */ - create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, - &(up_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false); /* * Set my virtual rank number. 
@@ -468,23 +313,21 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, /* * Set the cached info */ - han_module->cached_comm = comm; han_module->cached_low_comms = low_comms; han_module->cached_up_comms = up_comms; han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); - /* Put allreduce and allgather back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c index cbcfd698d0..e25e37207e 100644 --- a/ompi/mca/coll/han/coll_han_topo.c +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -35,244 +35,24 @@ #include "coll_han.h" -/* - * Local functions - */ - -static int mca_coll_han_hostname_to_number(char* hostname, int size); -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level); -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level); -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level); -static void mca_coll_han_topo_print(int *topo, - struct 
ompi_communicator_t *comm, - int num_topo_level); - - -/* - * takes the number part of a host: hhh2031 -->2031 - */ -static int mca_coll_han_hostname_to_number(char* hostname, int size) +#if OPAL_ENABLE_DEBUG +static void +mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) { - int i, j; - char *number_array = (char *)malloc(sizeof(char) * size); - int number = 0; - - for (i = 0, j = 0; hostname[i] != '\0'; i++) { - if ('0' <= hostname[i] && '9' >= hostname[i]) { - number_array[j++] = hostname[i]; - } - } - number_array[j] = '\0'; - number = atoi(number_array); - free(number_array); - return number; -} - -/* - * Set the virtual topo id. It is made of num_topo_level ints (2 today): - * . the integer part of the host id - * . the rank in the main communicator - * Gather the virtual topoid from each process so every process will know other - * processes virtual topids - */ -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level) -{ - int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); - char hostname[1024]; - - gethostname(hostname, 1024); - self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); - self_topo[1] = ompi_comm_rank(comm); - - ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, - topo, num_topo_level, MPI_INT, comm, - comm->c_coll->coll_allgather_module); - free(self_topo); - - return; -} - -/* - * Sort the topology array in order to have ranks sharing the same node - * contiguous in the topology array. - * Called from topo_init whenever the processes are not mapped by core. 
- * ex: 4 ranks executing on 2 nodes, mapped by node - * ranks 0 and 2 on hid0 - * ranks 1 and 3 on hid1 - * On entry the topo array looks like - * hid0 0 hid1 1 hid0 2 hid1 3 - * After the sort: - * hid0 0 hid0 2 hid1 1 hid1 3 - * This is to have the gather result in the right order - * - * @param topo (IN/OUT) topology description array (sorted in out) - * @param start (IN) where to begin the processing - * The index in topo will actually be: - * start * num_topo_level + level - * topo contains num_topo_level ids per rank. - * @param end (IN) where to stop the processing - * The index in topo will actually be: - * end * num_topo_level + level - * topo contains num_topo_level ids per rank. - * @param level (IN) level number we are currently processing - * @param num_topo_level (IN) number of topological levels - * - */ -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level) -{ - int i, j; - int min, min_loc; - int last, new_start, new_end; - - if (level > num_topo_level-1 || start >= end) { - return; - } - - min = INT_MAX; - min_loc = -1; - for (i = start; i <= end; i++) { - int temp; - /* get the min value for current level and its location */ - for (j = i; j <= end; j++) { - /* topo contains num_topo_level ids per rank. 
*/ - if (topo[j * num_topo_level + level] < min) { - min = topo[j*num_topo_level+level]; - min_loc = j; - - } - } - /* - * swap i and min_loc - * We have num_topo_level ids to swap - */ - for (j = 0; j < num_topo_level; j++) { - temp = topo[i * num_topo_level + j]; - topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; - topo[min_loc * num_topo_level + j] = temp; - } - min = INT_MAX; - min_loc = -1; - } - - /* Process next level */ - last = 0; - new_start = 0; - new_end = 0; - for (i = start; i <= end; i++) { - if (i == start) { - last = topo[i * num_topo_level + level]; - new_start = start; - } else if (i == end) { - new_end = end; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - } else if (last != topo[i * num_topo_level + level]) { - new_end = i - 1; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - new_start = i; - last = topo[i * num_topo_level + level]; - } - } - return; -} - -/* - * Check whether the ranks in the communicator given as input are mapped by core - * Mapped by core: each node is first filled with as many ranks as needed before - * moving to the next one - * This is checked as follows: - * . 2 contiguous ranks should be either on the same node or on node ids in - * ascending order - * The topology is actually an array of ints: - * +----------+-------+----------+-------+------+----------+-------+-----+ - * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... 
| - * +----------+-------+----------+-------+------+----------+-------+-----+ - */ -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int i; + int rank = ompi_comm_rank(comm); int size = ompi_comm_size(comm); - for (i = 1; i < size; i++) { - /* - * The host id for a given rank should be < host id for the next rank - */ - if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) { - return false; - } - /* - * For the same host id, consecutive ranks should be sorted in - * ascending order. - */ - if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { - return false; + if (rank == 0) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank)); + for( int i = 0; i < size*num_topo_level; i++ ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); } - return true; } - -/* The topo is supposed sorted by host */ -static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level){ - int i; - int size = ompi_comm_size(comm); - if (size < 2){ - return false; - } - int ppn; - int last_host = topo[0]; - - /* Find the ppn for the first node */ - for (i = 1; i < size; i++) { - if (topo[i * num_topo_level] != last_host){ - break; - } - } - ppn = i; - - /* All on one node */ - if ( size == ppn){ - return false; - } - /* Trivial case */ - if (size % ppn != 0){ - return true; - } - - last_host = topo[ppn * num_topo_level]; - /* Check that the 2nd and next hosts also this ppn. 
Since the topo is sorted - * one just need to jump ppn ranks to check the supposed switch of host */ - for (i = 2 * ppn; i < size; i += ppn ){ - /* the list of ranks for the last known host have ended before */ - if (topo[(i-1) * num_topo_level] != last_host){ - return true; - } - /* the list of ranks for the last known host are bigger than excpected */ - if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){ - return true; - } - last_host = topo[i * num_topo_level]; - } - /* Check the last host */ - if (topo[(size-1) * num_topo_level] != last_host){ - return true; - } - - return false; -} - +#endif /* OPAL_ENABLE_DEBUG */ /** * Topology initialization phase @@ -280,68 +60,136 @@ static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, * * @param num_topo_level (IN) Number of the topological levels */ -int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module, - int num_topo_level) +int* +mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) { - int size; - int *topo; - - size = ompi_comm_size(comm); - - if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) { - if (han_module->cached_topo) { - free(han_module->cached_topo); - han_module->cached_topo = NULL; - } - - topo = (int *)malloc(sizeof(int) * size * num_topo_level); - - /* get topo infomation */ - mca_coll_han_topo_get(topo, comm, num_topo_level); - mca_coll_han_topo_print(topo, comm, num_topo_level); - - /* - * All the ranks now have the topo information - */ - - /* check if the processes are mapped by core */ - han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); - - /* - * If not, sort the topo such that each group of ids is sorted by rank - * i.e. ids for rank i are contiguous to ids for rank i+1. 
- * This will be needed for the operations that are order sensitive - * (like gather) - */ - if (!han_module->is_mapbycore) { - mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level); - } - han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level); - han_module->cached_topo = topo; - han_module->cached_comm = comm; - } else { - topo = han_module->cached_topo; + if ( NULL != han_module->cached_topo ) { + return han_module->cached_topo; } + ompi_communicator_t *up_comm, *low_comm; + ompi_request_t *request = MPI_REQUEST_NULL; + int *my_low_rank_map = NULL; + int *ranks_map = NULL; + + int size = ompi_comm_size(comm); + + if (NULL != han_module->cached_up_comms) { + up_comm = han_module->cached_up_comms[0]; + low_comm = han_module->cached_low_comms[0]; + } else { + up_comm = han_module->sub_comm[INTER_NODE]; + low_comm = han_module->sub_comm[INTRA_NODE]; + } + assert(up_comm != NULL && low_comm != NULL); + + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + + int *topo = (int *)malloc(sizeof(int) * size * num_topo_level); + int is_imbalanced = 1; + int ranks_consecutive = 1; + + /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */ + if (0 == low_rank) { + my_low_rank_map = malloc(sizeof(int)*low_size); + for (int i = 0; i < low_size; ++i) { + topo[i] = i; + } + ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo, + comm->c_local_group, my_low_rank_map); + /* check if ranks are consecutive */ + int rank = my_low_rank_map[0] + 1; + for (int i = 1; i < low_size; ++i, ++rank) { + if (my_low_rank_map[i] != rank) { + ranks_consecutive = 0; + break; + } + } + + int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size}; + + up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4, + MPI_INT, MPI_MAX, up_comm, + up_comm->c_coll->coll_allreduce_module); + + /* is the distribution of processes 
balanced per node? */ + is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1; + ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 1 : 0; + + if ( !ranks_consecutive && !is_imbalanced ) { + /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */ + ranks_map = malloc(sizeof(int)*size); + up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT, + ranks_map, low_size, MPI_INT, up_comm, &request, + up_comm->c_coll->coll_iallgather_module); + } + } + + + /* broadcast balanced and consecutive properties from node leaders to remaining ranks */ + int bcast_vals[] = {is_imbalanced, ranks_consecutive}; + low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + is_imbalanced = bcast_vals[0]; + ranks_consecutive = bcast_vals[1]; + + /* error out if the rank distribution is not balanced */ + if (is_imbalanced) { + assert(MPI_REQUEST_NULL == request); + han_module->are_ppn_imbalanced = true; + free(topo); + if( NULL != my_low_rank_map ) free(my_low_rank_map); + if( NULL != ranks_map ) free(ranks_map); + return NULL; + } + + han_module->are_ppn_imbalanced = false; + + if (ranks_consecutive) { + /* fast-path: all ranks are consecutive and balanced so fill topology locally */ + for (int i = 0; i < size; ++i) { + topo[2*i] = (i/low_size); // node leader is node ID + topo[2*i+1] = i; + } + han_module->is_mapbycore = true; + } else { + /* + * Slow path: gather global-to-node-local rank mappings at node leaders + * + * The topology will contain a mapping from global consecutive positions + * to ranks in the communicator. 
+ * + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 + */ + if (0 == low_rank) { + ompi_request_wait(&request, MPI_STATUS_IGNORE); + /* fill topology */ + for (int i = 0; i < size; ++i) { + topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID + topo[2*i+1] = ranks_map[i]; + } + free(ranks_map); + } + } + + /* broadcast topology from node leaders to remaining ranks */ + low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + free(my_low_rank_map); + han_module->cached_topo = topo; +#if OPAL_ENABLE_DEBUG mca_coll_han_topo_print(topo, comm, num_topo_level); +#endif /* OPAL_ENABLE_DEBUG */ + return topo; } -static void mca_coll_han_topo_print(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - if (rank == 0) { - int i; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", rank)); - for (i=0; ifunc_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } static void mca_coll_task_destructor(mca_coll_task_t * t) { t->func_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor, mca_coll_task_destructor); - -/* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu) -{ - t->func_ptr = func_ptr; - t->func_argu = func_argu; - return OMPI_SUCCESS; -} - -/* Issue the task */ -int issue_task(mca_coll_task_t * t) -{ - t->func_ptr(t->func_argu); - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h index c7314d25fb..413e393be6 100644 --- a/ompi/mca/coll/han/coll_han_trigger.h +++ b/ompi/mca/coll/han/coll_han_trigger.h 
@@ -12,25 +12,17 @@ #ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H #define MCA_COLL_HAN_TRIGGER_EXPORT_H -#include "ompi_config.h" -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" #include "ompi/communicator/communicator.h" -#include "ompi/win/win.h" -#include "ompi/mca/coll/base/coll_base_functions.h" -#include "opal/util/info.h" #include "ompi/op/op.h" -#include "opal/runtime/opal_progress.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/datatype/ompi_datatype.h" + typedef int (*task_func_ptr) (void *); struct mca_coll_task_s { opal_object_t super; task_func_ptr func_ptr; - void *func_argu; + void *func_args; }; typedef struct mca_coll_task_s mca_coll_task_t; @@ -38,9 +30,20 @@ typedef struct mca_coll_task_s mca_coll_task_t; OBJ_CLASS_DECLARATION(mca_coll_task_t); /* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu); +static inline int +init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) +{ + OBJ_CONSTRUCT(t, mca_coll_task_t); + t->func_ptr = func_ptr; + t->func_args = func_args; + return OMPI_SUCCESS; +} /* Issue the task */ -int issue_task(mca_coll_task_t * t); +static inline int +issue_task(mca_coll_task_t * t) +{ + return t->func_ptr(t->func_args); +} -#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ +#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_utils.c b/ompi/mca/coll/han/coll_han_utils.c deleted file mode 100644 index 293777a256..0000000000 --- a/ompi/mca/coll/han/coll_han_utils.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2020 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "coll_han.h" - -/* Get root's low_rank and up_rank from vranks array */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank) -{ - *root_up_rank = vranks[root] / low_size; - *root_low_rank = vranks[root] % low_size; -} - -uint32_t han_auto_tuned_get_n(uint32_t n) -{ - uint32_t avail[5] = { 4, 8, 16, 32, 64 }; - uint32_t i; - for (i = 0; i < 5; i++) { - if (avail[i] >= n) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_c(uint32_t c) -{ - uint32_t avail[3] = { 4, 8, 12 }; - uint32_t i; - for (i = 0; i < 3; i++) { - if (avail[i] >= c) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_m(uint32_t m) -{ - uint32_t avail[21] = - { 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, -262144, 524288, 1048576, 2097152, 4194304 }; - uint32_t i; - for (i = 0; i < 21; i++) { - if (avail[i] >= m) { - return i; - } - } - return i - 1; -} diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 781215251e..25e9c77946 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -176,7 +176,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority) if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); - return NULL; + return NULL; } /* Get the priority level attached to this module. 
If priority is less diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index cc73fcf835..637122185e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount, communicator_size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - if (rank == root) { + /* Determine block size */ + if ( (rank == root) || (MPI_IN_PLACE == sbuf) ) { ompi_datatype_type_size(rdtype, &dsize); total_dsize = dsize * (ptrdiff_t)rcount; } else { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index 098a4fa949..a259c789ac 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -40,7 +40,7 @@ static int fileline=0; /* used for verbose error messages */ -#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) +#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) /* * Reads a rule file called fname @@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { + long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; FILE *fptr = (FILE*) NULL; - int X, CI, NCS, CS, ALG, NMS, FANINOUT; - long MS, SS; int x, ncs, nms; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - X = (int)getnext(fptr); - if (X<0) { + if( (getnext(fptr, &X) < 0) || (X < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); goto on_file_error; } for (x=0;x=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. 
Error around line %d\n", CI, n_collectives, fileline)); goto on_file_error; } if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); alg_p = &alg_rules[CI]; alg_p->alg_rule_id = CI; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - NCS = (int)getnext (fptr); - if (NCS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline)); + if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); alg_p->n_com_sizes = NCS; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); if (NULL == alg_p->com_rules) { @@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** com_p = &(alg_p->com_rules[ncs]); - CS = (int)getnext (fptr); - if (CS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } com_p->mpi_comsize = CS; - NMS = (int)getnext (fptr); - if (NMS<0) { - 
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n", + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", NMS, CI, CS)); com_p->n_msg_sizes = NMS; com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); @@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = &(com_p->msg_rules[nms]); - MS = getnext (fptr); - if (MS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->msg_size = (size_t)MS; - ALG = (int)getnext (fptr); - if (ALG<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; - FANINOUT = (int)getnext (fptr); - if (FANINOUT<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line 
%d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - SS = getnext (fptr); - if (SS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_segsize = SS; if (!nms && MS) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); goto on_file_error; } @@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); } /* per collective */ diff --git a/ompi/request/request.c b/ompi/request/request.c index a8ddb68ad3..abf33449d8 100644 --- a/ompi/request/request.c +++ b/ompi/request/request.c @@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req) /* don't call _INIT, we don't to set the request to _INACTIVE and there will * be no matching _FINI invocation */ req->req_state = 
OMPI_REQUEST_INVALID; - req->req_complete = false; + req->req_complete = REQUEST_COMPLETED; req->req_persistent = false; req->req_start = NULL; req->req_free = NULL; From cc6432b4a2881ee89eaa52ea862311d0244f0a6f Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 24 Aug 2020 17:48:29 -0400 Subject: [PATCH 4/4] Fix partial packing of non data elements. There was a bug allowing for partial packing of non-data elements (such as loop and end_loop markers) during the exit condition of a pack/unpack call. This has basically no meaning. Prevent this bug from happening by making sure the element point to a data before trying to partially pack it. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_unpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index efed62451a..6f9fdce277 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -380,7 +380,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } complete_loop: assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { + if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) { unsigned char* temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. @@ -391,7 +391,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, 0, iov_len_local, &temp ); - + pConvertor->partial_length = iov_len_local; iov_len_local = 0; }