diff --git a/ompi/mca/bcol/Makefile.am b/ompi/mca/bcol/Makefile.am deleted file mode 100644 index 06c2ef5770..0000000000 --- a/ompi/mca/bcol/Makefile.am +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# main library setup -noinst_LTLIBRARIES = libmca_bcol.la -libmca_bcol_la_SOURCES = - -# header setup -nobase_ompi_HEADERS = -nobase_nodist_ompi_HEADERS = - -# local files -headers = bcol.h -libmca_bcol_la_SOURCES += $(headers) $(nodist_headers) - -# Conditionally install the header files -if WANT_INSTALL_HEADERS -nobase_ompi_HEADERS += $(headers) -nobase_nodist_ompi_HEADERS += $(nodist_headers) -ompidir = $(ompiincludedir)/ompi/mca/bcol -else -ompidir = $(includedir) -endif - -include base/Makefile.am - -distclean-local: - rm -f base/static-components.h diff --git a/ompi/mca/bcol/base/Makefile.am b/ompi/mca/bcol/base/Makefile.am deleted file mode 100644 index 929bef0f5b..0000000000 --- a/ompi/mca/bcol/base/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - - -headers += \ - base/base.h -libmca_bcol_la_SOURCES += \ - base/bcol_base_frame.c \ - base/bcol_base_init.c diff --git a/ompi/mca/bcol/base/base.h b/ompi/mca/bcol/base/base.h deleted file mode 100644 index b95bea398b..0000000000 --- a/ompi/mca/bcol/base/base.h +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_BASE_H -#define MCA_BCOL_BASE_H - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "opal/class/opal_list.h" -#include "ompi/mca/bcol/bcol.h" - -/* - * Global functions for BCOL - */ - -BEGIN_C_DECLS - -OMPI_DECLSPEC extern opal_list_t mca_bcol_base_components_in_use; -OMPI_DECLSPEC extern char *ompi_bcol_bcols_string; - -OMPI_DECLSPEC extern mca_base_framework_t ompi_bcol_base_framework; - -OMPI_DECLSPEC int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads); - -struct mca_bcol_base_module_t; -OMPI_DECLSPEC int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module); - -OMPI_DECLSPEC int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module); - -OMPI_DECLSPEC int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module); - -OMPI_DECLSPEC int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module, - mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs, - mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs, - mca_bcol_base_module_collective_fn_primitives_t bcol_fn, - mca_bcol_base_module_collective_fn_primitives_t progress_fn); - -END_C_DECLS - -#endif /* MCA_BCOL_BASE_H */ diff --git a/ompi/mca/bcol/base/bcol_base_frame.c b/ompi/mca/bcol/base/bcol_base_frame.c deleted file mode 100644 index e7b6d68d26..0000000000 --- a/ompi/mca/bcol/base/bcol_base_frame.c +++ /dev/null @@ -1,374 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. 
All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - - -#include "ompi_config.h" -#include <stdio.h> - -#ifdef HAVE_UNISTD_H -#include <unistd.h> -#endif /* HAVE_UNISTD_H */ -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/util/argv.h" - -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/include/ompi/constants.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/class/opal_list.h" -/* - * The following file was created by configure. It contains extern - * statements and the definition of an array of pointers to each - * component's public mca_base_component_t struct. - */ - -#include "ompi/mca/bcol/base/static-components.h" - -static int mca_bcol_base_open(mca_base_open_flag_t flags); -static int mca_bcol_base_close (void); -static int mca_bcol_base_register(mca_base_register_flag_t flags); - -/* -** * Global variables -** */ -MCA_BASE_FRAMEWORK_DECLARE(ompi, bcol, NULL, mca_bcol_base_register, mca_bcol_base_open, mca_bcol_base_close, - mca_bcol_base_static_components, 0); - -OMPI_DECLSPEC opal_list_t mca_bcol_base_components_in_use = {{0}}; -OMPI_DECLSPEC char *ompi_bcol_bcols_string = NULL; -OMPI_DECLSPEC int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE] = {{0}}; -OMPI_DECLSPEC int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE] = {{0}}; - -static void bcol_base_module_constructor(mca_bcol_base_module_t *module) -{ - int fnc; - - module->bcol_component = NULL; - module->network_context = NULL; - module->context_index = -1; - module->supported_mode = 0; - module->init_module = NULL; - module->sbgp_partner_module = NULL; - module->squence_number_offset = 0; - module->n_poll_loops = 0; - - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - module->bcol_function_table[fnc] = NULL; - module->small_message_thresholds[fnc] = BCOL_THRESHOLD_UNLIMITED; - } - - module->set_small_msg_thresholds = NULL; - - module->header_size = 0; - module->bcol_memory_init = NULL; - - module->next_inorder = NULL; - - mca_bcol_base_fn_table_construct(module); -} - -static void bcol_base_module_destructor(mca_bcol_base_module_t *module) -{ - int fnc; - - module->bcol_component = NULL; - - module->context_index = -1; - module->init_module = NULL; - module->sbgp_partner_module = NULL; - module->squence_number_offset = 0; - module->n_poll_loops = 0; - - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - module->bcol_function_table[fnc] = NULL; - } - - module->bcol_memory_init = NULL; -} - -OBJ_CLASS_INSTANCE(mca_bcol_base_module_t, - opal_object_t, - bcol_base_module_constructor, - bcol_base_module_destructor); - -static void bcol_base_network_context_constructor(bcol_base_network_context_t *nc) -{ - nc->context_id = -1; - nc->context_data = NULL; -} - -static void bcol_base_network_context_destructor(bcol_base_network_context_t *nc) -{ - nc->context_id = -1; - nc->context_data = NULL; - nc->register_memory_fn = NULL; - nc->deregister_memory_fn = NULL; -} - -OBJ_CLASS_INSTANCE(bcol_base_network_context_t, - opal_object_t, - bcol_base_network_context_constructor, - bcol_base_network_context_destructor); - -/* get list of subgrouping components to use */ -static int
mca_bcol_base_set_components_to_use(opal_list_t *bcol_components_avail, - opal_list_t *bcol_components_in_use) -{ - /* local variables */ - const mca_base_component_t *b_component; - - mca_base_component_list_item_t *b_cli; - mca_base_component_list_item_t *b_clj; - - char **bcols_requested; - const char *b_component_name; - - /* split the requst for the bcol modules */ - bcols_requested = opal_argv_split(ompi_bcol_bcols_string, ','); - if (NULL == bcols_requested) { - return OMPI_ERROR; - } - - /* Initialize list */ - OBJ_CONSTRUCT(bcol_components_in_use, opal_list_t); - - /* figure out basic collective modules to use */ - /* loop over list of components requested */ - for (int i = 0 ; bcols_requested[i] ; ++i) { - /* loop over discovered components */ - OPAL_LIST_FOREACH(b_cli, bcol_components_avail, mca_base_component_list_item_t) { - b_component = b_cli->cli_component; - b_component_name = b_component->mca_component_name; - - if (0 == strcmp (b_component_name, bcols_requested[i])) { - /* found selected component */ - b_clj = OBJ_NEW(mca_base_component_list_item_t); - if (NULL == b_clj) { - opal_argv_free (bcols_requested); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - b_clj->cli_component = b_component; - opal_list_append(bcol_components_in_use, - (opal_list_item_t *) b_clj); - break; - } /* end check for bcol component */ - } - } - - /* Note: Need to add error checking to make sure all requested functions - ** were found */ - - /* - ** release resources - ** */ - - opal_argv_free (bcols_requested); - - return OMPI_SUCCESS; -} - -static int mca_bcol_base_register(mca_base_register_flag_t flags) -{ - /* figure out which bcol and sbgp components will actually be used */ - /* get list of sub-grouping functions to use */ - ompi_bcol_bcols_string = "basesmuma,basesmuma,iboffload,ptpcoll,ugni"; - (void) mca_base_var_register("ompi", "bcol", "base", "string", - "Default set of basic collective components to use", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_bcol_bcols_string); - - return OMPI_SUCCESS; -} - -/** - * Function for finding and opening either all MCA components, or the one - * that was specifically requested via a MCA parameter. 
- */ -static int mca_bcol_base_open(mca_base_open_flag_t flags) -{ - int ret; - - /* Open up all available components */ - if (OMPI_SUCCESS != - (ret = mca_base_framework_components_open(&ompi_bcol_base_framework, flags))) { - return ret; - } - - ret = mca_bcol_base_set_components_to_use(&ompi_bcol_base_framework.framework_components, - &mca_bcol_base_components_in_use); - if (OMPI_SUCCESS != ret) { - return ret; - } - - /* memory registration compatibilities */ - bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_UMA]=1; - bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_SOCKET]=1; - bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_POINT_TO_POINT]=1; - bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_IB_OFFLOAD]=1; - bcol_mpool_compatibility[BCOL_SHARED_MEMORY_SOCKET][BCOL_SHARED_MEMORY_UMA]=1; - bcol_mpool_compatibility[BCOL_POINT_TO_POINT] [BCOL_SHARED_MEMORY_UMA]=1; - bcol_mpool_compatibility[BCOL_IB_OFFLOAD] [BCOL_SHARED_MEMORY_UMA]=1; - - return OMPI_SUCCESS; -} - -static int mca_bcol_base_close (void) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first (&mca_bcol_base_components_in_use))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&mca_bcol_base_components_in_use); - - return mca_base_framework_components_close(&ompi_bcol_base_framework, NULL); -} - -/* - * Prototype implementation of selection logic - */ -int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module){ - - int bcol_fn; - /* Call all init functions */ - - /* Create a function table */ - for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){ - /* Create a list object for each bcol type list */ - OBJ_CONSTRUCT(&(bcol_module->bcol_fns_table[bcol_fn]), opal_list_t); - } - - return OMPI_SUCCESS; -} - -int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module){ - - int bcol_fn; - - for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){ - /* gvm FIX: Go through the list and destroy each item */ - /* Destroy the function table object for each bcol type list */ - OBJ_DESTRUCT(&(bcol_module->bcol_fns_table[bcol_fn])); - } - - return OMPI_SUCCESS; -} - -int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module, - mca_bcol_base_coll_fn_comm_attributes_t *arg_comm_attribs, - mca_bcol_base_coll_fn_invoke_attributes_t *arg_inv_attribs, - mca_bcol_base_module_collective_fn_primitives_t bcol_fn, - mca_bcol_base_module_collective_fn_primitives_t progress_fn - ) -{ - mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs = NULL; - mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL; - struct mca_bcol_base_coll_fn_desc_t *fn_filtered = NULL; - int coll_type; - - comm_attribs = malloc(sizeof(mca_bcol_base_coll_fn_comm_attributes_t)); - if (NULL == comm_attribs) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - inv_attribs = malloc(sizeof(mca_bcol_base_coll_fn_invoke_attributes_t)); - - if (NULL == inv_attribs) { - free(comm_attribs); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - coll_type = comm_attribs->bcoll_type = arg_comm_attribs->bcoll_type; - comm_attribs->comm_size_min = arg_comm_attribs->comm_size_min; - comm_attribs->comm_size_max = arg_comm_attribs->comm_size_max; - comm_attribs->data_src = arg_comm_attribs->data_src; - comm_attribs->waiting_semantics = arg_comm_attribs->waiting_semantics; - - inv_attribs->bcol_msg_min = arg_inv_attribs->bcol_msg_min; - inv_attribs->bcol_msg_max = arg_inv_attribs->bcol_msg_max ; - inv_attribs->datatype_bitmap = arg_inv_attribs->datatype_bitmap ; - 
inv_attribs->op_types_bitmap = arg_inv_attribs->op_types_bitmap; - - fn_filtered = OBJ_NEW(mca_bcol_base_coll_fn_desc_t); - - fn_filtered->coll_fn = bcol_fn; - fn_filtered->progress_fn = progress_fn; - - fn_filtered->comm_attr = comm_attribs; - fn_filtered->inv_attr = inv_attribs; - - - opal_list_append(&(bcol_module->bcol_fns_table[coll_type]),(opal_list_item_t*)fn_filtered); - - return OMPI_SUCCESS; -} - -int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module){ - - int ret, bcol_init_fn; - - for (bcol_init_fn =0; bcol_init_fn < BCOL_NUM_OF_FUNCTIONS; bcol_init_fn++) { - if (NULL != bcol_module->bcol_function_init_table[bcol_init_fn]) { - ret = (bcol_module->bcol_function_init_table[bcol_init_fn]) (bcol_module); - if (OMPI_SUCCESS != ret) { - return OMPI_ERROR; - } - } - } - - return OMPI_SUCCESS; -} - -static void mca_bcol_base_coll_fn_desc_constructor(mca_bcol_base_coll_fn_desc_t *fn) -{ - fn->comm_attr = NULL; - fn->inv_attr = NULL; -} - -static void mca_bcol_base_coll_fn_desc_destructor(mca_bcol_base_coll_fn_desc_t *fn) -{ - if (fn->comm_attr) { - free(fn->comm_attr); - } - - if (fn->inv_attr) { - free(fn->inv_attr); - } -} - -OBJ_CLASS_INSTANCE(mca_bcol_base_coll_fn_desc_t, - opal_list_item_t, - mca_bcol_base_coll_fn_desc_constructor, - mca_bcol_base_coll_fn_desc_destructor); - -static void lmngr_block_constructor(mca_bcol_base_lmngr_block_t *item) -{ - item->base_addr = NULL; -} - -static void lnmgr_block_destructor(mca_bcol_base_lmngr_block_t *item) -{ - /* I have nothing to do here */ -} -OBJ_CLASS_INSTANCE(mca_bcol_base_lmngr_block_t, - opal_list_item_t, - lmngr_block_constructor, - lnmgr_block_destructor); diff --git a/ompi/mca/bcol/base/bcol_base_init.c b/ompi/mca/bcol/base/bcol_base_init.c deleted file mode 100644 index f6f0360cd9..0000000000 --- a/ompi/mca/bcol/base/bcol_base_init.c +++ /dev/null @@ -1,45 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/include/ompi/constants.h" - -int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads) -{ - mca_bcol_base_component_t *bcol_component; - mca_base_component_list_item_t *cli; - int ret; - - OPAL_LIST_FOREACH(cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) { - bcol_component = (mca_bcol_base_component_t *) cli->cli_component; - - if (false == bcol_component->init_done) { - ret = bcol_component->collm_init_query(true, true); - if (OMPI_SUCCESS != ret) { - return ret; - } - - bcol_component->init_done = true; - } - } - - return OMPI_SUCCESS; -} - - - diff --git a/ompi/mca/bcol/base/owner.txt b/ompi/mca/bcol/base/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/bcol/base/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. 
active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/bcol/basesmuma/Makefile.am b/ompi/mca/bcol/basesmuma/Makefile.am deleted file mode 100644 index 9a9d288f49..0000000000 --- a/ompi/mca/bcol/basesmuma/Makefile.am +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - bcol_basesmuma.h \ - bcol_basesmuma_utils.h \ - bcol_basesmuma_bcast.c \ - bcol_basesmuma_component.c \ - bcol_basesmuma_module.c \ - bcol_basesmuma_buf_mgmt.c \ - bcol_basesmuma_mem_mgmt.c \ - bcol_basesmuma_fanin.c \ - bcol_basesmuma_fanout.c \ - bcol_basesmuma_progress.c \ - bcol_basesmuma_reduce.h \ - bcol_basesmuma_reduce.c \ - bcol_basesmuma_allreduce.c \ - bcol_basesmuma_setup.c \ - bcol_basesmuma_rd_barrier.c \ - bcol_basesmuma_rd_nb_barrier.c \ - bcol_basesmuma_rk_barrier.c \ - bcol_basesmuma_utils.c \ - bcol_basesmuma_bcast_prime.c \ - bcol_basesmuma_lmsg_knomial_bcast.c \ - bcol_basesmuma_lmsg_bcast.c \ - bcol_basesmuma_gather.c \ - bcol_basesmuma_allgather.c \ - bcol_basesmuma_smcm.h \ - bcol_basesmuma_smcm.c - -# Make the output library in this directory, and name it either -# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_bcol_basesmuma_DSO -component_install += mca_bcol_basesmuma.la -else -component_noinst += libmca_bcol_basesmuma.la -endif - -# See ompi/mca/btl/sm/Makefile.am for an explanation of -# libmca_common_sm.la. - -AM_CPPFLAGS = $(btl_portals_CPPFLAGS) - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_bcol_basesmuma_la_SOURCES = $(sources) -mca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS) -mca_bcol_basesmuma_la_LIBADD = \ - $(btl_portals_LIBS) - - -noinst_LTLIBRARIES = $(component_noinst) -libmca_bcol_basesmuma_la_SOURCES =$(sources) -libmca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS) diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma.h deleted file mode 100644 index 7b6c69d2c3..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma.h +++ /dev/null @@ -1,1270 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#ifndef MCA_BCOL_basesmuma_EXPORT_H -#define MCA_BCOL_basesmuma_EXPORT_H - -#include "ompi_config.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" -#include "ompi/proc/proc.h" -#include "ompi/patterns/net/netpatterns.h" - -#include "ompi/mca/mca.h" -#include "opal/util/arch.h" -#include "opal/util/argv.h" -#include "opal/datatype/opal_datatype.h" -#include "opal/util/output.h" - -#include "bcol_basesmuma_smcm.h" -BEGIN_C_DECLS - -struct list_data_t { - opal_list_item_t super; - void *data; -}; -typedef struct list_data_t list_data_t; -OBJ_CLASS_DECLARATION(list_data_t); - -/* - * Macro's for manipulating the 64 bit shared memory control bits. - * The 64 bit field is devided into 4 bit fields - * - * | 48-63: src | 32-47: index | 16-31: flag | 0-15: sequence number | - * - * Only the low 16 bits of the sequence number will be put in the header - * space. We will use the fact that the use of the shared buffers is - * synchronous, and get the upper 48 bits from the local process space. - */ - -#define BASESMUMA_CACHE_LINE_SIZE 128 - -#define SHIFT_UP << -#define SHIFT_DOWN >> - -#define SEQ_WIDTH 16 -#define SEQ_BASE 0 -#define FIELD_SEQ_MASK ( ( 1 SHIFT_UP SEQ_WIDTH ) - 1 ) -#define INPLACE_SEQ_MASK ( (int64_t)FIELD_SEQ_MASK SHIFT_UP SEQ_BASE) - -#define FLAG_WIDTH 16 -#define FLAG_BASE 16 -#define FIELD_FLAG_MASK ( ( 1 SHIFT_UP FLAG_WIDTH ) - 1 ) -#define INPLACE_FLAG_MASK ( (int64_t)FIELD_FLAG_MASK SHIFT_UP FLAG_BASE) - -#define INDX_WIDTH 16 -#define INDX_BASE 32 -#define FIELD_INDX_MASK ( ( 1 SHIFT_UP INDX_WIDTH ) - 1 ) -#define INPLACE_INDX_MASK ( (int64_t)FIELD_INDX_MASK SHIFT_UP INDX_BASE) - -#define SRC_WIDTH 16 -#define SRC_BASE 48 -#define FIELD_SRC_MASK ( ( 1 SHIFT_UP SRC_WIDTH ) - 1 ) -#define INPLACE_SRC_MASK ( (int64_t)FIELD_SRC_MASK SHIFT_UP SRC_BASE) -/*int64_t INPLACE_SRC_MASK= ((int64_t)FIELD_SRC_MASK SHIFT_UP SRC_BASE); */ - - -#define EXTRACT_FLAG(INPUT, OUTPUT, OUTPUT_TYPE, FIELD_BASE, FIELD_MASK) \ - OUTPUT = (OUTPUT_TYPE) ( (INPUT SHIFT_DOWN FIELD_BASE ) & FIELD_MASK ) - -#define STORE_FLAG(INPUT, OUTPUT, INPUT_TYPE, OUTPUT_TYPE, FIELD_BASE, INPLACE_FIELD_MASK ) \ - OUTPUT = \ - ( \ - /* 3 */ \ - ( \ - /* 2 */ \ - ( \ - /* 1 - shift the input field to the proper location */ \ - (OUTPUT_TYPE)( \ - ((OUTPUT_TYPE)((INPUT_TYPE) (INPUT))) \ - SHIFT_UP FIELD_BASE ) \ - /* mask off the extra bits */ \ - & ((OUTPUT_TYPE)INPLACE_FIELD_MASK) \ - ) \ - /* store back to the OUTPUT field, w/o destroying other fields */ \ - ) | OUTPUT \ - ) - -/** - * Structure to hold the basic shared memory bcoll component. 
- */ -struct mca_bcol_basesmuma_component_t { - /** Base coll component */ - mca_bcol_base_component_2_0_0_t super; - - /* management data for collectives with no user data */ - - /** MCA parameter: number of memory banks */ - int basesmuma_num_mem_banks; - - /** MCA parameter: number of regions per memory bank */ - int basesmuma_num_regions_per_bank; - - /** MCA parameter: Number of simultaneous groups supported */ - int n_groups_supported; - - /* management data for collectives with user data (ud) - the memory - * is actually obtained at the ML level - */ - - /** MCA paramenter: number of polling loops to run while waiting - * for children or parent to complete their work - */ - int n_poll_loops; - - /* mpool size */ - size_t mpool_size; - - - /* mpool inited - will use this to test whether or not the - * shared memory has been inited - */ - bool mpool_inited; - - /* shared memory control buffer - the control structures reside - * in shared memory */ - bcol_basesmuma_smcm_mmap_t *sm_ctl_structs; - - /* shared memory payload buffer - */ - bcol_basesmuma_smcm_mmap_t *sm_payload_structs; - - /* - * list of shared memory control structures - */ - opal_list_t ctl_structures; - - - /** opal list in which the list of peers that I am "connected" to is stored - */ - opal_list_t sm_connections_list; - - /* opal list in which the list of payload peers that I am "connected" to - * is stored - */ - opal_list_t sm_payload_connections_list; - - /* - * list of non-blocking admin barriers to progress */ - opal_mutex_t nb_admin_barriers_mutex; - opal_list_t nb_admin_barriers; - - /* - * order of fan-in tree - */ - int radix_fanin; - - /* - * order of fan-out tree - */ - int radix_fanout; - - /* - * Order of read tree - */ - int radix_read_tree; - - /* - * order of reduction fan-out tree - */ - int order_reduction_tree; - - /* - * K-nomial tree radix - */ - int k_nomial_radix; - - /* - * K-ary scatter tree radix - */ - int scatter_kary_radix; - - /* - * number of polling loops - */ - int num_to_probe; - - /* - * Portals addressing info - * void*: because wanted to keep portal library dependencies - * as local as possible - */ - void *portals_info; - bool portals_init; - - /* - * verbosity level - */ - int verbose; - - /* - * control file name base string - */ - char *clt_base_fname; - - /* - * data file name base string - */ - char *payload_base_fname; - - /* - * shared memory scratch space. This is mapped at the end of the - * segement of memory holding the control structures. - */ - char *my_scratch_shared_memory; - - /* - * size of scratch memory - */ - size_t my_scratch_shared_memory_size; - - /* the offset will be the same for all ranks */ - size_t scratch_offset_from_base_ctl_file; -}; - -static inline int mca_bcol_basesmuma_err(const char* fmt, ...) 
-{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - -#if OPAL_ENABLE_DEBUG -#define BASESMUMA_VERBOSE(level, args) \ - do { \ - if(mca_bcol_basesmuma_component.verbose >= level) { \ - mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_basesmuma_err args; \ - mca_bcol_basesmuma_err("\n"); \ - } \ - } while(0) -#else -#define BASESMUMA_VERBOSE(level, args) -#endif - - -/** - * Convenience typedef */ -typedef struct mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component_t; - -#if 0 -/* - * Implemented function index list - */ - -/* barrier */ -enum{ - FANIN_FAN_OUT_BARRIER_FN, - RECURSIVE_DOUBLING_BARRIER_FN, - N_BARRIER_FNS -}; - -/* reduce */ -enum{ - FANIN_REDUCE_FN, - REDUCE_SCATTER_GATHER_FN, - N_REDUCE_FNS -}; -enum{ - SHORT_DATA_FN_REDUCE, - LONG_DATA_FN_REDUCE, - N_REDUCE_FNS_USED -}; - -/* all-reduce */ -enum{ - FANIN_FANOUT_ALLREDUCE_FN, - REDUCE_SCATTER_ALLGATHER_FN, - N_ALLREDUCE_FNS -}; -enum{ - SHORT_DATA_FN_ALLREDUCE, - LONG_DATA_FN_ALLREDUCE, - N_ALLREDUCE_FNS_USED -}; - - -/* enum for node type */ -enum{ - ROOT_NODE, - LEAF_NODE, - INTERIOR_NODE -}; - - -/* - * N-order tree node description - */ -struct tree_node_t { - /* my rank within the group */ - int my_rank; - /* my node type - root, leaf, or interior */ - int my_node_type; - /* number of nodes in the tree */ - int tree_size; - /* number of parents (0/1) */ - int n_parents; - /* number of children */ - int n_children; - /* parent rank within the group */ - int parent_rank; - /* chidren ranks within the group */ - int *children_ranks; -}; -typedef struct tree_node_t tree_node_t; - -/* - * Pair-wise data exchange - */ -/* enum for node type */ -enum{ - EXCHANGE_NODE, - EXTRA_NODE -}; - -struct pair_exchange_node_t { - - /* my rank within the group */ - int my_rank; - - /* number of nodes this node will exchange data with */ - int n_exchanges; - - /* ranks of nodes involved in data exchnge */ - int *rank_exchanges; - - /* number of extra sources of data - outside largest power of 2 in - * this group */ - int n_extra_sources; - - /* rank of the extra source */ - int rank_extra_source; - - /* number of tags needed per stripe */ - int n_tags; - - /* log 2 of largest full power of 2 for this node set */ - int log_2; - - /* largest power of 2 that fits in this group */ - int n_largest_pow_2; - - /* node type */ - int node_type; - -}; -typedef struct pair_exchange_node_t pair_exchange_node_t; -#endif -/* - * descriptor for managing the admin nonblocking barrier routine. - * This is an sm internal routine, and assumes only 1 outstanding - * nb-barrier collective call per block. - */ -/* forward declarations */ -struct mca_bcol_basesmuma_module_t; -struct sm_buffer_mgmt; - -struct sm_nbbar_desc_t { - /* make sure we can put this on a list */ - opal_list_item_t super; - - /* phase of the collective operation - needed to know how to continue - * progressing the nb-barrier */ - int collective_phase; - - /* iteration to continue at */ - int recursive_dbl_iteration; - - /* pointer to the collective module this is associated with */ - struct mca_bcol_basesmuma_module_t *sm_module; - - /* pointer to payload/control structs buffers */ - struct sm_buffer_mgmt *coll_buff; - - /* pool index */ - int pool_index; - - /* pointer to the mca_bcol_base_memory_block_desc_t structure - * that is actually managing this registration. 
- * This is meaningful when these control structures - * are used in conjunction with the user payload - * data that is allocated at the ml level. - */ - void *ml_memory_block_descriptor; - -}; -typedef struct sm_nbbar_desc_t sm_nbbar_desc_t; - -/* - * Barrier request objects - */ - -/* shared memory data strucutures */ -struct mca_bcol_basesmuma_nb_request_process_shared_mem_t { - volatile uint64_t coll_index; - /* flag used to indicate the status of this memory region */ - volatile uint64_t flag; - volatile uint64_t index; - - /* pading */ - /* Note: need to change this so it takes less memory */ - char padding[BASESMUMA_CACHE_LINE_SIZE-3*sizeof(uint64_t)]; -}; - -typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t -mca_bcol_basesmuma_nb_request_process_shared_mem_t; - -/* enum for phase at which the nb barrier is in */ -enum{ - NB_BARRIER_INACTIVE, - - /* fan-in/fan-out */ - NB_BARRIER_FAN_IN, - NB_BARRIER_FAN_OUT, - - /* recursive doubling */ - NB_PRE_PHASE, - NB_RECURSIVE_DOUBLING, - NB_POST_PHASE, - - /* done and not started are the same for all practicle - * purposes, as the init funtion always sets this flag - */ - NB_BARRIER_DONE -}; - - - -/* forward declartion */ -struct mca_bcol_basesmuma_module_t; - -struct mca_basesmuma_ctrl_4_hdl_t { - int fd; - bool status; - volatile char buf[128]; - /*volatile char buf[OPAL_PATH_MAX];*/ -}; -typedef struct mca_basesmuma_ctrl_4_hdl_t mca_basesmuma_ctrl_4_hdl_t; - -/* control segment for shared memory */ -struct mca_bcol_basesmuma_ctl_struct_t { - /* collective identifier */ - volatile int64_t sequence_number; - volatile int64_t flag; - volatile int64_t index; - volatile int64_t offset; - volatile int64_t offset_zip; - - - /* used for non-blocking algorithms */ - int status; - int active_requests; - int iteration; - - int *src_ptr; - - int start; - - /* process private data */ - int starting_flag_value; - - /* experiment for large data colls */ - int n_sends; - int length; - - /* hdl framework control structure*/ - /* no need to pad at this level anymore */ - volatile int64_t data_hdl; - volatile mca_basesmuma_ctrl_4_hdl_t hdl_ctrl; - -#ifdef __PORTALS_AVAIL__ - struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr; -#endif - /* padding */ - /*char padding[BASESMUMA_CACHE_LINE_SIZE-4*sizeof(uint64_t)-3*sizeof(int)];*/ - char padding[BASESMUMA_CACHE_LINE_SIZE-6*sizeof(int64_t)-5*sizeof(int)]; -}; -typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t; - - -#define SM_BCOLS_MAX 2 - -/* enum for signaling flag bank, when - * adding to this list, please keep - * it alphabetical - */ -enum { - ALLGATHER_FLAG, - ALLREDUCE_FLAG, - BARRIER_FANIN_FLAG, - BARRIER_FANOUT_FLAG, - BARRIER_RKING_FLAG, - BCAST_FLAG, - GATHER_FLAG, - REDUCE_FLAG, - NUM_SIGNAL_FLAGS -}; - - -/* control region for colls with user data - shared memory */ -struct mca_bcol_basesmuma_header_t { - /* collective identifier */ - volatile int64_t sequence_number; - volatile int8_t flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX]; - volatile int32_t src; /* src of bcast data for unknown root, - bcol id for known root - */ - /* starting flag - hierarchies */ - int8_t starting_flag_value[SM_BCOLS_MAX]; - int8_t ready_flag; - - /* Manju: Cached array of receive buffer offsets - * - * This array stores the receive buffer offsets (rbuf_offsets) of data buffer. - * In general, we assume that sbuf_offset and rbuf_offset of - * processes invoking the collective primitive is same. 
This is - * true when the order in which processes invoke their hierarchies are - * same. - * - * For some algorithms (like broadcast, reduce) we split the ML buffer - * and use first half as - * source and second half as receive buffer. We swap these buffers for - * each change when we change levels i.e., if first half is source for - * level 1, in the level 2 of hierarchy it becomes the receive buffer. - * For reduce algorithm, each process can invoke hierarchies - * (primitives) in different order. For example, process 1 might have level 1 as SM - * and level 2 as p2p, and process 2 might have different order where its - * level 1 is p2p and level 2 SM. In this case, if in basesmuma reduce - * algorithm, if parent assumes its rbuf_offset as child's rbuf_offset - * it is wrong. So we cache rbuf_offset of each process so - * it could be accessed by processes to obtain the data. - */ - - volatile int32_t roffsets[SM_BCOLS_MAX]; - - /* Manju Start: Experimental ctl fields and should be removed later; - * This is used for lmsg reduce for testing - * during transition to HDL primitives - */ -#if 0 - int lmsg_reduce_snd_completes; - /* There can be atmost 20 ranks in the subgroup. Since this - * only for testing this should be good enough */ - int lmsg_reduce_peers[20]; - int lmsg_reduce_send_offsets[20]; - /* End: Experimental ctl fields */ - - - /* no need to pad at this level anymore */ - volatile int64_t data_hdl; -#endif -}; -typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t; - -/* data needed for large messages */ -struct mca_bcol_basesmuma_large_msg_t { - /* scatter allgather data */ - uint64_t offset; - uint64_t n_sends; - uint64_t length; - - /* portals data */ - -}; -typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t; - -/* payload struct */ -struct mca_bcol_basesmuma_payload_t { - - /* base pointer to shared memory control structure */ - mca_bcol_basesmuma_header_t *ctl_struct; - void *payload; - -}; - -typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t; - - - - -/* memory bank memory management structure */ -struct mem_bank_management_t { - - /* generation counter */ - uint64_t bank_gen_counter; - - /* descriptor for the non-blocking barrier. This is - * used to manage this bank of memory. - */ - sm_nbbar_desc_t nb_barrier_desc; - - /* the number of buffers that are not in use, and are - * available. The assumption is that the buffers are - * recycled all at once, so are available for re-use - * until all buffers have been made available for re-use. 
- */ - volatile int available_buffers; - - /* - * number of buffers freed */ - volatile int n_buffs_freed; - - /* mutex to ensure atomic recycling of resrouces */ - opal_mutex_t mutex; - - /* number of buffers being managed */ - int number_of_buffers; - - /* shared memory control structures */ - int index_shared_mem_ctl_structs; - - -}; -typedef struct mem_bank_management_t mem_bank_management_t; - -/* data structure for shared buffers */ -struct sm_buffer_mgmt { - /* number of buffers per process */ - int number_of_buffs; - - /* size of group */ - int size_of_group; - - /* number of memory banks */ - int num_mem_banks; - - /* number of buffers per memory bank */ - int num_buffs_per_mem_bank; - - /* log base 2 of num_buffs_per_mem_bank */ - int log2_num_buffs_per_mem_bank; - - /* log base 2 total number of buffers */ - int log2_number_of_buffs; - - /* mask - masks off the bits corresponding to buffer index */ - int mask; - - /* control buffers - these point to regions in shared memory */ - /* leading dimension is the group size - all pointers for a given - * set of buffers appear consecutively in this array - */ - volatile void **ctl_buffs; - - /* management data for the control structures - - * one per bank of control structures - Will be used for - * the payload buffers as well. - */ - mem_bank_management_t *ctl_buffs_mgmt; - - /* data buffers - these point to regions in shared memory */ - /* leading dimension is the group size - all pointers for a given - * set of buffers appear consecutively in this array - */ - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - - - -}; -typedef struct sm_buffer_mgmt sm_buffer_mgmt; - - -struct mca_bcol_basesmuma_nb_coll_buff_desc_t { - void *data_addr; - uint64_t bank_index; - uint64_t buffer_index; - int active_requests; - ompi_request_t **requests; - int data_src; - int radix_mask; - int radix_mask_pow; - int iteration; - int status; - /* this is for testing */ - int tag; - - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; -}; - -typedef struct mca_bcol_basesmuma_nb_coll_buff_desc_t mca_bcol_basesmuma_nb_coll_buff_desc_t; - -struct mca_bcol_basesmuma_local_mlmem_desc_t { - - uint32_t bank_index_for_release; - struct mca_bcol_base_memory_block_desc_t *ml_mem_desc; - uint32_t num_banks; - uint32_t num_buffers_per_bank; - uint32_t size_buffer; - uint32_t *bank_release_counter; - - /* - * Number of descriptors allocated is equivalent to number of ml buffers - * (number of banks * number of buffers per bank) - */ - mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc; -}; - -typedef struct mca_bcol_basesmuma_local_mlmem_desc_t mca_bcol_basesmuma_local_mlmem_desc_t; - -#ifdef __PORTALS_AVAIL__ -#define MAX_SM_GROUP_SIZE 32 - - -struct portals_scatter_allgather_nb_bcast_state_t -{ - /* local variables */ - uint64_t length; - int my_rank, src, matched; - int src_list[MAX_SM_GROUP_SIZE]; - int group_size; - int64_t ready_flag; - int pow_2, pow_2_levels; - int src_list_index; - uint64_t fragment_size; /* user buffer size */ - - /* Input argument variables */ - void *my_userbuf; - int64_t sequence_number; - - /* Extra source variables */ - bool secondary_root; - int partner , extra_partner; - - /* Scatter Allgather offsets */ - uint64_t local_sg_offset , global_sg_offset , partner_offset ; - - /* Portals messaging relevant variables */ - /* - * 
ptl_handle_eq_t allgather_eq_h; - */ - ptl_handle_eq_t read_eq; - ptl_event_t allgather_event; - bool msg_posted; - - /* OMPI module and component variables */ - mca_bcol_basesmuma_component_t *cs; - struct mca_bcol_basesmuma_module_t *bcol_module; - - /* Control structure and payload variables */ - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */ - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */ - - int phase; -}; - - -typedef struct portals_scatter_allgather_nb_bcast_state_t sg_state_t; -#endif - -#define SM_ARRAY_INDEX(LEAD_DIM,BUF_INDEX,PROC_INDEX) \ - ((LEAD_DIM)*(BUF_INDEX)+(PROC_INDEX)) -/* debug */ -#define BARRIER_BANK_LIST_SIZE 32 -/* end debug */ - -struct mca_bcol_basesmuma_module_t { - /* base structure */ - mca_bcol_base_module_t super; - - /* free list item with the control structures used for - * the no user data collective operations - */ - list_data_t *no_userdata_ctl; - - /* free list item with the control structures used for - * the with user data collective operations - */ - list_data_t *userdata_ctl; - - /* - * information on sm control backing files for the subgroup - * associated with this module. - */ - bcol_basesmuma_smcm_proc_item_t **ctl_backing_files_info; - - /* - * information on sm payload backing files for the subgroup - * associated with this module. - */ - bcol_basesmuma_smcm_proc_item_t **payload_backing_files_info; - - /* - * buffers for the collective that do not involve user data - - * barrier, fanin, fanout. - */ - sm_buffer_mgmt colls_no_user_data; - - /* - * buffers for the collective with user data. - */ - sm_buffer_mgmt colls_with_user_data; - - /* recursive-doubling tree node */ - netpatterns_pair_exchange_node_t recursive_doubling_tree; - - /* k-nomial gather/allgather tree */ - netpatterns_k_exchange_node_t knomial_allgather_tree; - - /* fanin tree node - root is rank 0 */ - netpatterns_tree_node_t fanin_node; - - /* fanout tree node - root is rank 0 */ - netpatterns_tree_node_t fanout_node; - - /* index of blocking barrier memory region to use */ - int index_blocking_barrier_memory_bank; - - /* comm to shared memory map */ - int *comm_to_sm_map; - - /* reduction fanout tree */ - netpatterns_tree_node_t* reduction_tree; - - /* broadcast fanout tree */ - netpatterns_tree_node_t* fanout_read_tree; - - /* scatter - k-ary tree */ - int scatter_kary_radix; - netpatterns_tree_node_t *scatter_kary_tree; - - /* Knomial exchange tree */ - /* Currently used for only large message reduce */ - netpatterns_k_exchange_node_t knomial_exchange_tree; - - /* sequence number offset - want to make sure that we start - * id'ing collectives with id 0, so we can have simple - * resource management. 
- */ - int64_t squence_number_offset; - - /* basesmuma specific header size into ml buffer - * was calculated at ml level - it is the sum of - * all headers from all bcols and then aligned to - * whatever alignment was requested - */ - uint32_t total_header_size; - - /* list of possible sources */ - int *src_list; - - /* Number of possible sources */ - int src_size; - - /* smallest power of k that is smaller - * than or equal in size to the uma group - */ - int pow_k_levels; - - /* size of power-of-k group */ - int pow_k; - - /* smallest power of 2 that is smaller - * than or equal to the smuma group size - */ - int pow_2_levels; - - /* size of power-of-2 group */ - int pow_2; - - /* pointer to the shared memory scratch array of each - * process in the group. - */ - void **shared_memory_scratch_space; - - /* - * Caching information for re-entrant collectives - */ - mca_bcol_basesmuma_local_mlmem_desc_t ml_mem; - - /* - * Cached offsets for lmsg reduce - */ - int **reduce_offsets; - - /*XXX: - * Starting to explore the beauty of zero-copy for large message - */ - struct mca_hdl_base_module_t **hdl_module; - -#ifdef __PORTALS_AVAIL__ - /* - * Store state for NB blocking functions - */ - sg_state_t sg_state; - -#endif -}; - -typedef struct mca_bcol_basesmuma_module_t mca_bcol_basesmuma_module_t; -OBJ_CLASS_DECLARATION(mca_bcol_basesmuma_module_t); - -/* shared memory specific arguments for the bcol registration function */ -typedef struct bcol_basesmuma_registration_data_t { - char *file_name; /* filename for payload */ - void *base_addr; /* base address to be mapped */ - size_t size; /* size of memory block to be "registered" */ - size_t size_ctl_structure; - size_t data_seg_alignment; - bcol_basesmuma_smcm_mmap_t *sm_mmap; /* shared memory map struct */ - mca_bcol_base_release_buff_fn_t buff_release_cb; /* buffer release - call back */ -} bcol_basesmuma_registration_data_t; - - -enum { - BUFFER_AVAILABLE, - STARTED, - FANIN, - FANOUT -}; - -/* enum used for non-blocking large - * message bcast - */ - -enum { - INIT, - START, - NOT_STARTED, - SCATTER, - ALLGATHER, - EXTRA_RANK, - PROBE, - SCATTER_ROOT_WAIT, - SCATTER_EXTRA_ROOT_WAIT, - SCATTER_PARENT_WAIT, - FINISHED -}; - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC extern mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component; - - -/* - * coll module functions - */ - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_bcol_basesmuma_init_query(bool enable_progress_threads, - bool enable_mpi_threads); - -/* query to see if the module is available for use on the given - * communicator, and if so, what it's priority is. 
- */ -mca_bcol_base_module_t ** -mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules); - - - -/* shared memory specific memory registration function - this will be passed into the mpool */ -int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size, - void **reg); - -/* shared memory specific memory deregistration function - also needed by the mpool */ -int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg); - -/* setup the new k_nomial tree for collectives */ -int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super); - -/* allocate the memory pool for the shared memory control structures */ -int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t - *component); - -/* initialize the internal scratch buffers and control structs that will be - used by the module */ -int base_bcol_basesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_module, - mca_bcol_basesmuma_component_t *cs); - - -/* shared memory recursive doubling initialization */ -int bcol_basesmuma_rd_barrier_init(mca_bcol_base_module_t *module); - -/* shared memory recusive double barrier */ -int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); -/* shared memory fanin */ -int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super); - -/* shared memory fanout */ -int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super); - -/* shared memory recursive k-ing non-blocking barrier */ -int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super); - -/* Shared memory broadcast */ -int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super); - -int bcol_basesmuma_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* Shared memory non-blocking broadcast */ -int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* Shared memory non-blocking broadcast - Large message anyroot */ -int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -#if 0 -/*FIXME: having fun here*/ -int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); -#endif - -int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* - * shared memory scatter - */ -int bcol_basesmuma_scatter_init(mca_bcol_base_module_t *super); - -/* shared memory nonblocking scatter - known root */ -int bcol_basesmuma_nb_scatter_k_array_knownroot( - bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* shared memory non-blocking k-nomial barrier init */ -int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -/* shared memory non-blocking k-nomial barrier progress */ -int 
bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -/*shared memory non-blocking k-nomial allgather init */ -int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -/* shared memory non-blocking k-nomial allgather progress */ -int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -/* shared memory allgather -- selection logic api */ -int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super); - -/* shared memory blocking k-nomial gather */ -int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* shared memory non blocking k-nomial gather */ -int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* shared memory non blocking k-nomial gather progress*/ -int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* shared memory init */ -int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super); - -/* allocate shared memory control memory */ -int mca_bcol_basesmuma_allocate_sm_ctl_memory( - mca_bcol_basesmuma_component_t *cs); - -/* Shared memory basesmuma reduce */ -int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super); -int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); -int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf, - int count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* Shared memory basesmuma allreduce */ -int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super); - -int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -/* initialize non-blocking barrier for recycling the memory buffers. - * This is not a general purpose nb_barrier, and relies on the - * fact that we will have only one outstanding nb-barrier per bank - * at a time. - */ -int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc); - -/* admin nonblocking barrier - progress function */ -int bcol_basesmuma_rd_nb_barrier_progress_admin(sm_nbbar_desc_t *sm_desc); - -/* Memory syncronization registration function */ -int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super); - -/* smcm allgather function used to exchange file offsets. 
*/ -int bcol_basesmuma_smcm_allgather_connection( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_sbgp_base_module_t *module, - opal_list_t *peer_list, - bcol_basesmuma_smcm_proc_item_t ***backing_files, - ompi_communicator_t *comm, - bcol_basesmuma_smcm_file_t input, char *base_fname, - bool map_all); - -/* clean up the backing files associated with a basesmuma bcol module */ -int bcol_basesmuma_smcm_release_connections (mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_sbgp_base_module_t *sbgp_module, opal_list_t *peer_list, - bcol_basesmuma_smcm_proc_item_t ***back_files); - -/* - * this function initializes the internal scratch buffers and control - * structures that will be used by the module - */ -int base_bcol_masesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *sm_bcol_component); - -/* get the index of the shared memory buffer to be used */ -int bcol_basesmuma_get_buff_index( sm_buffer_mgmt * buff_block, - uint64_t buff_id ); - -int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block, - uint64_t buff_id ); - -/* bank init which is used for shared memory optimization, fall back to - * the bank init above if this causes problems - */ -int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block, - uint32_t data_offset, - mca_bcol_base_module_t *bcol_module, - void *reg_data); - -/* cleanup nb_coll_buff_desc */ -void cleanup_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, - uint32_t num_banks, - uint32_t num_buffers_per_bank); - - -/* used for shared memory offset exchange */ -int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, - int leading_dim); - - -/* the progress function to be called from the opal progress function - */ -int bcol_basesmuma_progress(void); - -/* Macro for initializing my shared memory control structure */ -#define BASESMUMA_HEADER_INIT(my_ctl_pointer,ready_flag, seqn, bcol_id) \ - do{ \ - int i,j; \ - int8_t flag_offset = 0; \ - /* setup resource recycling */ \ - if( (my_ctl_pointer)->sequence_number < (seqn) ) { \ - /* Signal arrival */ \ - for( j = 0; j < SM_BCOLS_MAX; j++){ \ - (my_ctl_pointer)->starting_flag_value[j]=0; \ - for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ \ - (my_ctl_pointer)->flags[i][j] = -1; \ - } \ - } \ - } \ - /* increment the starting flag by one and return */ \ - flag_offset = (my_ctl_pointer)->starting_flag_value[(bcol_id)]; \ - (ready_flag) = flag_offset + 1; \ - opal_atomic_wmb(); \ - (my_ctl_pointer)->sequence_number = (seqn); \ - }while(0) - -/* these are all the same, am using a single macro for all collectives */ - -#define IS_PEER_READY(peer, my_flag, my_sequence_number,flag_index, bcol_id) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[flag_index][bcol_id] >= (my_flag))? true : false ) - -#if 0 -#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \ - )? true : false ) - -#define IS_GDATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \ - )? 
true : false ) - -#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number) \ - ((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \ - (((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ - ((peer)->flags[flag_index][bcol_id] == (my_flag))) \ - )? true : false ) - -#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number) \ - ((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ - (((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \ - )? true : false ) -#endif - -#define IS_LAST_BCOL_FUNC(ml_args) \ - ((((ml_args)->n_of_this_type_in_collective == \ - (ml_args)->index_of_this_type_in_collective + 1 ) )? true : false) - -static inline __opal_attribute_always_inline__ -size_t bcol_basesmuma_data_offset_calc( - mca_bcol_basesmuma_module_t *basesmuma_module) -{ - uint32_t offset = basesmuma_module->super.header_size; - offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN; - - return (size_t) offset; -} - - -END_C_DECLS - -#endif /* MCA_BCOL_basesmuma_EXPORT_H */ diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c deleted file mode 100644 index 97a857ef0c..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c +++ /dev/null @@ -1,352 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" -/* - #define IS_AGDATA_READY(peer, my_flag, my_sequence_number)\ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[ALLGATHER_FLAG][bcol_id] >= (my_flag) \ - )? 
true : false ) -*/ - -#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \ - do{ \ - for( j = 0; j < (tree_order - 1); j++){ \ - if( 0 > peers[j] ) { \ - /* set the bit */ \ - *active_requests ^= (1<<j); \ - } \ - } \ - }while(0) - -/* allgather init function */ -int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variables */ - mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module; - netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; - int bcol_id = (int) bcol_module->super.bcol_id; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - int leading_dim, buff_idx, idx; - - int64_t sequence_number = input_args->sequence_num; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - volatile int8_t ready_flag; - - /* initialize the iteration counter */ - buff_idx = input_args->src_desc->buffer_index; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* initialize headers and ready flag */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - /* initialize these */ - *iteration = -1; - *active_requests = 0; - *status = ready_flag; - - if (EXTRA_NODE == exchange_node->node_type) { - /* I am ready at this level */ - opal_atomic_wmb (); - my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; - } - - return bcol_basesmuma_k_nomial_allgather_progress (input_args, const_args); -} - - -/* allgather progress function */ - -int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variables */ - int8_t flag_offset; - uint32_t buffer_index = input_args->buffer_index; - volatile int8_t ready_flag; - mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module; - netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; - int group_size = bcol_module->colls_no_user_data.size_of_group; - int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */ - int bcol_id = (int) bcol_module->super.bcol_id; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - int leading_dim, idx, buff_idx; - - int i, j, probe; - int knt; - int src; - int recv_offset, recv_len; - int max_requests = 0; /* critical to set this */ - int pow_k, tree_order; - - int64_t sequence_number=input_args->sequence_num; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - - int pack_len = input_args->count * input_args->dtype->super.size; - - void *data_addr = (void*)( - (unsigned char *) input_args->sbuf + - (size_t) input_args->sbuf_offset); - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char *peer_data_pointer; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile
-
-
-/* allgather progress function */
-
-int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
-                                               struct mca_bcol_base_function_t *const_args)
-{
-    /* local variables */
-    int8_t flag_offset;
-    uint32_t buffer_index = input_args->buffer_index;
-    volatile int8_t ready_flag;
-    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
-    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
-    int group_size = bcol_module->colls_no_user_data.size_of_group;
-    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
-    int bcol_id = (int) bcol_module->super.bcol_id;
-    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
-    int *active_requests =
-        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
-
-    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
-    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
-    int leading_dim, idx, buff_idx;
-
-    int i, j, probe;
-    int knt;
-    int src;
-    int recv_offset, recv_len;
-    int max_requests = 0; /* critical to set this */
-    int pow_k, tree_order;
-
-    int64_t sequence_number = input_args->sequence_num;
-    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
-
-    int pack_len = input_args->count * input_args->dtype->super.size;
-
-    void *data_addr = (void *)(
-        (unsigned char *) input_args->sbuf +
-        (size_t) input_args->sbuf_offset);
-    volatile mca_bcol_basesmuma_payload_t *data_buffs;
-    volatile char *peer_data_pointer;
-
-    /* control structures */
-    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
-    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
-
-#if 0
-    fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n", my_rank,
-            *active_requests, *iteration, *status);
-#endif
-
-    buff_idx = input_args->src_desc->buffer_index;
-    leading_dim = bcol_module->colls_no_user_data.size_of_group;
-    idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
-    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
-        bcol_module->colls_with_user_data.data_buffs + idx;
-
-    /* Set pointer to current proc ctrl region */
-    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
-
-    /* increment the starting flag by one and return */
-    /* flag offset seems unnecessary here */
-    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
-    ready_flag = *status;
-    my_ctl_pointer->sequence_number = sequence_number;
-    /* k-nomial parameters */
-    tree_order = exchange_node->tree_order;
-    pow_k = exchange_node->log_tree_order;
-
-    /* calculate the maximum number of requests:
-     * at each level each rank communicates with at most (k - 1) peers,
-     * so if we set (k - 1) bit fields in "max_requests" we get
-     * max_requests == 2^(k - 1) - 1
-     */
-    for (i = 0; i < (tree_order - 1); i++) {
-        max_requests ^= (1<<i);
-    }
-
-    /* first pass through for this buffer: handle the extra ranks and
-     * their proxies before the exchange phase
-     */
-    if (-1 == *iteration) {
-
-        if (EXTRA_NODE == exchange_node->node_type) {
-            /* If I'm in here, then I must be looking for data */
-            ready_flag = flag_offset + 1 + pow_k + 2;
-
-            src = exchange_node->rank_extra_sources_array[0];
-            peer_data_pointer = data_buffs[src].payload;
-            peer_ctl_pointer = data_buffs[src].ctl_struct;
-
-            /* calculate the count */
-            for (i = 0, knt = 0 ; i < group_size ; ++i) {
-                knt += list_connected[i];
-            }
-
-            for (i = 0 ; i < cm->num_to_probe ; ++i) {
-                if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
-                    /* we receive the entire message */
-                    opal_atomic_mb ();
-                    memcpy (data_addr, (void *) peer_data_pointer, knt * pack_len);
-
-                    goto FINISHED;
-                }
-            }
-
-            /* haven't found it, state is saved, bail out */
-            return BCOL_FN_STARTED;
-        } else if (0 < exchange_node->n_extra_sources) {
-            /* I am a proxy for someone */
-            src = exchange_node->rank_extra_sources_array[0];
-            peer_data_pointer = data_buffs[src].payload;
-            peer_ctl_pointer = data_buffs[src].ctl_struct;
-
-            /* calculate the offset */
-            for (i = 0, knt = 0 ; i < src ; ++i) {
-                knt += list_connected[i];
-            }
-
-            /* probe for extra rank's arrival */
-            for (i = 0 ; i < cm->num_to_probe ; ++i) {
-                if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
-                    opal_atomic_mb ();
-                    /* copy it in */
-                    memcpy ((void *) ((uintptr_t) data_addr + knt * pack_len),
-                            (void *) ((uintptr_t) peer_data_pointer + knt * pack_len),
-                            pack_len * list_connected[src]);
-                    break;
-                }
-            }
-
-            if (i == cm->num_to_probe) {
-                return BCOL_FN_STARTED;
-            }
-        }
-
-        /* bump the ready flag to indicate extra node exchange complete */
-        ++ready_flag;
-        *iteration = 0;
-    }
-
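The ready_flag values used above and in the exchange loop below form a fixed ladder. Assuming BASESMUMA_HEADER_INIT leaves the flag at flag_offset + 1, a proxy rank bumps it once after absorbing the extra rank's data, once per completed k-nomial level, and once more to release the extra rank, landing on flag_offset + 1 + pow_k + 2, which is exactly the value an EXTRA_NODE polls for. A self-contained check of that arithmetic (values illustrative):

    #include <assert.h>

    int main(void)
    {
        int flag_offset = 0;
        int pow_k = 2;                  /* e.g. tree order 3, 9 ranks */
        int flag = flag_offset + 1;     /* value after header init */

        flag += 1;                      /* extra-rank pre-exchange done */
        flag += pow_k;                  /* one bump per completed level */
        flag += 1;                      /* final bump releases the extra rank */

        assert(flag == flag_offset + 1 + pow_k + 2);
        return 0;
    }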
-    /* start the recursive k-ing phase */
-    for (i = *iteration ; i < pow_k ; ++i) {
-        /* I am ready at this level */
-        opal_atomic_wmb ();
-        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
-
-        if (0 == *active_requests) {
-            /* flip some bits, if we don't have active requests from a previous visit */
-            CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[i],tree_order);
-        }
-
-        for (j = 0; j < (tree_order - 1); ++j) {
-            /* recv phase */
-            src = exchange_node->rank_exchanges[i][j];
-
-            if (src < 0) {
-                /* then not a valid rank, continue */
-                continue;
-            }
-
-            if (!(*active_requests&(1<<j))) {
-                /* this peer's data has not been copied in yet */
-                peer_data_pointer = data_buffs[src].payload;
-                peer_ctl_pointer = data_buffs[src].ctl_struct;
-
-                recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
-                recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
-
-                /* I am putting the probe loop as the innermost loop to achieve
-                 * better temporal locality
-                 */
-                for (probe = 0 ; probe < cm->num_to_probe ; ++probe) {
-                    if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
-                        /* flip the request's bit */
-                        *active_requests ^= (1<<j);
-                        /* copy in the peer's data */
-                        opal_atomic_mb ();
-                        memcpy ((void *) ((uintptr_t) data_addr + recv_offset),
-                                (void *) ((uintptr_t) peer_data_pointer + recv_offset),
-                                recv_len);
-                        break;
-                    }
-                }
-            }
-        }
-
-        /* check whether this level is complete */
-        if (max_requests == *active_requests) {
-            /* bump the ready flag and move on to the next level */
-            ++ready_flag;
-            *active_requests = 0;
-        } else {
-            /* save state and hop out */
-            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
-            *iteration = i;
-            return BCOL_FN_STARTED;
-        }
-    }
-
-    /* bump the flag one more time for the extra rank */
-    ready_flag = flag_offset + 1 + pow_k + 2;
-
-    /* finish off the last piece, send the data back to the extra */
-    if( 0 < exchange_node->n_extra_sources ) {
-        /* simply announce my arrival */
-        opal_atomic_wmb ();
-        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
-    }
-
-FINISHED:
-    /* bump this up for others to see */
-    my_ctl_pointer->starting_flag_value[bcol_id]++;
-    return BCOL_FN_COMPLETE;
-}
-
-/* Register allgather functions to the BCOL function table,
- * so they can be selected
- */
-int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super)
-{
-    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
-    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
-
-    comm_attribs.bcoll_type = BCOL_ALLGATHER;
-    comm_attribs.comm_size_min = 0;
-    comm_attribs.comm_size_max = 1024 * 1024;
-    comm_attribs.waiting_semantics = NON_BLOCKING;
-
-    inv_attribs.bcol_msg_min = 0;
-    inv_attribs.bcol_msg_max = 20000; /* range 1 */
-
-    inv_attribs.datatype_bitmap = 0xffffffff;
-    inv_attribs.op_types_bitmap = 0xffffffff;
-
-    comm_attribs.data_src = DATA_SRC_KNOWN;
-
-    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
-                                 bcol_basesmuma_k_nomial_allgather_init,
-                                 bcol_basesmuma_k_nomial_allgather_progress);
-
-    return OMPI_SUCCESS;
-}
diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c
deleted file mode 100644
index 0058ec770f..0000000000
--- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
-/*
- * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
- *                         reserved.
- * Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "ompi/op/op.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "opal/include/opal_stdint.h" - -#include "ompi/mca/bcol/base/base.h" -#include "bcol_basesmuma.h" - -static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_ALLREDUCE; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1048576; - comm_attribs.data_src = DATA_SRC_KNOWN; - - /* selection logic at the ml level specifies a - * request for a non-blocking algorithm - * however, these algorithms are blocking - * following what was done at the p2p level - * we will specify non-blocking, but beware, - * these algorithms are blocking and will not make use - * of the progress engine - */ - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - /* Set attributes for fanin fanout algorithm */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_allreduce_intra_fanin_fanout, - bcol_basesmuma_allreduce_intra_fanin_fanout_progress); - - inv_attribs.bcol_msg_min = 20000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_allreduce_intra_fanin_fanout, - bcol_basesmuma_allreduce_intra_fanin_fanout_progress); - - /* Differs only in comm size */ - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - comm_attribs.waiting_semantics = BLOCKING; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 8; - - /* Set attributes for recursive doubling algorithm */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_allreduce_intra_recursive_doubling, - NULL); - - - return OMPI_SUCCESS; -} - -/* - * Small data fanin reduce - * ML buffers are used for both payload and control structures - * This functions works with hierarchical allreduce and - * progress engine - */ -static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node, - int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype, - volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) -{ - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - int bcol_id = (int) bcol_module->super.bcol_id; - int64_t sequence_number = my_ctl_pointer->sequence_number; - int8_t ready_flag = my_ctl_pointer->ready_flag; - int group_size = bcol_module->colls_no_user_data.size_of_group; - - if (LEAF_NODE != my_reduction_node->my_node_type) { - volatile char *child_data_pointer; - volatile void *child_rbuf; - - /* for each child */ - /* my_result_data = child_result_data (op) my_source_data */ - - for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) { - int child_rank = my_reduction_node->children_ranks[child] + process_shift; - - if (group_size <= child_rank){ - child_rank -= group_size; - } - - child_ctl_pointer = 
data_buffs[child_rank].ctl_struct; - - if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) { - *iteration = child; - return BCOL_FN_STARTED; - } - - child_data_pointer = data_buffs[child_rank].payload; - child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id]; - - ompi_op_reduce(op, (void *)child_rbuf, (void *)rbuf, count, dtype); - } /* end child loop */ - } - - if (ROOT_NODE != my_reduction_node->my_node_type) { - opal_atomic_wmb (); - my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag; - } - - /* done with this step. move on to fan out */ - *iteration = -1; - - return BCOL_FN_COMPLETE; -} - -static int allreduce_fanout (mca_bcol_basesmuma_module_t *bcol_module, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, - volatile void *my_data_pointer, int process_shift, volatile mca_bcol_basesmuma_payload_t *data_buffs, - int sequence_number, int group_size, int rbuf_offset, size_t pack_len) -{ - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - int bcol_id = (int) bcol_module->super.bcol_id; - int8_t ready_flag = my_ctl_pointer->ready_flag + 1; - netpatterns_tree_node_t *my_fanout_read_tree; - volatile void *parent_data_pointer; - int my_fanout_parent, my_rank; - void *parent_rbuf, *rbuf; - - my_rank = bcol_module->super.sbgp_partner_module->my_index; - my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_rank]); - - if (ROOT_NODE != my_fanout_read_tree->my_node_type) { - my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; - if (group_size <= my_fanout_parent) { - my_fanout_parent -= group_size; - } - - rbuf = (void *)((char *) my_data_pointer + rbuf_offset); - - /* - * Get parent payload data and control data. - * Get the pointer to the base address of the parent's payload buffer. - * Get the parent's control buffer. - */ - parent_data_pointer = data_buffs[my_fanout_parent].payload; - parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; - - parent_rbuf = (void *) ((char *) parent_data_pointer + rbuf_offset); - - /* Wait until parent signals that data is ready */ - /* The order of conditions checked in this loop is important, as it can - * result in a race condition. 
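- * (the peer's sequence number is required to match the current collective
- * before its flag value is trusted)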
- */ - if (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) { - return BCOL_FN_STARTED; - } - - assert (parent_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] == ready_flag); - - /* Copy the rank to a shared buffer writable by the current rank */ - memcpy ((void *) rbuf, (const void*) parent_rbuf, pack_len); - } - - if (LEAF_NODE != my_fanout_read_tree->my_node_type) { - opal_atomic_wmb (); - - /* Signal to children that they may read the data from my shared buffer (bump the ready flag) */ - my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag; - } - - my_ctl_pointer->starting_flag_value[bcol_id] += 1; - - return BCOL_FN_COMPLETE; - -} - -static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args) -{ - mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - int buff_idx = input_args->src_desc->buffer_index; - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; - void *data_addr = (void *) input_args->src_desc->data_addr; - int my_node_index, my_rank, group_size, leading_dim, idx; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - int64_t sequence_number = input_args->sequence_num; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - struct ompi_datatype_t *dtype = input_args->dtype; - netpatterns_tree_node_t *my_reduction_node; - struct ompi_op_t *op = input_args->op; - volatile void *my_data_pointer; - int count = input_args->count; - int rc, process_shift; - ptrdiff_t lb, extent; - volatile void *rbuf; - - /* get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /* Align node index to around sbgp root */ - process_shift = input_args->root; - my_node_index = my_rank - input_args->root; - if (0 > my_node_index ) { - my_node_index += group_size; - } - - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx; - /* Get control structure and payload buffer */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_data_pointer = (volatile char *) data_addr; - - my_data_pointer = (volatile char *) data_addr; - rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset); - - /*************************** - * Fan into root phase - ***************************/ - - my_reduction_node = &(bcol_module->reduction_tree[my_node_index]); - if (-1 != *iteration) { - rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, - dtype, data_buffs, count, op, process_shift); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - - /* there might be non-contig dtype - so compute the length with get_extent */ - ompi_datatype_get_extent(dtype, &lb, &extent); - - /*************************** - * Fan out from root - ***************************/ - - /* all nodes will have the result after fanout */ - input_args->result_in_rbuf = true; - - /* Signal that you are ready for fanout phase */ - return allreduce_fanout (bcol_module, my_ctl_pointer, my_data_pointer, process_shift, data_buffs, - sequence_number, group_size, input_args->rbuf_offset, count * (size_t) extent); -} - -/** - * Shared memory blocking allreduce. 
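- * Data is reduced up the reduction tree to the root (fan-in), then the
- * result is read back down the fan-out read tree by every rank.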
- */ -int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - int buff_idx = input_args->src_desc->buffer_index; - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; - void *data_addr = (void *) input_args->src_desc->data_addr; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - struct ompi_datatype_t *dtype = input_args->dtype; - int bcol_id = (int) bcol_module->super.bcol_id; - int rc, my_rank, leading_dim, idx; - volatile void *my_data_pointer; - volatile void *sbuf, *rbuf; - int8_t ready_flag; - - /* get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0); - - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx; - /* Get control structure */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - my_data_pointer = (volatile char *) data_addr; - rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset); - sbuf = (volatile void *)((char *) my_data_pointer + input_args->sbuf_offset); - - /* Setup resource recycling */ - /* Set for multiple instances of bcols */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, input_args->sequence_num, bcol_id); - - if (sbuf != rbuf) { - rc = ompi_datatype_copy_content_same_ddt (dtype, input_args->count, (char *)rbuf, - (char *)sbuf); - if( 0 != rc ) { - return OMPI_ERROR; - } - } - - *iteration = 0; - my_ctl_pointer->ready_flag = ready_flag; - - return bcol_basesmuma_allreduce_intra_fanin_fanout_progress (input_args, c_input_args); -} - - - -/* this thing uses the old bcol private control structures */ -int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - int my_rank,group_size,my_node_index; - int pair_rank, exchange, extra_rank, payload_len; - size_t dt_size; - int read_offset, write_offset; - volatile void *my_data_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL, - *partner_ctl_pointer = NULL, - *extra_ctl_pointer = NULL; - volatile void *my_read_pointer, *my_write_pointer, *partner_read_pointer, - *extra_rank_readwrite_data_pointer,*extra_rank_read_data_pointer; - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - - int8_t ready_flag; - int sbuf_offset,rbuf_offset,flag_offset; - int root,count; - struct ompi_op_t *op; - int64_t sequence_number=input_args->sequence_num; - struct ompi_datatype_t *dtype; - int first_instance = 0; - int leading_dim,idx; - int buff_idx; - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - /*volatile void **data_buffs;*/ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - netpatterns_pair_exchange_node_t *my_exchange_node; - - - /* - * Get addressing information - */ - buff_idx = input_args->src_desc->buffer_index; - - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /* - * Get SM control structures and payload buffers - */ - ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) - 
bcol_module->colls_with_user_data.ctl_buffs+idx; - /*data_buffs = (volatile void **) - bcol_module->colls_with_user_data.data_buffs+idx;*/ - - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - - - /* - * Get control structure and payload buffer - */ - my_ctl_pointer = ctl_structs[my_rank]; - if (my_ctl_pointer->sequence_number < sequence_number) { - first_instance=1; - } - my_data_pointer = data_buffs[my_rank].payload; - - /* - * Align node index to around sbgp root - */ - root = input_args->root; - my_node_index = my_rank - root; - if (0 > my_node_index) { - my_node_index += group_size; - } - - /* - * Get data from arguments - */ - sbuf_offset = input_args->sbuf_offset; - rbuf_offset = input_args->rbuf_offset; - op = input_args->op; - count = input_args->count; - dtype = input_args->dtype; - - /* - * Get my node for the reduction tree - */ - my_exchange_node = &(bcol_module->recursive_doubling_tree); - - - if (first_instance) { - my_ctl_pointer->index = 1; - my_ctl_pointer->starting_flag_value = 0; - flag_offset = 0; - my_ctl_pointer->flag = -1; - /* - for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ - my_ctl_pointer->flags[ALLREDUCE_FLAG] = -1; - } - */ - } else { - my_ctl_pointer->index++; - flag_offset = my_ctl_pointer->starting_flag_value; - } - - /* signal that I have arrived */ - /* opal_atomic_wmb (); */ - my_ctl_pointer->sequence_number = sequence_number; - - /* If we use this buffer more than once by an sm module in - * a given collective, will need to distinguish between instances, so - * we pick up the right data. - */ - ready_flag = flag_offset + sequence_number + 1; - - /* - * Set up pointers for using during recursive doubling phase - */ - read_offset = sbuf_offset; - write_offset = rbuf_offset; - fprintf(stderr,"read offset %d write offset %d\n",read_offset,write_offset); - my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset); - my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset); - - /* - * When there are non-power 2 nodes, the extra nodes' data is copied and - * reduced by partner exchange nodes. 
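- * For example, with a 6-rank group the nearest power of two is 4: ranks
- * 4 and 5 are extra nodes, and ranks 0-3 run the log2(4) = 2 exchanges.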
- * Extra nodes: Nodes with rank greater nearest power of 2 - * Exchange nodes: Nodes with rank lesser than nearest power of 2 that - * partner with extras nodes during reduction - */ - - if (0 < my_exchange_node->n_extra_sources) { - /* - * Signal extra node that data is ready - */ - opal_atomic_wmb (); - - my_ctl_pointer->flag = ready_flag; - - if (EXCHANGE_NODE == my_exchange_node->node_type) { - extra_rank = my_exchange_node->rank_extra_source; - extra_ctl_pointer = ctl_structs[extra_rank]; - extra_rank_readwrite_data_pointer = (void *) ((char *) data_buffs[extra_rank].payload + - read_offset); - - /* - * Wait for data to get ready - */ - while (!((sequence_number == extra_ctl_pointer->sequence_number) && - (extra_ctl_pointer->flag >= ready_flag))){ - } - - ompi_op_reduce(op,(void *)extra_rank_readwrite_data_pointer, - (void *)my_read_pointer, count, dtype); - } - } - - - /* --Exchange node that reduces with extra node --: Signal to extra node that data is read - * --Exchange node that doesn't reduce data with extra node --: This assignment - * is used so it can sync with other nodes during exchange phase - * --Extra node--: It can pass to next phase - */ - ready_flag++; - /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ - my_ctl_pointer->flag = ready_flag; - - - /* - * Exchange data with all the nodes that are less than max_power_2 - */ - for (exchange=0 ; exchange < my_exchange_node->n_exchanges ; exchange++) { - int tmp=0; - - /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ - my_ctl_pointer->flag = ready_flag; - pair_rank=my_exchange_node->rank_exchanges[exchange]; - partner_ctl_pointer = ctl_structs[pair_rank]; - partner_read_pointer = (volatile void *) ((char *)data_buffs[pair_rank].payload + read_offset); - - my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset); - my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset); - - /* - * Wait for partner to be ready, so we can read - */ - /* - JSL ---- FIX ME !!!!! 
MAKE ME COMPLIANT WITH NEW BUFFERS - while (!IS_ALLREDUCE_PEER_READY(partner_ctl_pointer, - ready_flag, sequence_number)) { - } - */ - - /* - * Perform reduction operation - */ - ompi_3buff_op_reduce(op,(void *)my_read_pointer, (void *)partner_read_pointer, - (void *)my_write_pointer, count, dtype); - - - /* - * Signal that I am done reading my partner's data - */ - ready_flag++; - /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ - my_ctl_pointer->flag = ready_flag; - - while (ready_flag > partner_ctl_pointer->flag){ - opal_progress(); - } - - /* - * Swap read and write offsets - */ - tmp = read_offset; - read_offset = write_offset; - write_offset = tmp; - - } - - - /* - * Copy data in from the "extra" source, if need be - */ - - if (0 < my_exchange_node->n_extra_sources) { - - if (EXTRA_NODE == my_exchange_node->node_type) { - - int extra_rank_read_offset=-1,my_write_offset=-1; - - /* Offset the ready flag to sync with - * exchange node which might going through exchange phases - * unlike the extra node - */ - ready_flag = ready_flag + my_exchange_node->log_2; - - if (my_exchange_node->log_2%2) { - extra_rank_read_offset = rbuf_offset; - my_write_offset = rbuf_offset; - - } else { - extra_rank_read_offset = sbuf_offset; - my_write_offset = sbuf_offset; - - } - - my_write_pointer = (volatile void*)((char *)my_data_pointer + my_write_offset); - extra_rank = my_exchange_node->rank_extra_source; - extra_ctl_pointer = ctl_structs[extra_rank]; - - extra_rank_read_data_pointer = (volatile void *) ((char *)data_buffs[extra_rank].payload + - extra_rank_read_offset); - - /* - * Wait for the exchange node to be ready - */ - ompi_datatype_type_size(dtype, &dt_size); - payload_len = count*dt_size; -#if 0 - fix me JSL !!!!! - while (!IS_DATA_READY(extra_ctl_pointer, ready_flag, sequence_number)){ - } -#endif - memcpy((void *)my_write_pointer,(const void *) - extra_rank_read_data_pointer, payload_len); - - ready_flag++; - /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ - my_ctl_pointer->flag = ready_flag; - - - } else { - - /* - * Signal parent that data is ready - */ - opal_atomic_wmb (); - /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ - my_ctl_pointer->flag = ready_flag; - - /* wait until child is done to move on - this buffer will - * be reused for the next stripe, so don't want to move - * on too quick. - */ - extra_rank = my_exchange_node->rank_extra_source; - extra_ctl_pointer = ctl_structs[extra_rank]; - } - } - - input_args->result_in_rbuf = my_exchange_node->log_2 & 1; - - my_ctl_pointer->starting_flag_value += 1; - - return BCOL_FN_COMPLETE; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c deleted file mode 100644 index 340c0c4c7f..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast.c +++ /dev/null @@ -1,487 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" - -#include "bcol_basesmuma.h" - -#define __TEST_BLOCKING__ 1 -#define __TEST_WAIT__ 0 -#define __TEST_TEST__ 0 - -/* debug - * #include "opal/sys/timer.h" - * - * extern uint64_t timers[7]; - * end debug */ - -/* debug */ -/* end debug */ -int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_BCAST; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1048576; - comm_attribs.data_src = DATA_SRC_KNOWN; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_bcast_k_nomial_knownroot, - bcol_basesmuma_bcast_k_nomial_knownroot); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_bcast_k_nomial_knownroot, - bcol_basesmuma_bcast_k_nomial_knownroot); - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_bcast_k_nomial_anyroot, - bcol_basesmuma_bcast_k_nomial_anyroot); - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - -#ifdef __PORTALS_AVAIL__ - - comm_attribs.waiting_semantics = BLOCKING; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_bcast); - - - comm_attribs.waiting_semantics = NON_BLOCKING; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast); - - comm_attribs.data_src = DATA_SRC_KNOWN; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast, - bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast); - -#else - /* - if (super->use_hdl) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_hdl_zerocopy_bcast, - bcol_basesmuma_hdl_zerocopy_bcast); - } else { */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL); - /* - bcol_basesmuma_binary_scatter_allgather_segment, - bcol_basesmuma_binary_scatter_allgather_segment); - */ - /* } */ -#endif - - return OMPI_SUCCESS; -} - -/* includes shared memory optimization */ - -/** - * Shared memory blocking Broadcast - fanin, for small data buffers. - * This routine assumes that buf (the input buffer) is a single writer - * multi reader (SWMR) shared memory buffer owned by the calling rank - * which is the only rank that can write to this buffers. 
- * It is also assumed that the buffers are registered and fragmented - * at the ML level and that buf is sufficiently large to hold the data. - * - * - * @param buf - SWMR shared buffer within a sbgp that the - * executing rank can write to. - * @param count - the number of elements in the shared buffer. - * @param dtype - the datatype of a shared buffer element. - * @param root - the index within the sbgp of the root. - * @param module - basesmuma module. - */ -int bcol_basesmuma_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int group_size, process_shift, my_node_index; - int my_rank; - int rc = OMPI_SUCCESS; - int my_fanout_parent; - int leading_dim, buff_idx, idx; - volatile int8_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char* parent_data_pointer; - mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - netpatterns_tree_node_t* my_fanout_read_tree; - size_t pack_len = 0, dt_size; - - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr ); - -#if 0 - fprintf(stderr,"Entering sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Align node index to around sbgp root */ - process_shift = root; - my_node_index = my_rank - root; - if(0 > my_node_index ) { - my_node_index += group_size; - } - - /* get my node for the bcast tree */ - my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); - my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; - if(group_size <= my_fanout_parent){ - my_fanout_parent -= group_size; - } - - /* Set pointer to current proc ctrl region */ - /*my_ctl_pointer = ctl_structs[my_rank]; */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* setup resource recycling */ - - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - /* - * Fan out from root - */ - if(ROOT_NODE == my_fanout_read_tree->my_node_type) { - input_args->result_in_rbuf = false; - /* Root should only signal it is ready */ - my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; - - }else if(LEAF_NODE == my_fanout_read_tree->my_node_type) { - input_args->result_in_rbuf = false; - /* - * Get parent payload data and control data. - * Get the pointer to the base address of the parent's payload buffer. - * Get the parent's control buffer. 
- */ - parent_data_pointer = data_buffs[my_fanout_parent].payload; - parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; - - /* Wait until parent signals that data is ready */ - /* The order of conditions checked in this loop is important, as it can - * result in a race condition. - */ - while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ - opal_progress(); - } - - /* Copy the rank to a shared buffer writable by the current rank */ - memcpy(data_addr, (void *)parent_data_pointer, pack_len); - - if( 0 != rc ) { - return OMPI_ERROR; - } - - }else{ - input_args->result_in_rbuf = false; - /* Interior node */ - - /* Get parent payload data and control data */ - parent_data_pointer = data_buffs[my_fanout_parent].payload; - parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; - - - /* Wait until parent signals that data is ready */ - /* The order of conditions checked in this loop is important, as it can - * result in a race condition. - */ - while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){ - opal_progress(); - } - - /* Copy the rank to a shared buffer writable by the current rank */ - memcpy(data_addr, (void *)parent_data_pointer,pack_len); - - /* Signal to children that they may read the data from my shared buffer */ - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; - } - - /* if I am the last instance of a basesmuma function in this collectie, - * release the resrouces */ - - my_ctl_pointer->starting_flag_value[bcol_id]++; - - return rc; -} - - -/*zero-copy large massage communication methods*/ -#if 0 -int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int group_size, process_shift, my_node_index; - int my_rank, first_instance=0, flag_offset; - int rc = OMPI_SUCCESS; - int my_fanout_parent; - int leading_dim, buff_idx, idx; - volatile int64_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - - netpatterns_tree_node_t* my_fanout_read_tree; - size_t pack_len = 0, dt_size; - - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); - - struct mca_hdl_base_descriptor_t *hdl_desc; - struct mca_hdl_base_segment_t *hdl_seg; - int ret, completed, ridx/*remote rank index*/; - bool status; - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer= NULL; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer= NULL; - volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer= NULL; - struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0]; - - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - pack_len = count * dt_size; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx; - my_ctl_pointer = ctl_structs[my_rank]; - - /* Align node index 
to around sbgp root */ - process_shift = root; - my_node_index = my_rank - root; - if(0 > my_node_index ) { - my_node_index += group_size; - } - - /* get my node for the bcast tree */ - my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]); - my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; - if(group_size <= my_fanout_parent){ - my_fanout_parent -= group_size; - } - - /* setup resource recycling */ - if( my_ctl_pointer->sequence_number < sequence_number ) { - first_instance = 1; - } - - if( first_instance ) { - /* Signal arrival */ - my_ctl_pointer->flag = -1; - my_ctl_pointer->index = 1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl_pointer->starting_flag_value = 0; - flag_offset = 0; - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - my_ctl_pointer->sequence_number = sequence_number; - - hdl_desc = (mca_hdl_base_descriptor_t *) - malloc (sizeof (mca_hdl_base_descriptor_t) * 1); - - /*prepare a hdl data segment*/ - hdl_seg = (mca_hdl_base_segment_t*) - malloc ( sizeof (mca_hdl_base_segment_t) * 1); - hdl_seg->seg_addr.pval = input_args->sbuf; - hdl_seg->seg_len = pack_len; - - - hdl->endpoint->ready_flag = ready_flag; - hdl->endpoint->local_ctrl = my_ctl_pointer; - hdl->endpoint->sbgp_contextid = - bcol_module->super.sbgp_partner_module->group_comm->c_contextid; - - /* - * Fan out from root - */ - if(ROOT_NODE == my_fanout_read_tree->my_node_type) { - input_args->result_in_rbuf = false; - - hdl_desc->des_src = hdl_seg; - hdl_desc->des_src_cnt = 1; - hdl_desc->isroot = true; - - /*As the general semantics, there might multiple pairs of send/recv - *on the topology tree*/ - for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) { - child_ctl_pointer = - ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; - hdl->endpoint->remote_ctrl = child_ctl_pointer; - ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc); - if (ret != OMPI_SUCCESS) { - BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank)); - goto exit_ERROR; - } - } - }else if(LEAF_NODE == my_fanout_read_tree->my_node_type) { - input_args->result_in_rbuf = false; - /* - * Get parent payload data and control data. - * Get the pointer to the base address of the parent's payload buffer. - * Get the parent's control buffer. 
- */ - parent_ctl_pointer = ctl_structs[my_fanout_parent]; - - hdl_desc->des_dst = hdl_seg; - hdl_desc->des_dst_cnt = 1; - hdl_desc->isroot = false; - hdl->endpoint->remote_ctrl = parent_ctl_pointer; - -#if __TEST_BLOCKING__ - ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc); -#else - ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc); -#endif - -#if __TEST_WAIT__ - ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc); - BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank)); -#endif - if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank)); - goto exit_ERROR; - } - - status = false; -#if __TEST_TEST__ - while (!status) { - hdl->hdl_test(&hdl_desc, &completed, &status); - opal_progress(); - BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank)); - } -#endif - - goto Release; - - }else{ - input_args->result_in_rbuf = false; - /* Interior node */ - - /* Get parent payload data and control data */ - parent_ctl_pointer = ctl_structs[my_fanout_parent]; - - hdl_desc->des_dst = hdl_seg; - hdl_desc->des_dst_cnt = 1; - hdl_desc->isroot = false; - - hdl->endpoint->remote_ctrl = parent_ctl_pointer; - - ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc); - if (OMPI_SUCCESS != ret) { - goto exit_ERROR; - } - if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank)); - goto exit_ERROR; - } - - /* Signal to children that they may read the data from my shared buffer */ - opal_atomic_wmb (); - hdl_desc->des_src = hdl_seg; - hdl_desc->des_src_cnt = 1; - for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) { - child_ctl_pointer = - ctl_structs[my_fanout_read_tree->children_ranks[ridx]]; - hdl->endpoint->remote_ctrl = child_ctl_pointer; - - ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc); - if (ret != OMPI_SUCCESS) { - BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank)); - goto exit_ERROR; - } - } - goto Release; - } - - Release: - /* if I am the last instance of a basesmuma function in this collectie, - * release the resrouces */ - if (IS_LAST_BCOL_FUNC(c_input_args)) { - rc = bcol_basesmuma_free_buff( - &(bcol_module->colls_with_user_data), - sequence_number); - } - - my_ctl_pointer->starting_flag_value += 1; - - return BCOL_FN_COMPLETE; - exit_ERROR: - return OMPI_ERROR; -} -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c deleted file mode 100644 index 4bb451f907..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c +++ /dev/null @@ -1,895 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "bcol_basesmuma_utils.h" -#include "bcol_basesmuma.h" - -/* debug - * #include "opal/sys/timer.h" - * - * extern uint64_t timers[7]; - * end debug */ - -/* debug */ -#include -/* end debug */ - -/* includes shared memory optimization */ - -#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src) \ - do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - parent_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \ - parent_data_pointer = data_buffs[src_list[j]].payload; \ - if( IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { \ - src = src_list[j]; \ - matched = 1; \ - break; \ - } \ - } \ - } while(0) - -/* - #define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BCAST_FLAG] >= (my_flag) \ - )? true : false ) -*/ - -/* - #define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag) \ - )? true : false ) -*/ - -#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \ - do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - /* fprintf(stderr,"my_rank %d and %d\n",my_rank,1); */ \ - if(src_list[j] != -1) { \ - parent_ctl_pointer = ctl_structs[src_list[j]]; \ - parent_data_pointer = (void *) data_buffs[src_list[j]].ctl_struct; \ - /*fprintf(stderr,"my_rank %d ready flag %d partner flag %d and %d\n",my_rank,ready_flag,parent_ctl_pointer->flag,2); */ \ - if( IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, flag_index, bcol_id)) { \ - src = src_list[j]; \ - matched = 1; \ - index = j; \ - /* fprintf(stderr,"found it from %d!\n",src);*/ \ - break; \ - } \ - } \ - } \ - } while(0) - -#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \ - do { \ - int relative_rank = (my_group_index >= group_root) ? 
my_group_index - group_root : \ - my_group_index - group_root + group_size; \ - radix_mask = 1; \ - while (radix_mask < group_size) { \ - if (relative_rank % (radix * radix_mask)) { \ - data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \ - if (data_src >= group_size) data_src -= group_size; \ - break; \ - } \ - radix_mask *= radix; \ - } \ - } while (0) - -int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i, matched = 0; - int group_size; - int my_rank; - int leading_dim, - buff_idx, - idx; - int count = input_args->count; - struct ompi_datatype_t* dtype = input_args->dtype; - int64_t sequence_number = input_args->sequence_num; - int radix = - mca_bcol_basesmuma_component.k_nomial_radix; - int radix_mask; - int16_t data_src = -1; - - volatile int8_t ready_flag; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char* parent_data_pointer; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - size_t pack_len = 0; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + - input_args->sbuf_offset); - -#if 0 - fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot")); - - pack_len = mca_bcol_base_get_buff_length(dtype, count); - /* Some hierarchical algorithms have data that is accumulated at each step - * this factor accounts for this - */ - pack_len = pack_len*input_args->hier_factor; - buff_idx = input_args->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* setup resource recycling */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - /* removing dependence on sequence number */ - /* I believe this is resolved now with the signaling flags */ - /* - ready_temp = 1 + (int8_t) flag_offset + (int8_t) bcol_id; - if( ready_temp >= my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) { - ready_flag = ready_temp; - } else { - ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id]; - } - opal_atomic_wmb (); - my_ctl_pointer->sequence_number = sequence_number; - */ - - - /* non-blocking broadcast algorithm */ - - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* - * signal ready flag - */ - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; - - /* root is finished */ - goto Release; - } - - - /* Calculate source of the data */ - K_NOMIAL_DATA_SRC(radix, my_rank, group_size, - input_args->root_route->rank, data_src, radix_mask); - - - parent_ctl_pointer = 
data_buffs[data_src].ctl_struct; - parent_data_pointer = data_buffs[data_src].payload; - - for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { - - if(IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, BCAST_FLAG, bcol_id)) { - matched = 1; - break; - } - } - - /* If not matched, then hop out and put me on progress list */ - if(0 == matched ) { - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - return BCOL_FN_NOT_STARTED; - } - - /* else, we found our root within the group ... */ - BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", data_src)); - - /* copy the data */ - memcpy(data_addr, (void *) parent_data_pointer, pack_len); - /* set the memory barrier to ensure completion */ - opal_atomic_wmb (); - /* signal that I am done */ - my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; - - - Release: - my_ctl_pointer->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; -} - - -/** - * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers. - * This routine assumes that buf (the input buffer) is a single writer - * multi reader (SWMR) shared memory buffer owned by the calling rank - * which is the only rank that can write to this buffers. - * It is also assumed that the buffers are registered and fragmented - * at the ML level and that buf is sufficiently large to hold the data. - * - * - * @param buf - SWMR shared buffer within a sbgp that the - * executing rank can write to. - * @param count - the number of elements in the shared buffer. - * @param dtype - the datatype of a shared buffer element. - * @param root - the index within the sbgp of the root. - * @param module - basesmuma module. - */ -int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i; - int group_size; - int my_rank; - int leading_dim, buff_idx, idx; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int64_t sequence_number=input_args->sequence_num; - int radix = cs->k_nomial_radix; - int radix_mask; - int relative_rank; - int pow_k_group_size; - - volatile int8_t ready_flag; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile void* parent_data_pointer; - - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - size_t pack_len = 0; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + - input_args->sbuf_offset); - -#if 0 - fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - - /* we will work only on packed data - so compute the length*/ - pack_len = mca_bcol_base_get_buff_length(dtype, count); - - buff_idx = input_args->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /* get pow_k_levels and pow_k_group_size */ - pow_k_group_size = bcol_module->pow_k; - - - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to 
current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - /* non-blocking broadcast algorithm */ - - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* - * set the radix_mask */ - radix_mask = pow_k_group_size; - /* send to children */ - opal_atomic_wmb (); - BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, - radix,0, - my_rank,group_size, ready_flag); - /* root is finished */ - goto Release; - } - - /* If I am not the root, then poll on possible "senders'" control structs */ - for( i = 0; i < cs->num_to_probe; i++) { - - if( ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) { - - /* else, we found our root within the group ... */ - parent_data_pointer = data_buffs[my_ctl_pointer->src].payload; - BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,my_ctl_pointer->src)); - /* memcopy the data */ - memcpy(data_addr, (void *) parent_data_pointer, pack_len); - /* compute my relative rank */ - relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? my_rank - - my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src; - - /* compute my radix mask */ - radix_mask = 1; - while(radix_mask < group_size ){ - if( 0 != relative_rank % (radix*radix_mask)) { - /* found it */ - break; - } - radix_mask *= radix; - } - /* go one step back */ - radix_mask /= radix; - - /* send to children */ - opal_atomic_wmb (); - BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, - radix, relative_rank, - my_rank, group_size, ready_flag); - /* bail */ - - goto Release; - } - - } - - - - /* If not matched, then hop out and put me on progress list */ - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - /*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/ - return BCOL_FN_NOT_STARTED; - - - - Release: - - - my_ctl_pointer->starting_flag_value[bcol_id]++; - - return BCOL_FN_COMPLETE; -} - - -/* non-blocking binary scatter allgather anyroot algorithm for large data - * broadcast - */ - - -#if 0 -/* prototype code for shared memory scatter/allgather algorithm. Signaling scheme - * works, should be used as a reference for other types of shared memory scatter/allgather - * algorithms. 
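- * Roughly: the root publishes its fragment parameters, the power-of-two
- * subgroup completes a recursive-doubling allgather, and ranks above the
- * power of two copy the assembled result from a partner at the end.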
- */ -int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - /* local variables */ - int i, j; - int length; - int start; - int my_rank, parent_rank; - int partner; - int src = -1; - int matched = 0; - int group_size; - int first_instance=0; - int leading_dim, buff_idx, idx; - int64_t sequence_number=input_args->sequence_num; - - int64_t ready_flag; - int64_t local_offset; - - int flag_offset; - int pow_2, pow_2_levels; - int index = -1; - - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - mca_bcol_basesmuma_module_t *bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - /* use the old control structs for large messages, - * otherwise we will destroy the shared memory - * optimization - */ - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* binomial fanout */ - mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive double */ - - /* for now, we use the payload buffer for single fragment */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile void *parent_data_pointer; /* binomial scatter */ - volatile void *partner_data_pointer; /* recursive double */ - - uint32_t fragment_size; /* ml buffer size for now */ - - /* we will transfer the entire buffer, - * so start at the base address of the ml buffer - */ - void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr); -#if 0 - fprintf(stderr,"AAA Entering nb-sm large msg broadcast input_args->frag_size %d \n",input_args->frag_size); - fflush(stderr); -#endif - - buff_idx = input_args->src_desc->buffer_index; - - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - - /* get the largest power of two that is smaller than - * or equal to the group size - */ - pow_2_levels = bcol_module->pow_2_levels; - pow_2 = bcol_module->pow_2; - - /* get the fragment size - */ - - /* still just the size of the entire buffer */ - fragment_size = input_args->buffer_size; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - my_rank = bcol_module->super.sbgp_partner_module->my_index; - - - /* grab the control structs */ - ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx; - - /* grab the data buffs */ - data_buffs = (mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - my_ctl_pointer = ctl_structs[my_rank]; - - if(my_ctl_pointer->sequence_number < sequence_number) { - first_instance = 1; - } - - if(first_instance) { - my_ctl_pointer->flag = -1; - my_ctl_pointer->index = 1; - - my_ctl_pointer->starting_flag_value = 0; - - flag_offset = 0; - - } else { - - my_ctl_pointer->index++; - } - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - - my_ctl_pointer->sequence_number = sequence_number; - - /* am I the root */ - if(input_args->root_flag) { - /* if I've already been here, then - * hop down to the allgather - */ - if(ALLGATHER == my_ctl_pointer->status) { - goto Allgather; - } - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* debug print */ - /*fprintf(stderr,"I am the root %d\n",my_rank);*/ - /* - * signal ready flag - */ - /* set the offset into the buffer */ - my_ctl_pointer->offset = 0; - /* how many children do I have */ - 
my_ctl_pointer->n_sends = pow_2_levels; - /* my data length */ - my_ctl_pointer->length = fragment_size; - - /* important that these be set before my children - * see the ready flag raised - */ - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - /* root is finished */ - if( my_rank < pow_2 ) { - /* if I'm in the power of two group, - * then goto the allgather - */ - my_ctl_pointer->status = ALLGATHER; - goto Allgather; - - } else { - - /* if I'm not, then I'm done and release */ - goto Release; - } - - } - - /* what phase am I participating in - */ - switch(my_ctl_pointer->status) { - - case SCATTER: - goto Scatter; - break; - - case ALLGATHER: - goto Allgather; - break; - - case EXTRA_RANK: - goto Extra; - break; - - default: - break; - } - - - Extra: - /* am I part of the non-power-of-2 group */ - if( my_rank >= pow_2 ) { - /* find parent to copy from */ - parent_rank = my_rank&(pow_2-1); - parent_ctl_pointer = ctl_structs[parent_rank]; - /* start at the base */ - parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct; - - /* now, I need to do some arithmetic to - * arrive at the value everyone else does - * when they have completed the algorithm - */ - - /* compute ready flag value to poll on */ - ready_flag = ready_flag + pow_2_levels; - - /* start to poll */ - for( i = 0; i < cs->num_to_probe; i++) { - if(IS_LARGE_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { - /* copy the data and bail */ - memcpy(data_addr, (void *)parent_data_pointer, fragment_size); - goto Release; - } - /* - else { - opal_progress(); - } - */ - } - my_ctl_pointer->status = EXTRA_RANK; - - /* hop out and put me onto a progress queue */ - return BCOL_FN_NOT_STARTED; - } - - Scatter: - - /* on first entry, compute the list of possible sources */ - if( NULL == my_ctl_pointer->src_ptr ) { - my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int)*(pow_2_levels+1)); - - for( i = 0; i < pow_2_levels; i++) { - my_ctl_pointer->src_ptr[i] = my_rank ^ (1<<i); - } - /* take care of the extra (non-power-of-two) rank, if I have one */ - if( my_rank < (group_size - pow_2) ) { - /* extra rank that may also source the data */ - my_ctl_pointer->src_ptr[i] = my_rank + pow_2; - } else { - /* no extra rank to worry about */ - my_ctl_pointer->src_ptr[i] = -1; - } - } - - /* If I am not the root, then poll on possible "senders'" control structs */ - for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { - - /* Shared memory iprobe */ - BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1, - my_rank, matched, src); - } - - /* If not matched, then hop out and put me on progress list */ - if(0 == matched ) { - - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - - my_ctl_pointer->status = SCATTER; - return BCOL_FN_NOT_STARTED; - - } else if ( src >= pow_2 ){ - - /* If matched from an extra rank, then get the whole message from partner */ - memcpy((void *) data_addr, (void *) parent_data_pointer, - parent_ctl_pointer->length); - - /* now I am the pseudo-root in the power-of-two group */ - my_ctl_pointer->offset = 0; - my_ctl_pointer->length = parent_ctl_pointer->length; - my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends; - - /* set the memory barrier */ - opal_atomic_wmb (); - - /* fire the ready flag */ - my_ctl_pointer->flag = ready_flag; - my_ctl_pointer->status = ALLGATHER; - /* go to the allgather */ - goto Allgather; - }
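The source-candidate list built above is the heart of the anyroot scheme: at scatter step i a rank can only have received data from the peer that differs from it in bit i, plus possibly one extra (non-power-of-two) partner in the last slot. A minimal stand-alone sketch of that enumeration, with a hypothetical helper name rather than the bcol API:

    #include <stdio.h>

    /* Enumerate the peers that may source data for `rank` in a group whose
     * largest power-of-two subset has `pow2_levels` levels (size 1 << pow2_levels).
     * Slot pow2_levels holds the optional extra partner, -1 if there is none. */
    void candidate_sources(int rank, int group_size, int pow2_levels, int *src)
    {
        int pow2 = 1 << pow2_levels;
        for (int i = 0; i < pow2_levels; i++) {
            src[i] = rank ^ (1 << i);   /* recursive-doubling partner at level i */
        }
        src[pow2_levels] = (rank < group_size - pow2) ? rank + pow2 : -1;
    }

    int main(void)
    {
        int src[4];
        candidate_sources(2 /* rank */, 11 /* group */, 3 /* levels */, src);
        for (int i = 0; i < 4; i++) printf("%d ", src[i]);  /* prints: 3 0 6 10 */
        return 0;
    }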
/* we need to see whether this is really - * who we are looking for - */ - for( i = 0; i < parent_ctl_pointer->n_sends; i++) { - /* debug print */ - /* - fprintf(stderr,"I am %d checking on a hit from %d with n_sends %d\n",my_rank,src,parent_ctl_pointer->n_sends); - fflush(stderr); - */ - /* end debug */ - if( my_rank == (src^(1<<i)) ) { - /* found the sender; record my send count for this subtree */ - my_ctl_pointer->n_sends = i; - - if ( i > 0) { - /* compute the size of the chunk to copy */ - length = (parent_ctl_pointer->length)/ - (1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = - parent_ctl_pointer->offset+length; - - /*fprintf(stderr,"%d's offset %d and length %d \n",my_rank,my_ctl_pointer->offset,length);*/ - - /* now we can copy the data */ - memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), - (void *) ((uint64_t) parent_data_pointer+(uint64_t) parent_ctl_pointer->offset + - (uint64_t) length), - (size_t)length); - } else { - /* this "trick" takes care of the first level - * of recursive doubling - */ - length = parent_ctl_pointer->length/ - (1<<(parent_ctl_pointer->n_sends - 1)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = parent_ctl_pointer->offset; - - /*fprintf(stderr,"%d's offset %d and length %d\n",my_rank,my_ctl_pointer->offset,length);*/ - /* now we can copy the data */ - memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), - (void *) ((uint64_t) parent_data_pointer+(uint64_t) my_ctl_pointer->offset), - (size_t)length); - } - /* set the memory barrier to ensure completion */ - opal_atomic_wmb (); - /* signal that I am done */ - my_ctl_pointer->flag = ready_flag; - /* set my status */ - my_ctl_pointer->status = ALLGATHER; - /* time for allgather phase */ - goto Allgather; - } - - } - - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - my_ctl_pointer->src_ptr[index] = -1; - /* probably we should jump out and put onto progress list */ - my_ctl_pointer->status = SCATTER; - return BCOL_FN_NOT_STARTED; - - Allgather: - - /* zip it back up - we have already taken care of first level */ - /* needed for non-blocking conditional */ - matched = 0; - - /* get my local_offset */ - local_offset = my_ctl_pointer->offset; - - /* bump the ready flag */ - ready_flag++; - - /* first level of zip up */ - length = 2*fragment_size/pow_2; - - /* first level of zip-up - * already includes first level of - * recursive doubling - */ - start = 1; - - /* for non-blocking, check to see if I need to reset the state */ - if(my_ctl_pointer->flag >= ready_flag) { - /* then reset the state */ - ready_flag = my_ctl_pointer->flag; - start = my_ctl_pointer->start; - /* get the local offset */ - local_offset = my_ctl_pointer->offset_zip; - /* compute the correct length */ - length = length*(1<<(start - 1));
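The save/restore block above is what makes the routine re-entrant: everything the allgather loop needs (the last flag fired, the next level, the zip offset) lives in the control structure, so a rank that bails out with BCOL_FN_STARTED can resume exactly where it stopped. A reduced sketch of the pattern, using a hypothetical state struct rather than the bcol types:

    #include <stdbool.h>

    /* Hypothetical per-buffer progress state, mirroring flag/start above. */
    struct nb_state {
        long flag;   /* last signal value fired       */
        int  start;  /* next recursive-doubling level */
    };

    /* Returns true when all `levels` are complete; false means "re-queue me". */
    bool zip_progress(struct nb_state *st, long ready_flag, int levels,
                      bool (*level_done)(int level))
    {
        int i = (st->flag >= ready_flag) ? st->start : 0;  /* restore on re-entry */
        for (; i < levels; i++) {
            if (!level_done(i)) {
                st->start = i;                   /* save progress and hop out */
                st->flag  = ready_flag + i;      /* levels finished so far    */
                return false;                    /* caller re-queues us       */
            }
        }
        return true;
    }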
/* careful! skip over the opal_atomic_wmb () to avoid the - * cost on every re-entry - */ - goto Loop; - } - - - opal_atomic_wmb (); - /* I am ready, set the flag */ - my_ctl_pointer->flag = ready_flag; - - Loop: - - for( i = start; i < pow_2_levels; i++) { - /* get my partner for this level */ - partner = my_rank^(1<<i); - partner_ctl_pointer = ctl_structs[partner]; - /* partner's data, starting at the base of its buffer */ - partner_data_pointer = (void *) data_buffs[partner].ctl_struct; - - /* probe for the partner's data */ - for( j = 0; j < cs->num_to_probe && matched == 0; j++) { - if(IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { - - /* debug prints - fprintf(stderr,"666 I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d partner_offset %d\n", - my_rank,sequence_number,partner, ready_flag,partner_ctl_pointer->flag,buff_idx,partner_ctl_pointer->offset); - */ - /* debug print */ -#if 0 - fprintf(stderr,"I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d \n", - my_rank,sequence_number,partner, ready_flag,parent_ctl_pointer->flag,buff_idx); -#endif - /* end debug prints */ - - assert(partner_ctl_pointer->flag >= ready_flag); - /* found it */ - matched = 1; - /* only copy it, if you sit at a lower level in the tree */ - if( my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends ) { - - /* calculate the local offset based on partner's remote offset */ - if( partner_ctl_pointer->offset < my_ctl_pointer->offset ) { - /* then I'm looking "up" the tree */ - local_offset -= length; - /* debug print */ - /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ - /* end debug */ - memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), - (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), - length); - } else { - /* I'm looking "down" the tree */ - local_offset += length; - /* debug print */ - /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ - /* end debug */ - memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), - (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), - length); - /* reset my local offset */ - local_offset -= length; - } - - } - /* bump the ready flag */ - ready_flag++; - /* ensure completion */ - opal_atomic_wmb (); - - /* fire the flag for the next level */ - my_ctl_pointer->flag = ready_flag; - - /* double the length */ - length *= 2; - } - } - /* check to see what kind of progress I've made */ - if( 0 == matched ) { - /* save state, hop out and try again later */ - my_ctl_pointer->start = i; - /* save the local offset */ - my_ctl_pointer->offset_zip = local_offset; - /* put in progress queue */ - return BCOL_FN_STARTED; - } - /* else, start next level of recursive doubling */ - matched = 0; - - } - - - /* cleanup */ - if(NULL != my_ctl_pointer->src_ptr) { - free(my_ctl_pointer->src_ptr); - my_ctl_pointer->src_ptr = NULL; - } - - Release: - - - /* If I am the last instance, release the resource */ - /* - if( IS_LAST_BCOL_FUNC(c_input_args)) { - rc = bcol_basesmuma_free_buff( - &(bcol_module->colls_with_user_data), - sequence_number); - } - */ - - my_ctl_pointer->starting_flag_value++; - my_ctl_pointer->status = FINISHED; - return BCOL_FN_COMPLETE; - -} -#endif - -#if 0 -int mca_bcol_basesmuma_bcast_binomial_scatter_allgather(void *desc) -{ - /* local variables */ - int rc, n_frags_sent; - uint32_t stripe_number; - int count, count_processed; - size_t dt_size; - uint32_t n_data_segments_to_schedule; - ompi_datatype_t *dtype; - message_descriptor_t *message_descriptor; - mca_bcol_basesmuma_module_t *bcol_module; - int pipe_depth; - -
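The skeleton that follows only outlines the pipelined variant, so the fragment accounting never made it in. The arithmetic the outline alludes to is a ceiling division of the packed message into ML-buffer-sized stripes; a hedged sketch, with an illustrative helper name:

    #include <stddef.h>

    /* Number of fragments needed to move `count` elements of size `dt_size`
     * through buffers of `frag_size` bytes (classic ceiling division). */
    size_t n_fragments(size_t count, size_t dt_size, size_t frag_size)
    {
        size_t bytes = count * dt_size;
        return (bytes + frag_size - 1) / frag_size;
    }
    /* e.g. 1000 doubles through 4096-byte fragments: (8000 + 4095) / 4096 = 2 */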
/* get the full message descriptor */ - - - /* compute the number of fragments to send */ - - - /* start to fill the pipeline */ - - - return OMPI_SUCCESS; - - - - -} -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c deleted file mode 100644 index 86a2811b00..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c +++ /dev/null @@ -1,486 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/patterns/comm/coll_ops.h" - -#include "opal/dss/dss.h" - -#include "bcol_basesmuma.h" -/* - * With support for nonblocking collectives, we don't have an upper - * limit on the number of outstanding collectives per communicator. - * Also, since we want to avoid communication to figure out which - * buffers other ranks in the group will use, we will rely on the - * fact that collective operations are called in the same order - * in each process, to assign a unique ID to each collective operation. - * We use this to create a static mapping from the index to the buffer - * that will be used. Also, because there is no limit to the number of - * outstanding collective operations, we use a generation index for each - * memory bank, so the collective will use the buffer only when the - * correct generation of the bank is ready for use. - */ -int bcol_basesmuma_get_buff_index( sm_buffer_mgmt *buff_block, - uint64_t buff_id ) -{ - /* local variables */ - int memory_bank; - uint64_t generation; - int index = -1; - - - /* get the bank index that will be used */ - memory_bank = buff_id & buff_block->mask; - memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank; - - /* get the generation of the bank this maps to */ - generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs); - 
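Concretely, with 16 buffers per bank (log2_num_buffs_per_mem_bank = 4) and 64 buffers total (log2_number_of_buffs = 6, i.e. 4 banks), a collective's sequence number maps to a bank and a generation exactly as the routine computes with its SHIFT_DOWN macro; plain C stand-ins and illustrative geometry below:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t mask = 63;      /* 2^log2_number_of_buffs - 1          */
        const int log2_per_bank = 4;   /* 16 buffers per bank                 */
        for (uint64_t buff_id = 60; buff_id < 68; buff_id++) {
            int      bank       = (int)((buff_id & mask) >> log2_per_bank);
            uint64_t generation = buff_id >> 6;
            printf("id %2llu -> bank %d gen %llu\n",
                   (unsigned long long) buff_id, bank,
                   (unsigned long long) generation);
        }
        /* ids 60..63 land in bank 3 of generation 0; 64..67 wrap to bank 0
         * of generation 1, which is usable only once the bank is recycled. */
        return 0;
    }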
/* check to see if the bank is available */ - if( generation == buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter ) { - - /* get the buffer index that will be returned */ - index = buff_id & buff_block->mask; - - /* no in-use counter increment, as the mapping is static, and - * all we need to know is the number of collectives that have completed */ - - } else { - /* progress communications so that resources can be freed up */ - opal_progress(); - } - - /* return */ - return index; -} - -/* release the shared memory buffers - * buf_id is the unique ID assigned to the particular buffer - */ -int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block, - uint64_t buff_id ) -{ - /* local variables */ - int ret = OMPI_SUCCESS; - int memory_bank; - uint64_t generation; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - - /* get the bank index that will be used */ - memory_bank = buff_id & buff_block->mask; - memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank; - - /* get the generation of the bank this maps to */ - generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs); - - /* the generation counter should not change until all resources - * associated with this bank have been freed. - */ - assert(generation == buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter); - (void)generation; // silence compiler warning - - /* - * increment counter of completed buffers - */ - OPAL_THREAD_ADD32(&(buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed), - 1); - - /* - * If I am the last to check in - initiate resource recycling - */ - if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed == - buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) { - - /* Lock to ensure atomic recycling of resources */ - OPAL_THREAD_LOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); - - /* make sure someone else did not already get to this */ - if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed != - buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) { - /* release lock and exit */ - OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); - } else { - sm_nbbar_desc_t *p_sm_nb_desc = NULL; - /* initiate the freeing of resources. Need to make sure the other - * ranks in the group are also done with their resources before this - * block is made available for use again. - * No one else will try to allocate from this block or free back to - * this block until the next generation counter has been incremented, - * so we just reset the number of freed buffers to 0, so no one else - * will try to also initialize the recycling of these resources - */ - buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed = 0; - - /* Start the nonblocking barrier */ - p_sm_nb_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); - p_sm_nb_desc->coll_buff = buff_block; - bcol_basesmuma_rd_nb_barrier_init_admin(p_sm_nb_desc); - - if( NB_BARRIER_DONE != - buff_block->ctl_buffs_mgmt[memory_bank]. 
- nb_barrier_desc.collective_phase) { - - opal_list_t *list=&(cs->nb_admin_barriers); - opal_list_item_t *append_item; - - /* put this onto the progression list */ - OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex)); - append_item=(opal_list_item_t *) - &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); - opal_list_append(list,append_item); - OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex)); - /* progress communications so that resources can be freed up */ - opal_progress(); - } else { - /* mark the block as available */ - (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++; - } - - /* get out of here */ - OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); - } - - } - - /* return */ - return ret; -} - -/* - * Allocate buffers for storing non-blocking collective descriptions, required - * for making code re-entrant - * - */ -static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, - void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, - uint32_t size_buffer, - uint32_t header_size, - int group_size, - int pow_k) -{ - uint32_t i, j, ci; - mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL; - int k_nomial_radix = mca_bcol_basesmuma_component.k_nomial_radix; - int pow_k_val = (0 == pow_k) ? 1 : pow_k; - int num_to_alloc = (k_nomial_radix - 1) * pow_k_val * 2 + 1 ; - - - *desc = (mca_bcol_basesmuma_nb_coll_buff_desc_t *)calloc(num_banks * num_buffers_per_bank, sizeof(mca_bcol_basesmuma_nb_coll_buff_desc_t)); - if (NULL == *desc) { - return OMPI_ERROR; - } - - tmp_desc = *desc; - - for (i = 0; i < num_banks; i++) { - for (j = 0; j < num_buffers_per_bank; j++) { - ci = i * num_buffers_per_bank + j; - tmp_desc[ci].bank_index = i; - tmp_desc[ci].buffer_index = j; - /* *2 is for gather session +1 for extra peer */ - tmp_desc[ci].requests = (ompi_request_t **) - calloc(num_to_alloc, sizeof(ompi_request_t *)); - tmp_desc[ci].data_addr = (void *) - ((unsigned char*)base_addr + ci * size_buffer + header_size); - BASESMUMA_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr)); - } - } - - return OMPI_SUCCESS; -} - - -/* - * Free buffers for storing non-blocking collective descriptions. 
- * - */ -void cleanup_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, - uint32_t num_banks, - uint32_t num_buffers_per_bank) -{ - uint32_t ci; - if (NULL != *desc) { - for (ci = 0; ci < num_banks * num_buffers_per_bank; ci++) { - if (NULL != (*desc)[ci].requests) { - free((*desc)[ci].requests); - } - } - free(*desc); - *desc = NULL; - } -} - -#if 0 -/* initialize the pre-registered payload buffers - the control structure - * is cached at the head of each buffer, data_offset bytes in - */ -int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block, - uint32_t data_offset, - mca_bcol_base_module_t *bcol_module, - void *reg_data) -{ - /* local variables */ - int ret = OMPI_SUCCESS, i, j, my_idx, array_id; - int leading_dim, loop_limit, buf_id; - size_t malloc_size; - unsigned char *base_ptr; - void **results_array = NULL, *mem_offset; - mca_bcol_basesmuma_header_t *ctl_ptr; - bcol_basesmuma_smcm_file_t input_file; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - mca_bcol_basesmuma_module_t *sm_bcol_module = (mca_bcol_basesmuma_module_t *) bcol_module; - mca_bcol_basesmuma_module_t *sm_bcol = sm_bcol_module; - struct mca_bcol_base_memory_block_desc_t *ml_block = payload_block; - bcol_basesmuma_registration_data_t *sm_reg_data = (bcol_basesmuma_registration_data_t *) reg_data; - sm_buffer_mgmt *pload_mgmt; - mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem; - - /* first, we get a pointer to the payload buffer management struct */ - pload_mgmt = &(sm_bcol->colls_with_user_data); - - /* go ahead and get the header size that is cached on the payload block - */ - sm_bcol->total_header_size = data_offset; - - /* allocate memory for pointers to mine and my peers' payload buffers - * difference here is that now we use our new data struct - */ - malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank* - pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t); - pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size); - if( !pload_mgmt->data_buffs) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* allocate some memory to hold the offsets */ - results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *)); - if (NULL == results_array) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* setup the input file for the shared memory connection manager */ - input_file.file_name = sm_reg_data->file_name; - input_file.size = sm_reg_data->size; - input_file.size_ctl_structure = 0; - input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE; - input_file.mpool_size = sm_reg_data->size; - - /* call the connection manager and map my shared memory peers' file - */ - ret = bcol_basesmuma_smcm_allgather_connection( - sm_bcol, - sm_bcol->super.sbgp_partner_module, - &(cs->sm_connections_list), - &(sm_bcol->payload_backing_files_info), - sm_bcol->super.sbgp_partner_module->group_comm, - input_file, cs->payload_base_fname, - false); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } - - - /* now we exchange offset info - don't assume symmetric virtual memory - */ - - mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr - - (uintptr_t) cs->sm_payload_structs->data_addr); - - /* call into the exchange offsets function */ - ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE, - sm_bcol_module->super.sbgp_partner_module->my_index, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->super.sbgp_partner_module->group_list, - sm_bcol_module->super.sbgp_partner_module->group_comm); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } - - /* convert memory offset to virtual address in current rank */ - leading_dim = pload_mgmt->size_of_group; - loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank; - for (i=0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) { - - /* get the base pointer */ - int array_id=SM_ARRAY_INDEX(leading_dim,0,i); - if( i == sm_bcol_module->super.sbgp_partner_module->my_index) { - /* me */ - base_ptr=cs->sm_payload_structs->map_addr; - } else { - base_ptr=sm_bcol_module->payload_backing_files_info[i]-> - sm_mmap->map_addr; - } - - /* first, set the pointer to the control struct */ - pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *) - (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr); - /* second, calculate where to set the data pointer */ - pload_mgmt->data_buffs[array_id].payload=(void *) - (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct + - (uint64_t)(uintptr_t) data_offset); - - for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) { - int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i); - 
array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i); - /* now, play the same game as above - * - * first, set the control struct's position */ - pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *) - (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) + - (uint64_t)(uintptr_t)ml_block->size_buffer)); - - /* second, set the payload pointer */ - pload_mgmt->data_buffs[array_id].payload =(void *) - (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct + - (uint64_t)(uintptr_t) data_offset); - } - - } - - /* done with the index array */ - free (results_array); - results_array = NULL; - - /* initialize my control structures!! */ - my_idx = sm_bcol_module->super.sbgp_partner_module->my_index; - leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size; - for( buf_id = 0; buf_id < loop_limit; buf_id++){ - array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx); - ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct; - - /* initialize the data structures */ - for( j = 0; j < SM_BCOLS_MAX; j++){ - for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ - ctl_ptr->flags[i][j] = -1; - } - } - ctl_ptr->sequence_number = -1; - ctl_ptr->src = -1; - } - - - - - /* setup the data structures needed for releasing the payload - * buffers back to the ml level - */ - for( i=0 ; i < (int) ml_block->num_banks ; i++ ) { - sm_bcol->colls_with_user_data. - ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor= - ml_block; - } - - ml_mem->num_banks = ml_block->num_banks; - ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t)); - ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank; - ml_mem->size_buffer = ml_block->size_buffer; - /* pointer to ml level descriptor */ - ml_mem->ml_mem_desc = ml_block; - - if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc, - ml_block->block->base_addr, - ml_mem->num_banks, - ml_mem->num_buffers_per_bank, - ml_mem->size_buffer, - data_offset, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->pow_k)) { - - BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n")); - return OMPI_ERROR; - } - - return OMPI_SUCCESS; - -exit_ERROR: - if (NULL != results_array) { - free(results_array); - } - return ret; -} - -#endif - - - -/* Basesmuma interface function used for buffer release */ -#if 0 -/* gvm - * A collective operation calls this routine to release the payload buffer. - * All processes in the shared memory sub-group of a bcol should call the non-blocking - * barrier on the last payload buffer of a memory bank. On the completion - * of the non-blocking barrier, the ML callback is called which is responsible - * for recycling the memory bank. - */ -mca_bcol_basesmuma_module_t *sm_bcol_module -int bcol_basesmuma_free_payload_buff( - struct mca_bcol_base_memory_block_desc_t *block, - sm_buffer_mgmt *ctl_mgmt, - uint64_t buff_id) -{ - /* local variables */ - int ret = OMPI_SUCCESS; - - memory_bank = BANK_FROM_BUFFER_IDX(buff_id); - ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed++; - - OPAL_THREAD_ADD32(&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed),1); - - if (ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed == block->size_buffers_bank){ - - /* start non-blocking barrier */ - bcol_basesmuma_rd_nb_barrier_init_admin( - &(ctl_mgmt->ctl_buffs_mgmt[memory_bank].nb_barrier_desc)); - - if (NB_BARRIER_DONE != - ctl_mgmt->ctl_buffs_mgmt[memory_bank]. 
- nb_barrier_desc.collective_phase){ - - /* progress the barrier */ - opal_progress(); - } - else{ - /* free the buffer - i.e. initiate callback to ml level */ - block->ml_release_cb(block,memory_bank); - } - } - return ret; -} -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_component.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_component.c deleted file mode 100644 index 787188522a..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_component.c +++ /dev/null @@ -1,380 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "opal/mca/mpool/base/base.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "opal/align.h" -#include "bcol_basesmuma.h" - -/* - * Public string showing the coll ompi_sm V2 component version number - */ -const char *mca_bcol_basesmuma_component_version_string = - "Open MPI bcol - basesmuma collective MCA component version " OMPI_VERSION; - -/* - * Local functions - */ - -static int basesmuma_register(void); -static int basesmuma_open(void); -static int basesmuma_close(void); -static int mca_bcol_basesmuma_deregister_ctl_sm( - mca_bcol_basesmuma_component_t *bcol_component); - - -static inline int mca_bcol_basesmuma_param_register_int( - const char* param_name, int default_value, int *storage) -{ - *storage = default_value; - return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name, - NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); -} - -static inline int mca_bcol_basesmuma_param_register_bool( - const char* param_name, bool default_value, bool *storage) -{ - *storage = default_value; - return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name, - NULL, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); -} - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component = { - - /* First, fill in the super */ - - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .bcol_version = { - MCA_BCOL_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "basesmuma", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - - .mca_open_component = basesmuma_open, - .mca_close_component = basesmuma_close, - .mca_register_component_params = basesmuma_register, - }, - - /* Initialization / querying functions */ - - .collm_init_query = mca_bcol_basesmuma_init_query, - .collm_comm_query = mca_bcol_basesmuma_comm_query, - .init_done = false, - .need_ordering = false, - .priority = 0, /* (default) priority */ - }, -}; - -/* - * Register the component - */ -static int basesmuma_register(void) -{ - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - - /* set component priority */ - 
mca_bcol_basesmuma_param_register_int("priority", 90, &cs->super.priority); - - /* Number of memory banks */ - mca_bcol_basesmuma_param_register_int("basesmuma_num_ctl_banks", 2, - &cs->basesmuma_num_mem_banks); - - /* Number of regions per memory bank */ - mca_bcol_basesmuma_param_register_int("basesmuma_num_buffs_per_bank", 16, - &cs->basesmuma_num_regions_per_bank); - - /* number of polling loops to allow pending resources to - * complete their work - */ - mca_bcol_basesmuma_param_register_int("n_poll_loops", 4, &cs->n_poll_loops); - - - /* Number of groups supported */ - mca_bcol_basesmuma_param_register_int("n_groups_supported", 100, - &cs->n_groups_supported); - - /* order of fanin tree */ - mca_bcol_basesmuma_param_register_int("radix_fanin", 2, &cs->radix_fanin); - - /* order of fanout tree */ - mca_bcol_basesmuma_param_register_int("radix_fanout", 2, &cs->radix_fanout); - - /* order of read tree */ - mca_bcol_basesmuma_param_register_int("radix_read_tree", 3, - &cs->radix_read_tree); - - /* order of reduction fanout tree */ - mca_bcol_basesmuma_param_register_int("order_reduction_tree", 2, - &cs->order_reduction_tree); - - /* k-nomial radix */ - mca_bcol_basesmuma_param_register_int("k_nomial_radix", 3, &cs->k_nomial_radix); - - /* number of polling loops for non-blocking algorithms */ - mca_bcol_basesmuma_param_register_int("num_to_probe", 10, &cs->num_to_probe); - - /* radix of the k-ary scatter tree */ - mca_bcol_basesmuma_param_register_int("scatter_kary_radix", 4, - &cs->scatter_kary_radix); - - /* register parameters controlling message fragmentation */ - mca_bcol_basesmuma_param_register_int("min_frag_size", getpagesize(), - &cs->super.min_frag_size); - mca_bcol_basesmuma_param_register_int("max_frag_size", FRAG_SIZE_NO_LIMIT, - &cs->super.max_frag_size); - - /* by default use pre-registered shared memory segments */ - /* RLG NOTE: When we have a systematic way to handle single memory - * copy semantics, we need to update this logic - */ - mca_bcol_basesmuma_param_register_bool("can_use_user_buffers", false, - &cs->super.can_use_user_buffers); - - mca_bcol_basesmuma_param_register_int("verbose", 0, &cs->verbose); - - return OMPI_SUCCESS; -} - -/* - * Open the component - */ -static int basesmuma_open(void) -{ - - /* local variables */ - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int ret = OMPI_SUCCESS; - opal_mutex_t *mutex_ptr; - int dummy; - - /* - * Make sure that the number of banks is a power of 2 - */ - cs->basesmuma_num_mem_banks= - ompi_roundup_to_power_radix(2,cs->basesmuma_num_mem_banks, &dummy); - if ( 0 == cs->basesmuma_num_mem_banks ) { - ret=OMPI_ERROR; - goto exit_ERROR; - } - - /* - * Make sure that the number of buffers is a power of 2 - */ - cs->basesmuma_num_regions_per_bank= - ompi_roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank, &dummy); - if ( 0 == cs->basesmuma_num_regions_per_bank ) { - ret=OMPI_ERROR; - goto exit_ERROR; - } - - /* Portals initialization */ - cs->portals_init = false; - cs->portals_info = NULL; - - /* - * initialization - */ - cs->sm_ctl_structs=NULL; - OBJ_CONSTRUCT(&(cs->sm_connections_list),opal_list_t); - OBJ_CONSTRUCT(&(cs->nb_admin_barriers),opal_list_t); - mutex_ptr= &(cs->nb_admin_barriers_mutex); - OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t); - - /* Control structures object construct - */ - OBJ_CONSTRUCT(&(cs->ctl_structures), opal_list_t); - - /* shared memory has not been registered yet */ - cs->mpool_inited = false; - - /* initialize base file names */ - 
cs->clt_base_fname="sm_ctl_mem_"; - cs->payload_base_fname="sm_payload_mem_"; - - /* initialize the size of the shared memory scratch region */ - cs->my_scratch_shared_memory_size=getpagesize(); - cs->my_scratch_shared_memory=NULL; - cs->scratch_offset_from_base_ctl_file=0; - - /* - * register the progress function - */ - ret=opal_progress_register(bcol_basesmuma_progress); - if (MPI_SUCCESS != ret) { - opal_output(ompi_bcol_base_framework.framework_output, "failed to register the progress function"); - } - - return ret; - - exit_ERROR: - return ret; -} - -/* - * release the control structure backing file - */ -static int mca_bcol_basesmuma_deregister_ctl_sm(mca_bcol_basesmuma_component_t *bcol_component) -{ - if (NULL != bcol_component->sm_ctl_structs) { - OBJ_RELEASE(bcol_component->sm_ctl_structs); - } - - return OMPI_SUCCESS; -} - - -/* - * Close the component - */ -static int basesmuma_close(void) -{ - int ret; - bcol_basesmuma_registration_data_t *net_ctx; - bcol_base_network_context_t *net_reg; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - - /* gvm Leak FIX */ - OPAL_LIST_DESTRUCT (&cs->ctl_structures); - - /* deregister the progress function */ - ret=opal_progress_unregister(bcol_basesmuma_progress); - if (MPI_SUCCESS != ret) { - opal_output(ompi_bcol_base_framework.framework_output, "failed to unregister the progress function"); - } - - /* remove the control structure backing file */ - ret=mca_bcol_basesmuma_deregister_ctl_sm(&mca_bcol_basesmuma_component); - if (MPI_SUCCESS != ret) { - opal_output(ompi_bcol_base_framework.framework_output, "failed to remove control structure backing file"); - } - - /* remove the network contexts - only one network context defined for - * this component. - */ - /* file_name returned by asprintf, so need to free the resource */ - if(mca_bcol_basesmuma_component.super.network_contexts ) { - net_reg=(bcol_base_network_context_t *) - mca_bcol_basesmuma_component.super.network_contexts[0]; - if(net_reg) { - net_ctx=(bcol_basesmuma_registration_data_t *)net_reg->context_data; - if( net_ctx) { - if(net_ctx->file_name) { - free(net_ctx->file_name); - } - free(net_ctx); - } - free(net_reg); - } - free(mca_bcol_basesmuma_component.super.network_contexts); - mca_bcol_basesmuma_component.super.network_contexts=NULL; - } - - /* normal return */ - return OMPI_SUCCESS; -} - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_bcol_basesmuma_init_query(bool enable_progress_threads, - bool enable_mpi_threads) -{ - /* done */ - return OMPI_SUCCESS; -} - 
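For the allocation routine that follows, the segment size falls out mechanically from the component defaults registered above (2 banks, 16 regions per bank, 100 groups). A sketch of the same computation with stand-in numbers; the 128-byte ctl struct size and 4 KiB page are assumed purely for illustration:

    #include <stddef.h>
    #include <stdio.h>

    int main(void)
    {
        size_t n_banks = 2, n_regions = 16, n_groups = 100;
        size_t ctl_struct_size = 128;            /* illustrative only      */
        size_t page = 4096, scratch = page;      /* one page of scratch    */

        /* (banks*regions + banks) ctl structs per group, doubled because
         * each group gets two flavors of banks (with and without user
         * payload), plus the library-internal scratch region */
        size_t len = (n_banks * n_regions + n_banks) * ctl_struct_size * n_groups;
        len *= 2;
        len += scratch;
        len = (len + page - 1) & ~(page - 1);    /* round up to page size  */

        printf("ctl segment: %zu bytes\n", len); /* prints: 876544         */
        return 0;
    }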
/* This routine is used to allocate shared memory for the shared - * memory control regions. - */ -int mca_bcol_basesmuma_allocate_sm_ctl_memory(mca_bcol_basesmuma_component_t *cs) -{ - /* local variables */ - int name_length, ret = OMPI_SUCCESS; - size_t ctl_length; - char *name; - size_t page_size = getpagesize (); - - /* set the file name */ - name_length=asprintf(&name, - "%s"OPAL_PATH_SEP"%s""%0d", - ompi_process_info.job_session_dir, - cs->clt_base_fname, - (int)getpid()); - if( 0 > name_length ) { - return OMPI_ERROR; - } - /* make sure name is not too long */ - if ( OPAL_PATH_MAX < (name_length-1) ) { - free (name); - return OMPI_ERROR; - } - - /* compute segment length */ - - ctl_length=(cs->basesmuma_num_mem_banks* - cs->basesmuma_num_regions_per_bank+cs->basesmuma_num_mem_banks) - *sizeof(mca_bcol_basesmuma_ctl_struct_t)*cs->n_groups_supported; - /* need two banks of memory per group - for algorithms that have - * user payload, and those that don't - */ - ctl_length*=2; - - /* add space for internal library management purposes */ - ctl_length+=cs->my_scratch_shared_memory_size; - - /* round up to multiple of page size */ - ctl_length = OPAL_ALIGN(ctl_length, page_size, size_t); - - /* allocate the shared file */ - cs->sm_ctl_structs=bcol_basesmuma_smcm_mem_reg (NULL, ctl_length, getpagesize(), name); - if( !cs->sm_ctl_structs) { - opal_output (ompi_bcol_base_framework.framework_output, - "In mca_bcol_basesmuma_allocate_sm_ctl_memory failed to allocate the backing file %s\n", name); - ret = OMPI_ERR_OUT_OF_RESOURCE; - } - - /* free the memory allocated by asprintf for the file name - - * in mca_base_smcm_mem_reg this name is copied into a new - * memory location */ - free (name); - - /* successful return */ - return ret; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanin.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanin.c deleted file mode 100644 index 670b9af94c..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanin.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* Recursive doubling blocking barrier */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/patterns/net/netpatterns.h" - -#include "opal/sys/atomic.h" - -#include "ompi/mca/bcol/base/base.h" -#include "bcol_basesmuma.h" - -/********************************************************************************/ -/********************************** New Fan-In **********************************/ -/********************************************************************************/ - -static int bcol_basesmuma_fanin_new(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int64_t sequence_number; - - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - int i, child_rank, idx, n_children, probe, - my_rank = bcol_module->super.sbgp_partner_module->my_index, - leading_dim = bcol_module->colls_no_user_data.size_of_group; - int8_t ready_flag; - int8_t bcol_id = (int8_t) bcol_module->super.bcol_id; - int buff_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests); - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - int matched = 0; - - - volatile mca_bcol_basesmuma_payload_t *ctl_structs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl; - volatile mca_bcol_basesmuma_header_t *child_ctl; - - - netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node); - - /* Figure out - what instance of the basesmuma bcol I am */ - sequence_number = input_args->sequence_num; - - idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0); - ctl_structs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - my_ctl = ctl_structs[my_rank].ctl_struct; - - /* Init the header */ - BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id); - - /* Cache num of children value in a local variable */ - n_children = my_tree_node->n_children; - - /* initialize the active requests */ - *active_requests = 0; - /* create a bit map for children */ - for( i = 0; i < n_children; i++){ - *active_requests ^= (1<<i); - } - - /* Wait until my children arrive */ - for (i = 0; i < n_children; ++i) { - matched = 0; - child_rank = my_tree_node->children_ranks[i]; - child_ctl = ctl_structs[child_rank].ctl_struct; - /* I'm sacrificing cache for concurrency */ - for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ - if(IS_PEER_READY(child_ctl, ready_flag, sequence_number, BARRIER_FANIN_FLAG, bcol_id)) { - matched = 1; - /* flip the bit */ - *active_requests ^= (1<<i); - } - } - } - - if (0 == *active_requests) { - if (ROOT_NODE != my_tree_node->my_node_type){ - /* I have no more active requests, - signal my parent */ - my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag; - } - } else { - return BCOL_FN_STARTED; - } - - my_ctl->starting_flag_value[bcol_id]++; - - return BCOL_FN_COMPLETE; -} - 
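The XOR bookkeeping above is just a bitmap: bit i is raised for every child at setup and cleared as child i checks in, so completion reduces to a compare against zero. A stand-alone sketch of the same idiom, outside the bcol machinery:

    #include <stdio.h>

    int main(void)
    {
        int n_children = 3, active = 0;

        /* raise one bit per child: 0b111 */
        for (int i = 0; i < n_children; i++) active ^= 1 << i;

        /* children 2 and 0 arrive, in any order */
        active ^= 1 << 2;
        active ^= 1 << 0;
        printf("waiting on bitmap 0x%x\n", active);  /* 0x2: child 1 outstanding */

        active ^= 1 << 1;
        printf("done: %s\n", 0 == active ? "yes" : "no");
        return 0;
    }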
static int bcol_basesmuma_fanin_new_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int64_t sequence_number; - - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - int i, child_rank, flag_offset, idx, n_children, probe, - my_rank = bcol_module->super.sbgp_partner_module->my_index, - leading_dim = bcol_module->colls_no_user_data.size_of_group; - int8_t ready_flag; - int8_t bcol_id = (int8_t) bcol_module->super.bcol_id; - int buff_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests); - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - int matched = 0; - - - volatile mca_bcol_basesmuma_payload_t *ctl_structs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl; - volatile mca_bcol_basesmuma_header_t *child_ctl; - - - netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node); - - sequence_number = input_args->sequence_num; - - idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0); - ctl_structs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - my_ctl = ctl_structs[my_rank].ctl_struct; - - - flag_offset = my_ctl->starting_flag_value[bcol_id]; - ready_flag = flag_offset + 1; - my_ctl->sequence_number = sequence_number; - - /* Cache num of children value in a local variable */ - n_children = my_tree_node->n_children; - - - /* Wait until my children arrive */ - for (i = 0; i < n_children; ++i) { - matched = 0; - /* Get child ctl struct */ - if ( 1 == ((*active_requests >> i)&1) ) { - child_rank = my_tree_node->children_ranks[i]; - child_ctl = ctl_structs[child_rank].ctl_struct; - /* I'm sacrificing cache for concurrency */ - for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ - if(IS_PEER_READY(child_ctl, ready_flag, sequence_number, BARRIER_FANIN_FLAG,bcol_id)) { - matched = 1; - /* flip the bit */ - *active_requests ^= (1<<i); - } - } - } - } - - if (0 == *active_requests) { - if (ROOT_NODE != my_tree_node->my_node_type){ - /* If I am not the root of the fanin tree, - then signal my parent */ - my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag; - } - } else { - return BCOL_FN_STARTED; - } - - my_ctl->starting_flag_value[bcol_id]++; - - return BCOL_FN_COMPLETE; -} - - -int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - BASESMUMA_VERBOSE(10, ("Basesmuma Fan-In register.\n")); - - comm_attribs.bcoll_type = BCOL_FANIN; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - bcol_basesmuma_fanin_new, - bcol_basesmuma_fanin_new_progress); - - return OMPI_SUCCESS; -} - - diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanout.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanout.c deleted file mode 100644 index f3d3d23c40..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_fanout.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* Recursive doubling blocking barrier */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/patterns/net/netpatterns.h" - -#include "opal/sys/atomic.h" - -#include "ompi/mca/bcol/base/base.h" -#include "bcol_basesmuma.h" - -/***********************************************************************************/ -/*********************************** New Fan-Out ***********************************/ -/***********************************************************************************/ - -static int bcol_basesmuma_fanout_new( - bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int64_t sequence_number; - - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - int idx, probe, - my_rank = bcol_module->super.sbgp_partner_module->my_index, - leading_dim = bcol_module->colls_no_user_data.size_of_group; - int8_t ready_flag; - int8_t bcol_id = (int8_t) bcol_module->super.bcol_id; - int buff_index = input_args->buffer_index; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - - - volatile mca_bcol_basesmuma_payload_t *ctl_structs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl; - volatile mca_bcol_basesmuma_header_t *parent_ctl; - - - netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node); - - /* Figure out - what instance of the basesmuma bcol I am */ - sequence_number = input_args->sequence_num; - - idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0); - ctl_structs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - my_ctl = ctl_structs[my_rank].ctl_struct; - - /* init the header */ - BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id); - - /* Wait on my parent to arrive */ - if (my_tree_node->n_parents) { - parent_ctl = ctl_structs[my_tree_node->parent_rank].ctl_struct; - for( probe = 0; probe < cm->num_to_probe; probe++){ - if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number, BARRIER_FANOUT_FLAG, bcol_id)) { - /* signal my children */ - my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag; - /* bump the starting flag */ - my_ctl->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; - - } - } - - } else { - /* I am the root of the fanout */ - my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag; - /* bump the starting flag */ - my_ctl->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; - } - - - - - - return BCOL_FN_STARTED; -} - -int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - BASESMUMA_VERBOSE(10, ("Basesmuma Fan-Out register.\n")); - - comm_attribs.bcoll_type = BCOL_FANOUT; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - bcol_basesmuma_fanout_new, - bcol_basesmuma_fanout_new); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c 
b/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c deleted file mode 100644 index ef3d856b88..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c +++ /dev/null @@ -1,1106 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -/* debug - * #include "opal/sys/timer.h" - * - * extern uint64_t timers[7]; - * end debug */ - -/* debug */ -#include -/* end debug */ - -/* non-blocking gather routines: init and progress functions */ -int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_GATHER; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1048576; - comm_attribs.data_src = DATA_SRC_KNOWN; - comm_attribs.waiting_semantics = BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; - inv_attribs.datatype_bitmap = 0x11111111; - inv_attribs.op_types_bitmap = 0x11111111; - - /* Set attributes for fanin fanout algorithm */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_k_nomial_gather_init, - bcol_basesmuma_k_nomial_gather_progress); - - return OMPI_SUCCESS; -} - -int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int leading_dim, buff_idx, idx; - int src, i, j, k_temp1, k_temp2; - int pseudo_root, proxy_root, pseudo_base_adj; - volatile int8_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int base_adj, base; - int total_peers, my_pow_k=0; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - int bcol_id = (int) bcol_module->super.bcol_id; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - netpatterns_k_exchange_node_t *exchange_node = - &bcol_module->knomial_allgather_tree; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - - int buff_offset = bcol_module->super.hier_scather_offset; - - /* "indirectors" */ - int *inv_map = exchange_node->inv_reindex_map; - int *reindex_map = exchange_node->reindex_map; - int stray = exchange_node->k_nomial_stray; - - /* tree radix */ - int tree_order = exchange_node->tree_order; - /* tree depth */ - int pow_k = exchange_node->log_tree_order; - /* largest power of k less than or equal to np */ - int cnt = exchange_node->n_largest_pow_tree_order; - - /* payload structures */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - size_t 
pack_len = 0, dt_size; - -#if 0 - fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - /* this is the size of my data, this is not gatherv so it's the same - * for all ranks in the communicator. - */ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - /* now set the "real" offset */ - buff_offset = buff_offset*pack_len; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* init the header */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - /* init active requests, iteration, and status */ - *iteration = 0; - *active_requests = 0; - *status = -1; - /* calculate the number of steps necessary for this collective */ - - /* first thing we do is figure out where the root is in our new indexing */ - /* find root in new indexing */ - pseudo_root = inv_map[root]; - /* see if this is larger than the stray */ - if (pseudo_root >= stray) { - /* then we need to define the proxy root, everyone can do this */ - proxy_root = pseudo_root - cnt; - } else { - proxy_root = pseudo_root; - } - 
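In the k-nomial tree walked below, a rank polls up to (tree_order - 1) peers at each of pow_k levels, so request bit (tree_order - 1)*i + j uniquely names peer j of level i; those are exactly the bits the gather flips in its active_requests word. A small sketch of the numbering with illustrative parameters:

    #include <stdio.h>

    int main(void)
    {
        int tree_order = 3, pow_k = 2;   /* radix-3 tree, 9 = 3^2 ranks */
        for (int i = 0; i < pow_k; i++) {              /* level         */
            for (int j = 0; j < tree_order - 1; j++) { /* peer in level */
                printf("level %d peer %d -> bit %d\n",
                       i, j, (tree_order - 1) * i + j);
            }
        }
        return 0;  /* bits 0..3: two peers per level, two levels */
    }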
/* do some figuring */ - if (EXCHANGE_NODE == exchange_node->node_type) { - total_peers = 0; - my_pow_k = pow_k; - k_temp1 = tree_order; - k_temp2 = 1; - for( i = 0; i < pow_k; i++) { - /* then find the base */ - FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); - /* now find the adjusted base */ - base_adj = base + (base + proxy_root)%k_temp1; - /* ok, now find out WHO is occupying this slot */ - pseudo_base_adj = reindex_map[base_adj]; - - if(my_rank == pseudo_base_adj ) { - /* then go ahead and poll for children's data */ - for( j = 0; j < (tree_order - 1); j++ ) { - /* send phase - */ - /* get communication partner */ - - src = exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with at least one peer. Make a check - */ - if( src < 0 ){ - continue; - }else{ - - /* flip a bit to represent this request */ - *active_requests ^= (1<<((tree_order - 1)*i + j)); - total_peers++; - } - } - } else { - /* I am not the base at this level - my part of the tree ends here */ - my_pow_k = i; - break; - } - - k_temp1 = k_temp1*tree_order; - k_temp2 = k_temp2*tree_order; - } - } - - /* see whether I can already signal readiness */ - if (EXTRA_NODE == exchange_node->node_type || 0 == exchange_node->n_extra_sources) { - if (0 == my_pow_k || EXTRA_NODE == exchange_node->node_type) { - opal_atomic_rmb (); - - my_ctl_pointer->flags[GATHER_FLAG][bcol_id] = ready_flag; - } - - if ((EXTRA_NODE == exchange_node->node_type && root != my_rank) || 0 == my_pow_k) { - /* nothing more to do */ - my_ctl_pointer->starting_flag_value[bcol_id]++; - - return BCOL_FN_COMPLETE; - } - } - - return BCOL_FN_STARTED; -} - - -int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int group_size; - int flag_offset; - int leading_dim, buff_idx, idx; - int src, knt, i, j, k_temp1, k_temp2; - volatile int8_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int probe; - int matched; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - int bcol_id = (int) bcol_module->super.bcol_id; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - netpatterns_k_exchange_node_t *exchange_node = - &bcol_module->knomial_allgather_tree; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - int buff_offset = bcol_module->super.hier_scather_offset; - /* "indirectors" */ - int *list_connected = bcol_module->super.list_n_connected; - /* tree radix */ - int tree_order = exchange_node->tree_order; - /* payload structures */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char *child_data_pointer; - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ - - size_t pack_len = 0, dt_size; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); - - -#if 0 - fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - /* this is the size of my data, this is not gatherv so it's the same - * for all ranks in the communicator. 
- */ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - /* now set the "real" offset */ - buff_offset = buff_offset*pack_len; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - /* restart the ready_flag state */ - flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; - ready_flag = flag_offset + 1; - - /* calculate the number of steps necessary for this collective */ - - /* first thing we do is figure out where the root is in our new indexing */ - /* find root in new indexing */ - if( EXTRA_NODE == exchange_node->node_type ) { - - /* poll for data from proxy */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - /* remember to bump your flag */ - ready_flag++; - - /* in this case, you must block */ - for (i = 0 ; i < cm->num_to_probe ; ++i) { - if (IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ - /* receive the data from the proxy, aka pseudo-root */ - memcpy((void *) ((unsigned char *) data_addr + buff_offset), - (void *) ((unsigned char *) child_data_pointer+buff_offset), - pack_len * group_size); - - goto FINISHED; - } - } - - return BCOL_FN_STARTED; - } - - - if (0 < exchange_node->n_extra_sources && (-1 == (*status))) { - /* am a proxy, poll for pack_len data from extra */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - knt = 0; - for( i = 0; i < src; i++){ - knt += list_connected[i]; - } - /* must block here also */ - matched = 0; - for (i = 0, matched = 0 ; i < cm->num_to_probe && (0 == matched) ; ++i) { - if(IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ - matched = 1; - memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - pack_len*knt), pack_len*list_connected[src]); - *status = 0; - if( 0 == *active_requests ){ - goto LAST_STEP; - } - - break; - } - } - if( 0 == matched ){ - return BCOL_FN_STARTED; - } - } - - /* start the k-nomial gather phase */ - /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ - for (probe = 0 ; probe < cm->num_to_probe ; ++probe) { - k_temp1 = tree_order; - k_temp2 = 1; - for (i = 0 ; i < *(iteration) ; ++i) { - - /* then go ahead and poll for children's data */ - for (j = 0 ; j < (tree_order - 1) ; ++j) { - /* send phase - */ - /* get communication partner */ - - src = exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. 
Make a check - */ - /* if the bit that corresponds to this child has been set to zero, - * then it has already checked in and data received - */ - if (src < 0 || 1 != ((*active_requests >> ((tree_order - 1)*i + j))&1)){ - continue; - } - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - - if(IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ - /* copy the data */ - memcpy((void *) ((unsigned char *) data_addr + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - exchange_node->payload_info[i][j].r_len*pack_len); - /* flip the bit to zero */ - *active_requests ^= (1<<((tree_order - 1)*i + j)); - if(0 == (*active_requests)) { - goto LAST_STEP; - } - } - } - } - - k_temp1 = k_temp1*tree_order; - k_temp2 = k_temp2*tree_order; - } - - - return BCOL_FN_STARTED; - -LAST_STEP: - /* last step, proxies send full data back to the extra ranks */ - if( 0 < exchange_node->n_extra_sources && - root == exchange_node->rank_extra_sources_array[0]) { - /* regardless, I will bump the ready flag and set it in case someone is watching */ - /* announce that data is ready */ - ready_flag++; - } - - /* signal that data is ready */ - opal_atomic_wmb (); - my_ctl_pointer->flags[GATHER_FLAG][bcol_id] = ready_flag; - -FINISHED: - - - my_ctl_pointer->starting_flag_value[bcol_id]++; - - return BCOL_FN_COMPLETE; -} - - -/* Blocking routines, used to prototype and test signaling, - * as well as debug hierarchical algorithm - */ -#if 0 -int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_GATHER; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 16; - comm_attribs.data_src = DATA_SRC_KNOWN; - comm_attribs.waiting_semantics = BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; - inv_attribs.datatype_bitmap = 0x11111111; - inv_attribs.op_types_bitmap = 0x11111111; - - - /* Set attributes for fanin fanout algorithm */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_k_nomial_gather, - bcol_basesmuma_k_nomial_gather); - - return OMPI_SUCCESS; -} -#endif - - -/* original, fully blocking, fully synchronous gather - should result in worst performance when used */ -#if 0 -int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int group_size; - int first_instance=0, flag_offset; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int *group_list; - int src, comm_src, knt, i, k, j, k_temp1, k_temp2; - int pseudo_root, proxy_root, pseudo_base_adj; - volatile int64_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int base_adj, base; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - netpatterns_k_exchange_node_t *exchange_node = - &bcol_module->knomial_allgather_tree; - - int buff_offset = bcol_module->super.hier_scather_offset; - - /* "indirectors" */ - int *list_connected = 
bcol_module->super.list_n_connected; - int *inv_map = exchange_node->inv_reindex_map; - int *reindex_map = exchange_node->reindex_map; - /*int *reindex_map = exchange_node->reindex_map;*/ - /* stray rank == first rank in the extra set */ - int stray = exchange_node->k_nomial_stray; - - /* tree radix */ - int tree_order = exchange_node->tree_order; - /* tree depth */ - int pow_k = exchange_node->log_tree_order; - /* largest power of k less than or equal to np */ - int cnt = exchange_node->n_largest_pow_tree_order; - - /*fprintf(stderr,"tree order %d pow_k %d stray %d root %d\n",tree_order, pow_k, stray, root);*/ - /* payload structures */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char *child_data_pointer; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ - - size_t pack_len = 0, dt_size; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); - - /* active in the algorithm */ - bool active = true; - -#if 0 - fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - /* this is the size of my data, this is not gatherv so it's the same - * for all ranks in the communicator. - */ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - /* now set the "real" offset */ - buff_offset = buff_offset*pack_len; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - /* I have a feeling that I'll need this */ - group_list = bcol_module->super.sbgp_partner_module->group_list; - - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx; - */ - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - /*my_ctl_pointer = ctl_structs[my_rank]; */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* setup resource recycling */ - if( my_ctl_pointer->sequence_number < sequence_number ) { - first_instance=1; - } - - if( first_instance ) { - /* Signal arrival */ - my_ctl_pointer->flag = -1; - my_ctl_pointer->gflag = -1; - my_ctl_pointer->index=1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl_pointer->starting_flag_value=0; - flag_offset=0; - - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - my_ctl_pointer->sequence_number = sequence_number; - -/* debug - fprintf(stderr," sequence_number %lld flag_offset %d starting flag val %d\n",sequence_number,flag_offset, my_ctl_pointer->starting_flag_value); - fflush(stderr); - end debug */ - - - /* - * Fan out from root - */ - /* don't need this either */ - /* root is the local leader */ - /* calculate the number of steps necessary for this collective */ - - /* first 
thing we do is figure out where the root is in our new indexing */ - /* find root in new indexing */ - pseudo_root = inv_map[root]; - /* see if this is larger than the stray */ - if( pseudo_root >= stray ) { - /* then we need to define the proxy root, everyone can do this */ - proxy_root = pseudo_root - cnt; - }else { - proxy_root = pseudo_root; - } - - - - if( EXTRA_NODE == exchange_node->node_type ) { - - /* signal arrival */ - my_ctl_pointer->gflag = ready_flag; - - /* send is done */ - - /* poll for data only if I am the root */ - /* bump the ready flag */ - ready_flag++; - if( root == my_rank ){ - /* poll for data from proxy */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - opal_progress(); - } - /* receive the data from the proxy, aka pseudo-root */ - - memcpy((void *) ((unsigned char *) data_addr + buff_offset),(void *) ((unsigned char *) child_data_pointer+buff_offset) - ,pack_len*group_size); - } - goto FINISHED; - - - } else if( 0 < exchange_node->n_extra_sources ) { - - /* am a proxy, poll for pack_len data from extra */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - knt = 0; - for( i = 0; i < src; i++){ - knt += list_connected[i]; - } - while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - opal_progress(); - } - memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - pack_len*knt), pack_len*list_connected[src]); - /*fprintf(stderr,"999 proxy received data from %d at offset %d of length %d\n",src, - buff_offset+pack_len*knt,pack_len*list_connected[src]); - */ - } - - /* start the k-nomial gather phase */ - /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ - knt = 0; - while(active){ - k_temp1 = tree_order; - k_temp2 = 1; - for( i = 0; i < pow_k; i++) { - /* then find the base */ - /*FIND_BASE(base,my_rank,i+1,tree_order);*/ - FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); - /* now find the adjusted base */ - base_adj = base + (base + proxy_root)%k_temp1; - /* ok, now find out WHO is occupying this slot */ - /*pseudo_base_adj = inv_map[base_adj];*/ - pseudo_base_adj = reindex_map[base_adj]; - - if(my_rank == pseudo_base_adj ) { - /* then go ahead and poll for children's data */ - for( j = 0; j < (tree_order - 1); j++ ) { - /* send phase - */ - /* get communication partner */ - - src = exchange_node->rank_exchanges[i][j]; - /*fprintf(stderr,"comm_src %d\n",comm_src);*/ - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. 
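[Editorial sketch] When the group is not a full power of the radix, a root whose re-indexed position lands in the "extra" set (at or beyond the stray boundary) sits outside the k-nomial core of cnt ranks, so its role is folded onto a proxy inside the core, exactly as the branch above does. Hypothetical helper:

static int proxy_root_of(int pseudo_root, int stray, int cnt)
{
    /* extra ranks (>= stray) fold back onto a proxy inside the core */
    return (pseudo_root >= stray) ? pseudo_root - cnt : pseudo_root;
}

Under this convention, with tree_order 2 and 6 ranks (cnt = 4, stray = 4), a root re-indexed to 5 would be proxied by core rank 1.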
Make a check - */ - if( src < 0 ){ - continue; - } - - /*fprintf(stderr,"src %d\n",src);*/ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - opal_progress(); - } - memcpy((void *) ((unsigned char *) data_addr + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - exchange_node->payload_info[i][j].r_len*pack_len); - /* - fprintf(stderr,"999 receiving data from %d at offset %d of length %d\n", - exchange_node->rank_exchanges[i][j], buff_offset + exchange_node->payload_info[i][j].r_offset, - exchange_node->payload_info[i][j].r_len*pack_len); - */ - opal_atomic_wmb (); - knt++; - if(knt == exchange_node->n_actual_exchanges) { - /* this is the trick to break the root out, - * only the root should be able to satisfy this - */ - /* - fprintf(stderr,"hello n_actual is %d \n",knt); - fprintf(stderr,"hello n_actual_exch is %d \n", - exchange_node->n_actual_exchanges); - */ - goto LAST_STEP; - } - } - } else { - /* announce my arrival */ - my_ctl_pointer->gflag = ready_flag; - active = false; - break; - } - - k_temp1 = k_temp1*tree_order; - k_temp2 = k_temp2*tree_order; - } - } -LAST_STEP: - /* last step, proxies send full data back to the extra ranks */ - if( 0 < exchange_node->n_extra_sources && - root == exchange_node->rank_extra_sources_array[0]) { - /* regardless, I will bump the ready flag and set it in case someone is watching */ - /* announce that data is ready */ - ready_flag++; - my_ctl_pointer->gflag = ready_flag; - } - - -FINISHED: - -/* debug - fprintf(stderr," my_ctl_pointer->index %d n of this type %d %u \n", - my_ctl_pointer->index,c_input_args->n_of_this_type_in_collective,getpid()); - fflush(stderr); - end debug */ - - my_ctl_pointer->starting_flag_value+=1; - - return BCOL_FN_COMPLETE; -} - -#endif - - -#if 0 -/* blocking, asynchronous polling gather routine */ -int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int group_size; - int first_instance=0, flag_offset; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int *group_list; - int src, comm_src, knt, i, k, j, k_temp1, k_temp2; - int pseudo_root, proxy_root, pseudo_base_adj; - volatile int64_t ready_flag; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int root=input_args->root; - int base_adj, base; - int total_peers, my_pow_k; - int probe; - int matched; - int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - netpatterns_k_exchange_node_t *exchange_node = - &bcol_module->knomial_allgather_tree; - - int buff_offset = bcol_module->super.hier_scather_offset; - - /* "indirectors" */ - int *list_connected = bcol_module->super.list_n_connected; - int *inv_map = exchange_node->inv_reindex_map; - int *reindex_map = exchange_node->reindex_map; - /*int *reindex_map = exchange_node->reindex_map;*/ - /* stray rank == first rank in the extra set */ - int stray = exchange_node->k_nomial_stray; - - /* tree radix */ - int tree_order = exchange_node->tree_order; - /* tree depth */ - int pow_k = 
exchange_node->log_tree_order; - /* largest power of k less than or equal to np */ - int cnt = exchange_node->n_largest_pow_tree_order; - - /*fprintf(stderr,"tree order %d pow_k %d stray %d root %d\n",tree_order, pow_k, stray, root);*/ - /* payload structures */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char *child_data_pointer; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ - - size_t pack_len = 0, dt_size; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); - - /* active in the algorithm */ - bool active = true; - -#if 0 - fprintf(stderr,"Entering sm gather root %d \n",root); - fflush(stderr); -#endif - - - /* we will work only on packed data - so compute the length*/ - /* this is the size of my data, this is not gatherv so it's the same - * for all ranks in the communicator. - */ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - /* now set the "real" offset */ - buff_offset = buff_offset*pack_len; - - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - /* I have a feeling that I'll need this */ - group_list = bcol_module->super.sbgp_partner_module->group_list; - - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx; - */ - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - /*my_ctl_pointer = ctl_structs[my_rank]; */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* setup resource recycling */ - if( my_ctl_pointer->sequence_number < sequence_number ) { - first_instance=1; - } - - if( first_instance ) { - /* Signal arrival */ - my_ctl_pointer->flag = -1; - my_ctl_pointer->gflag = -1; - my_ctl_pointer->index=1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl_pointer->starting_flag_value=0; - flag_offset=0; - - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - my_ctl_pointer->sequence_number = sequence_number; - -/* debug - fprintf(stderr," sequence_number %lld flag_offset %d starting flag val %d\n",sequence_number,flag_offset, my_ctl_pointer->starting_flag_value); - fflush(stderr); - end debug */ - - - /* - * Fan out from root - */ - /* don't need this either */ - /* root is the local leader */ - /* calculate the number of steps necessary for this collective */ - - /* first thing we do is figure out where the root is in our new indexing */ - /* find root in new indexing */ - pseudo_root = inv_map[root]; - /* see if this is larger than the stray */ - if( pseudo_root >= stray ) { - /* then we need to define the proxy root, everyone can do this */ - proxy_root = pseudo_root - cnt; - }else { - proxy_root = pseudo_root; - } - if( EXTRA_NODE == exchange_node->node_type ) { - - /* signal arrival 
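[Editorial sketch] Both blocking variants repeat the same flag-recycling setup: the first collective seen on a buffer resets the control fields, and the value peers must observe is derived from the per-buffer starting value plus the sequence number, so a stale flag left by an earlier collective can never satisfy a wait. The struct below is illustrative, mirroring only the fields referenced above:

#include <stdint.h>

struct ctl_sketch {
    volatile int64_t sequence_number;
    volatile int64_t gflag;           /* gather signalling flag        */
    int starting_flag_value;          /* bumped after each collective  */
};

static int64_t compute_ready_flag(struct ctl_sketch *ctl, int64_t seq)
{
    if (ctl->sequence_number < seq) { /* first instance on this buffer */
        ctl->gflag = -1;
        ctl->starting_flag_value = 0;
    }
    ctl->sequence_number = seq;
    return ctl->starting_flag_value + seq + 1; /* value peers wait for */
}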
*/ - my_ctl_pointer->gflag = ready_flag; - - /* send is done */ - - /* poll for data only if I am the root */ - /* bump the ready flag */ - ready_flag++; - if( root == my_rank ){ - /* poll for data from proxy */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - /* in this case, you must block */ - while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - opal_progress(); - } - /* receive the data from the proxy, aka pseudo-root */ - - memcpy((void *) ((unsigned char *) data_addr + buff_offset), - (void *) ((unsigned char *) child_data_pointer+buff_offset) - ,pack_len*group_size); - } - goto FINISHED; - - - } else if( 0 < exchange_node->n_extra_sources ) { - - /* am a proxy, poll for pack_len data from extra */ - src = exchange_node->rank_extra_sources_array[0]; - /* get src data buffer */ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - knt = 0; - for( i = 0; i < src; i++){ - knt += list_connected[i]; - } - /* must block here also */ - while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - opal_progress(); - } - memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - pack_len*knt), pack_len*list_connected[src]); - /*fprintf(stderr,"999 proxy received data from %d at offset %d of length %d\n",src, - buff_offset+pack_len*knt,pack_len*list_connected[src]); - */ - } - /* do some figuring */ - - total_peers = 0; - my_pow_k = pow_k; - k_temp1 = tree_order; - k_temp2 = 1; - for( i = 0; i < pow_k; i++) { - /* then find the base */ - /*FIND_BASE(base,my_rank,i+1,tree_order);*/ - FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); - /* now find the adjusted base */ - base_adj = base + (base + proxy_root)%k_temp1; - /* ok, now find out WHO is occupying this slot */ - /*pseudo_base_adj = inv_map[base_adj];*/ - pseudo_base_adj = reindex_map[base_adj]; - - if(my_rank == pseudo_base_adj ) { - /* then go ahead and poll for children's data */ - for( j = 0; j < (tree_order - 1); j++ ) { - /* send phase - */ - /* get communication partner */ - - src = exchange_node->rank_exchanges[i][j]; - /*fprintf(stderr,"comm_src %d\n",comm_src);*/ - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. Make a check - */ - if( src < 0 ){ - continue; - }else{ - total_peers++; - } - - - } - } else { - /* announce my arrival */ - my_pow_k = i; - break; - } - - k_temp1 = k_temp1*tree_order; - k_temp2 = k_temp2*tree_order; - } - - if( 0 == my_pow_k ){ - /* signal arrival */ - my_ctl_pointer->gflag = ready_flag; - - goto FINISHED; - } - - - - /* start the k-nomial gather phase */ - /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ - knt = 0; - while(active){ - k_temp1 = tree_order; - k_temp2 = 1; - for( i = 0; i < my_pow_k; i++) { - - /* then go ahead and poll for children's data */ - for( j = 0; j < (tree_order - 1); j++ ) { - matched = 0; - /* send phase - */ - /* get communication partner */ - - src = exchange_node->rank_exchanges[i][j]; - /*fprintf(stderr,"comm_src %d\n",comm_src);*/ - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. 
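[Editorial sketch] The "figuring" pass above walks the k-nomial levels while this rank remains the base of its slot; the level at which that stops (my_pow_k) bounds the receive loop, and total_peers counts the valid partners seen on the way, negative entries being holes left by extra ranks. A reduced sketch of the count, with the rank_exchanges lookup abstracted behind a hypothetical callback:

static int count_expected_peers(int my_pow_k, int tree_order,
                                int (*partner_at)(int level, int slot))
{
    int total = 0;

    for (int i = 0; i < my_pow_k; i++) {
        for (int j = 0; j < tree_order - 1; j++) {
            if (partner_at(i, j) >= 0) { /* skip holes from extra ranks */
                total++;
            }
        }
    }
    return total;
}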
Make a check - */ - if( src < 0 ){ - continue; - } - - /*fprintf(stderr,"src %d\n",src);*/ - child_data_pointer = data_buffs[src].payload; - child_ctl_pointer = data_buffs[src].ctl_struct; - - /* if child has been marked, then skip */ - if( sequence_number == child_ctl_pointer->mark ){ - continue; - } - - - for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ - if(IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ - /* mark the child's pointer */ - child_ctl_pointer->mark = sequence_number; - /* copy the data */ - - memcpy((void *) ((unsigned char *) data_addr + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - (void *) ((unsigned char *) child_data_pointer + buff_offset + - exchange_node->payload_info[i][j].r_offset*pack_len), - exchange_node->payload_info[i][j].r_len*pack_len); - /* - fprintf(stderr,"999 receiving data from %d at offset %d of length %d\n", - exchange_node->rank_exchanges[i][j], buff_offset + exchange_node->payload_info[i][j].r_offset, - exchange_node->payload_info[i][j].r_len*pack_len); - */ - knt++; - if(knt == total_peers) { - /* this is the trick to break the root out, - * only the root should be able to satisfy this - */ - /* - fprintf(stderr,"hello n_actual is %d \n",knt); - fprintf(stderr,"hello n_actual_exch is %d \n", - exchange_node->n_actual_exchanges); - */ - opal_atomic_wmb (); - my_ctl_pointer->gflag = ready_flag; - - goto LAST_STEP; - } - matched = 1; - }else{ - opal_progress(); - } - } - } - } - - k_temp1 = k_temp1*tree_order; - k_temp2 = k_temp2*tree_order; - } -LAST_STEP: - /* last step, proxies send full data back to the extra ranks */ - if( 0 < exchange_node->n_extra_sources && - root == exchange_node->rank_extra_sources_array[0]) { - /* regardless, I will bump the ready flag and set it in case someone is watching */ - /* announce that data is ready */ - ready_flag++; - my_ctl_pointer->gflag = ready_flag; - } - - -FINISHED: - -/* debug - fprintf(stderr," my_ctl_pointer->index %d n of this type %d %u \n", - my_ctl_pointer->index,c_input_args->n_of_this_type_in_collective,getpid()); - fflush(stderr); - end debug */ - - my_ctl_pointer->starting_flag_value+=1; - - return BCOL_FN_COMPLETE; -} -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c deleted file mode 100644 index c985a6889a..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c +++ /dev/null @@ -1,1878 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#ifdef __PORTALS_AVAIL__ -#define __PORTALS_ENABLE__ - -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "bcol_basesmuma.h" -#include "bcol_basesmuma_portals.h" -#include "bcol_basesmuma_lmsg_bcast.h" -#include "bcol_basesmuma_utils.h" - - - -/* - * Scatter/Gather Broadcast algorithm - * - * Algorithm highlights: - * - * Uses portals for data transfer - * - * All processes participating in the broadcast are arranged in a - * binmoial tree. - * - * Phase1: Scatter the broadcast data to all the children - * Phase2: All processes in the tree participates in recursive doubling - * algorithm to obtain the missing data. 
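[Editorial sketch] The probing loop above suppresses duplicates by stamping a child's control structure with the current sequence number once its contribution has been copied, so later passes over the same source list skip it without touching payload again. Names are illustrative:

#include <stdbool.h>
#include <stdint.h>

struct child_ctl_sketch {
    volatile int64_t mark; /* sequence number of last consumed gather */
};

static bool already_consumed(const struct child_ctl_sketch *child,
                             int64_t seq)
{
    return seq == child->mark;
}

static void consume_child(struct child_ctl_sketch *child, int64_t seq)
{
    /* copy the payload first, then stamp so re-probes skip this child */
    child->mark = seq;
}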
- */ - - -static int completed_scatter = 0; -#if 0 -int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast_old(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - /* local variables */ - int i; - uint64_t length; - int my_rank, parent_rank, src =-1, matched = 0; - int *src_list = NULL; - int group_size = -1, dummy_group_size; - int first_instance=0; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int count=input_args->count; - size_t pack_len = 0, dt_size =0 ; - int64_t ready_flag; - int flag_offset; - int pow_2, pow_2_levels; - int src_list_index = -1; - uint64_t fragment_size; /* user buffer size */ - int sg_matchbits = 0; - /* Input argument variables */ - void *my_userbuf = (void*)((unsigned char*)input_args->userbuf); - int64_t sequence_number=input_args->sequence_num; - struct ompi_datatype_t* dtype=input_args->dtype; - - /* Extra source variables */ - bool secondary_root = false; - int partner = -1, extra_partner = -1; - - /* Scatter Allgather offsets */ - uint64_t local_sg_offset = 0, global_sg_offset = 0, partner_offset = 0; - - /* Portals messaging relevant variables */ - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - ptl_handle_eq_t allgather_eq_h; - ptl_event_t allgather_event; - bool blocked_post = false; - bool msg_posted = false; - int total_msg_posts = -1, scatter_posts = -1, allgather_posts = -1, extra_src_posts = -1; - - /* OMPI module and component variables */ - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - mca_bcol_basesmuma_module_t *bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - /* Control structure and payload variables */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer = NULL; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer = NULL; - volatile mca_bcol_basesmuma_header_t *partner_ctl_pointer = NULL; - - struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; - struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; - struct mca_bcol_basesmuma_portal_buf_addr_t *partner_lmsg_ctl_pointer = NULL; - - /* Make sure there userbuffer is not null */ - assert(my_userbuf != NULL); - - /* Get portals info*/ - portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; - - /* Get addresing information */ - buff_idx = input_args->src_desc->buffer_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - my_rank = bcol_module->super.sbgp_partner_module->my_index; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /* calculate the largest power of two that is smaller than - * or equal to the group size - */ - pow_2_levels = pow_sm_k(2,group_size, &(dummy_group_size)); - if( group_size < (1<colls_with_user_data.data_buffs+idx; - - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) - data_buffs[my_rank].payload; - - if(my_ctl_pointer->sequence_number < sequence_number) { - first_instance = 1; - } - - if(first_instance) { - my_ctl_pointer->flag = -1; - my_ctl_pointer->index = 1; - - my_ctl_pointer->starting_flag_value = 0; - flag_offset = 0; - - } else { - my_ctl_pointer->index++; - } - - assert( -1 == my_ctl_pointer->flag); - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - - my_ctl_pointer->sequence_number = 
sequence_number; - sg_matchbits = sequence_number ; - - /* Construct my portal buffer address and copy to payload buffer */ - mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, - portals_info->portal_id.nid, - portals_info->portal_id.pid, - sg_matchbits, - bcol_module->super.sbgp_partner_module->group_comm->c_contextid); - - my_lmsg_ctl_pointer->userbuf = my_userbuf; - my_lmsg_ctl_pointer->userbuf_length = fragment_size; - - - /* - * If I am the root of bcast, scatter the data to my children - */ - if (input_args->root_flag) { - BASESMUMA_VERBOSE(10,("I am the root of the data")); - my_lmsg_ctl_pointer->offset = 0; - my_lmsg_ctl_pointer->n_sends = pow_2_levels; - my_lmsg_ctl_pointer->length = fragment_size; - - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - /* Compute number of posts required - * We post the data buffer for both scatter and allgather phase at once so to avoid - * posting overhead - */ - if (my_rank >= pow_2) { - /* I am root and my rank is greater than pow_2, I will hand - * over to rank (that is < pow_2) to act as secondary root - */ - total_msg_posts = 1; - } - else { - - extra_src_posts = (my_rank + pow_2 < group_size ) ? 1: 0; - scatter_posts = my_lmsg_ctl_pointer->n_sends; - allgather_posts = pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - } - - mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, - my_userbuf, fragment_size, allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - msg_posted = true ; - /* important that these be set before my children - * see the ready flag raised - */ - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - /* Wait for my scatter partner */ - if (my_rank >= pow_2) { - int scatter_partner = -1; - volatile mca_bcol_basesmuma_header_t *scatter_partner_ctl_pointer = NULL; - - scatter_partner = my_rank - pow_2; - scatter_partner_ctl_pointer = - data_buffs[scatter_partner].ctl_struct; - - while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, ready_flag, - sequence_number)){ - opal_progress(); - } - - goto Release; - } - else { - wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, - ready_flag, sequence_number); - } - - goto Allgather; - } - - -Extra : - if( my_rank >= pow_2 ) { - parent_rank = my_rank & (pow_2-1); - parent_ctl_pointer = data_buffs[parent_rank].ctl_struct; - parent_lmsg_ctl_pointer = - (mca_bcol_basesmuma_portal_buf_addr_t*)data_buffs[parent_rank].payload; - - ready_flag = ready_flag + pow_2_levels; - - while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { - - opal_progress(); - - } - - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, - parent_lmsg_ctl_pointer, 0, - 0, fragment_size); - - my_ctl_pointer->flag = ready_flag; - - goto Release; - } - -Scatter: - - /* I am not root of bcast compute the list of possible - * where I will receive bcast data from. 
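[Editorial sketch] A non-root can be fed at any level of the binomial scatter, so the probe below watches every possible parent, my_rank ^ 2^i for each of the pow_2_levels levels, plus the matching extra rank my_rank + pow_2 when the group is not a power of two. Hypothetical helper building that list:

static int build_src_list(int *src_list, int my_rank, int pow_2_levels,
                          int group_size)
{
    int pow_2 = 1 << pow_2_levels;
    int n = 0;

    for (int i = 0; i < pow_2_levels; i++) {
        src_list[n++] = my_rank ^ (1 << i);      /* potential parents  */
    }
    /* an extra (>= pow_2) rank may hold the original root's data */
    src_list[n++] = (my_rank + pow_2 < group_size) ? my_rank + pow_2 : -1;

    return n;                                    /* pow_2_levels + 1   */
}

A -1 entry marks a slot with no sender; the probe also writes -1 back into a slot after a false positive so it is not polled again.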
- */ - src_list = (int *) malloc(sizeof(int) * (pow_2_levels + 1)); - for( i = 0; i < pow_2_levels; i++) { - src_list[i] = my_rank ^ (1< pow_2 */ - if ((my_rank + pow_2) < group_size) { - src_list[i] = my_rank + pow_2; - } else { - src_list[i] = -1; - } - -Probe: - - /* If I am not the root, then poll on possible "senders'" control structs */ - /* For portals we block for now */ - while (!matched) { - /* Shared memory iprobe */ - SG_LARGE_MSG_PROBE(src_list, pow_2_levels + 1, - src_list_index, matched, src, data_buffs, parent_ctl_pointer, - parent_lmsg_ctl_pointer,ready_flag, sequence_number); - } - - /* If I am a secondary root - * Secondary root acts as root of bcast data when real root of data - * is process with group rank greater than pow_2 */ - if ((matched) && (src == pow_2 + my_rank)) { - volatile mca_bcol_basesmuma_header_t *extra_src_ctl_pointer = NULL; - struct mca_bcol_basesmuma_portal_buf_addr_t *extra_src_lmsg_ctl_pointer = NULL; - - secondary_root = true; - BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); - my_lmsg_ctl_pointer->offset = 0; - my_lmsg_ctl_pointer->n_sends = pow_2_levels; - my_lmsg_ctl_pointer->length = fragment_size; - - extra_src_ctl_pointer = data_buffs[src].ctl_struct; - extra_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*)data_buffs[src].payload; - - /* create an event queue for the incoming buffer */ - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, - extra_src_lmsg_ctl_pointer, 0, - 0, fragment_size); - - - extra_src_posts = 0; - scatter_posts = my_lmsg_ctl_pointer->n_sends; - allgather_posts = pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, - my_userbuf, fragment_size, allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - msg_posted = true ; - /* important that these be set before my children - * see the ready flag raised - */ - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, - ready_flag, sequence_number); - goto Allgather; - } - - /* Verify whether we got the right - * source of the data, by computing the source's intended - * destinations - */ - for( i = 0; i < parent_lmsg_ctl_pointer->n_sends; i++) { - uint64_t local_offset = 0; - uint64_t remote_offset = 0; - - BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,src)); - - if( my_rank == (src^(1<n_sends = i; - - /* Am I source for other process during scatter phase */ - if ( i > 0) { - - /* compute the size of the chunk to copy */ - length = (parent_lmsg_ctl_pointer->length)/ - (1<<(parent_lmsg_ctl_pointer->n_sends - my_lmsg_ctl_pointer->n_sends)); - my_lmsg_ctl_pointer->length = length; - my_lmsg_ctl_pointer->offset = - parent_lmsg_ctl_pointer->offset + length; - - - local_offset = my_lmsg_ctl_pointer->offset; - remote_offset = parent_lmsg_ctl_pointer->offset + length; - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, - parent_lmsg_ctl_pointer,local_offset, - remote_offset, length); - rc = 
PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, - &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - /* Now post the message for other children to read */ - extra_src_posts = (my_rank + pow_2 < group_size ) ? 1: 0; - scatter_posts = my_lmsg_ctl_pointer->n_sends; - allgather_posts = pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - - mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, - my_userbuf, my_lmsg_ctl_pointer->userbuf_length, - allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - msg_posted = true; - /* set the memory barrier to ensure completion - * and signal I am done getting scatter data*/ - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, - ready_flag, sequence_number); - - } else { - /* takes care of first level recurssive double */ - length = parent_lmsg_ctl_pointer->length/ - (1<<(parent_lmsg_ctl_pointer->n_sends - 1)); - my_lmsg_ctl_pointer->length = length; - my_lmsg_ctl_pointer->offset = parent_lmsg_ctl_pointer->offset; - - local_offset = my_lmsg_ctl_pointer->offset; - remote_offset = my_lmsg_ctl_pointer->offset; - - - while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, - parent_lmsg_ctl_pointer,local_offset, - remote_offset, length); - - /* signal that I am done reading data from parent */ - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - } - - /* time for allgather phase */ - input_args->status = ALLGATHER; - - BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n",my_rank,src)); - - while(ready_flag > parent_ctl_pointer->flag); - - goto Allgather; - } - } - - { - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - src_list[src_list_index] = -1; - matched = 0; - goto Probe; - } - -Allgather: - - BASESMUMA_VERBOSE(5,(" %d Completed Scatter %d times \n", my_rank, completed_scatter)); - - /* zip it back up - we have already taken care of first level */ - global_sg_offset = my_lmsg_ctl_pointer->offset; - - /* first level of zip up */ - length = 2 * fragment_size/pow_2; - - - if (!msg_posted) { - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - /* Posting for all phases of recursive doubling */ - extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; - allgather_posts = pow_2_levels - 1; - total_msg_posts = allgather_posts + extra_src_posts ; - - - mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, - my_userbuf, my_lmsg_ctl_pointer->userbuf_length, - allgather_eq_h, total_msg_posts , blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - msg_posted = true; - } - - - ready_flag++; - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - /* - * Recursive doubling allgather implementation - */ - for( i = 1; i < pow_2_levels; i++) { - /* get my partner for this level */ - partner = my_rank^(1<flag >= ready_flag); - - if (partner_lmsg_ctl_pointer->offset < my_lmsg_ctl_pointer->offset) { - global_sg_offset -= length; - local_sg_offset = global_sg_offset; - } else { - local_sg_offset = global_sg_offset + length; - } - - - BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", partner, length)); - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, - partner_lmsg_ctl_pointer,local_sg_offset, - local_sg_offset, length); - - ready_flag++; - opal_atomic_wmb (); - my_ctl_pointer->flag = ready_flag; - - /* Block until partner completed this level of recursive-doubling stage */ - while(!IS_SG_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - /* - * Compute length for next recursive doubling phase - */ - length *= 2; - } - - - /* If I am source for non-power 2 children wait for them */ - /* If I am secondary root then my partner would be real root - * so no need for exchange of data with the extra partner */ - extra_partner = my_rank + pow_2 ; - if ((extra_partner < group_size) && (!secondary_root)) { - volatile mca_bcol_basesmuma_header_t *extra_partner_ctl_pointer = NULL; - - extra_partner_ctl_pointer = data_buffs[extra_partner].ctl_struct; - /* Block until extra partner has copied data */ - while(!IS_SG_DATA_READY(extra_partner_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - } - -Release: - - /* free the event queue */ - rc = PtlEQFree(allgather_eq_h); - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,("PtlEQFree() failed: %d )\n",rc)); - } - - my_ctl_pointer->starting_flag_value++; - input_args->status = FINISHED; - - return BCOL_FN_COMPLETE; - -} -#endif - -/* - * Blocking Portals Scatter Allgather - * - * - * - * - * - */ - -int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - /* local variables */ - int i; - uint64_t length; - int my_rank, parent_rank, src =-1, matched = 0; - int *src_list = NULL; - int group_size = -1, dummy_group_size; - int first_instance=0; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int count=input_args->count; - size_t pack_len = 0, dt_size =0 ; - volatile int8_t ready_flag; - int flag_offset; - int pow_2, pow_2_levels; - int src_list_index = -1; - uint64_t fragment_size; /* user buffer size */ - int sg_matchbits; - - /* Input argument variables */ - void *my_userbuf = (void*)((unsigned char*)input_args->userbuf); - int64_t sequence_number=input_args->sequence_num; - struct ompi_datatype_t* dtype=input_args->dtype; - - /* Extra source variables */ - bool secondary_root = false; - int partner = -1, extra_partner = -1; - - /* Scatter Allgather offsets */ - uint64_t local_sg_offset = 0, global_sg_offset = 0, partner_offset = 0; - - /* Portals messaging relevant variables */ - 
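[Editorial sketch] The recursive-doubling loop above moves data with PtlGet; the sketch here tracks only the window bookkeeping: at step i the partner is my_rank ^ 2^i, the local window slides down by the current length when the partner holds the lower block (otherwise the upper block is fetched in place), and the merged window doubles for the next step. Hypothetical helper:

#include <stdint.h>

static int allgather_step(int my_rank, int level,
                          uint64_t partner_offset,
                          uint64_t *window_offset, uint64_t *length)
{
    int partner = my_rank ^ (1 << level);

    if (partner_offset < *window_offset) {
        *window_offset -= *length; /* window now starts at partner's block */
    }
    *length *= 2;                  /* block size for the next exchange     */

    return partner;
}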
mca_bcol_basesmuma_portal_proc_info_t *portals_info; - ptl_handle_eq_t allgather_eq_h; - ptl_event_t allgather_event; - bool blocked_post = false; - bool msg_posted = false; - int total_msg_posts = -1, scatter_posts = -1, allgather_posts = -1, extra_src_posts = -1; - - /* OMPI module and component variables */ - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - mca_bcol_basesmuma_module_t *bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - /* Control structure and payload variables */ - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer = NULL; /* binomial fanout */ - volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive double */ - - /* Make sure there userbuffer is not null */ - assert(my_userbuf != NULL); - - /* Get portals info*/ - portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; - - /* Get addresing information */ - buff_idx = input_args->src_desc->buffer_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - my_rank = bcol_module->super.sbgp_partner_module->my_index; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /* calculate the largest power of two that is smaller than - * or equal to the group size - */ - pow_2_levels = pow_sm_k(2,group_size, &(dummy_group_size)); - if( group_size < (1<colls_with_user_data.ctl_buffs+idx; - - - my_ctl_pointer = ctl_structs[my_rank]; - if(my_ctl_pointer->sequence_number < sequence_number) { - first_instance = 1; - } - - if(first_instance) { - for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ - my_ctl_pointer->flags[i] = -1; - } - my_ctl_pointer->index = 1; - - my_ctl_pointer->starting_flag_value = 0; - flag_offset = 0; - - } else { - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - /*ready_flag = flag_offset + sequence_number + 1;*/ - ready_flag = flag_offset + 1; - - my_ctl_pointer->sequence_number = sequence_number; - sg_matchbits = sequence_number ; - - /* Construct my portal buffer address and copy to payload buffer */ - mca_bcol_basesmuma_construct_portal_address(&my_ctl_pointer->portals_buf_addr, - portals_info->portal_id.nid, - portals_info->portal_id.pid, - sg_matchbits, - bcol_module->super.sbgp_partner_module->group_comm->c_contextid); - - my_ctl_pointer->portals_buf_addr.userbuf = my_userbuf; - my_ctl_pointer->portals_buf_addr.userbuf_length = fragment_size; - - - if (input_args->root_flag) { - my_ctl_pointer->offset = 0; - my_ctl_pointer->n_sends = pow_2_levels; - my_ctl_pointer->length = fragment_size; - - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - /* Compute number of posts required */ - if (my_rank >= pow_2) { - /* I am root and my rank is greater than pow_2, I will hand - * over to rank (that is < pow_2) to act as secondary root - */ - total_msg_posts = 1; - } - else { - - extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; - scatter_posts = my_ctl_pointer->n_sends; - allgather_posts = pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - } - - mca_bcol_basesmuma_portals_post_msg(cs, - &my_ctl_pointer->portals_buf_addr, - my_userbuf, fragment_size, allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - msg_posted = true ; - - /* important that these be set before my children - * see the ready flag raised - */ - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAGS] = ready_flag; - BASESMUMA_VERBOSE(1,("I am the root(ctl_pointer %x) of the data flag value %d",my_ctl_pointer, my_ctl_pointer->flag)); - /* Wait for my scatter partner */ - if (my_rank >= pow_2) { - int scatter_partner = -1; - volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL; - - scatter_partner = my_rank - pow_2; - scatter_partner_ctl_pointer = - ctl_structs[scatter_partner]; - - while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, ready_flag, - sequence_number)){ -SCATTER_WAIT_FOR_EXTRA: - opal_progress(); - } - - goto Release; - } - else { - - wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, - ready_flag, sequence_number); - } - - goto Allgather; - } - - -Extra : - if( my_rank >= pow_2 ) { - parent_rank = my_rank & (pow_2-1); - parent_ctl_pointer = ctl_structs[parent_rank]; - - ready_flag = ready_flag + pow_2_levels; - - while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { - - opal_progress(); - - } - - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, &my_ctl_pointer->portals_buf_addr, - &parent_ctl_pointer->portals_buf_addr, 0, - 0, fragment_size); - - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - - goto Release; - } - -Scatter: - - /* compute the list of possible sources */ - src_list = (int *) malloc(sizeof(int) * (pow_2_levels + 1)); - for( i = 0; i < pow_2_levels; i++) { - src_list[i] = my_rank ^ (1< pow_2 */ - if ((my_rank + pow_2) < group_size) { - src_list[i] = my_rank + pow_2; - } else { - src_list[i] = -1; - } - -Probe: - - /* If I am not the root, then poll on possible "senders'" control structs */ - /* For portals we block for now */ - while (!matched) { - /* Shared memory iprobe */ - SG_LARGE_MSG_NB_PROBE(src_list, pow_2_levels + 1, - src_list_index, matched, src, ctl_structs, - parent_ctl_pointer, ready_flag, sequence_number); - } - - BASESMUMA_VERBOSE(1,("Scatter : Im non-root match received")); - /* If I am a secondary root */ - if ((matched) && (src == pow_2 + my_rank)) { - volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL; - - secondary_root = true; - BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); - my_ctl_pointer->offset = 0; - my_ctl_pointer->n_sends = pow_2_levels; - my_ctl_pointer->length = fragment_size; - - extra_src_ctl_pointer = ctl_structs[src]; - - /* create an event queue for the incoming buffer */ - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, - &my_ctl_pointer->portals_buf_addr, - &extra_src_ctl_pointer->portals_buf_addr, 0, - 0, fragment_size); - - - extra_src_posts = 0; - scatter_posts = my_ctl_pointer->n_sends; - allgather_posts = 
pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - mca_bcol_basesmuma_portals_post_msg(cs, - &my_ctl_pointer->portals_buf_addr, - my_userbuf, fragment_size, allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET - | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - msg_posted = true ; - - /* important that these be set before my children - * see the ready flag raised - */ - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - - wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, - ready_flag, sequence_number); - goto Allgather; - } - - /* we need to see whether this is really - * who we are looking for - */ - for( i = 0; i < parent_ctl_pointer->n_sends; i++) { - uint64_t local_offset = 0; - uint64_t remote_offset = 0; - - BASESMUMA_VERBOSE(1,("%d found it from %d \n",my_rank,src)); - - if( my_rank == (src^(1<n_sends = i; - - /* Am I source for other process during scatter phase */ - if ( i > 0) { - - /* compute the size of the chunk to copy */ - length = (parent_ctl_pointer->length)/ - (1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = - parent_ctl_pointer->offset + length; - - - local_offset = my_ctl_pointer->offset; - remote_offset = parent_ctl_pointer->offset + length; - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, &my_ctl_pointer->portals_buf_addr, - &parent_ctl_pointer->portals_buf_addr,local_offset, - remote_offset, length); - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, - &allgather_eq_h); - - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); - goto Release; - } - - /* Now post the message for other children to read */ - extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; - scatter_posts = my_ctl_pointer->n_sends; - allgather_posts = pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - - mca_bcol_basesmuma_portals_post_msg(cs, &my_ctl_pointer->portals_buf_addr, - my_userbuf, my_ctl_pointer->portals_buf_addr.userbuf_length, - allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - msg_posted = true; - /* set the memory barrier to ensure completion */ - opal_atomic_wmb (); - /* signal that I am done */ - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - - wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, - ready_flag, sequence_number); - - } else { - /* takes care of first level recurssive double */ - length = parent_ctl_pointer->length/ - (1<<(parent_ctl_pointer->n_sends - 1)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = parent_ctl_pointer->offset; - - local_offset = my_ctl_pointer->offset; - remote_offset = my_ctl_pointer->offset; - - - while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, - &my_ctl_pointer->portals_buf_addr, - &parent_ctl_pointer->portals_buf_addr, local_offset, - remote_offset, length); - - /* signal that I am done reading data from parent */ - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - } - - /* time for allgather phase */ - input_args->status = ALLGATHER; - - BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n",my_rank,src)); - - while(ready_flag > parent_ctl_pointer->flags[BCAST_FLAG]); - - goto Allgather; - } - } - - { - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - src_list[src_list_index] = -1; - matched = 0; - goto Probe; - } - -Allgather: - - BASESMUMA_VERBOSE(5,(" %d Completed Scatter %d times \n", my_rank, completed_scatter)); - - /* zip it back up - we have already taken care of first level */ - global_sg_offset = my_ctl_pointer->offset; - - /* first level of zip up */ - length = 2 * fragment_size/pow_2; - - - if (!msg_posted) { - rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &allgather_eq_h); - - /* Posting for all phases of recursive doubling */ - extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; - allgather_posts = pow_2_levels - 1; - total_msg_posts = allgather_posts + extra_src_posts ; - - - mca_bcol_basesmuma_portals_post_msg(cs, &my_ctl_pointer->portals_buf_addr, - my_userbuf, my_ctl_pointer->portals_buf_addr.userbuf_length, - allgather_eq_h, total_msg_posts , blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - msg_posted = true; - } - - ready_flag++; - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - - for( i = 1; i < pow_2_levels; i++) { - /* get my partner for this level */ - partner = my_rank^(1<flags[BCAST_FLAG] >= ready_flag); - - if (partner_ctl_pointer->offset < my_ctl_pointer->offset) { - global_sg_offset -= length; - local_sg_offset = global_sg_offset; - } else { - local_sg_offset = global_sg_offset + length; - } - - - BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", partner, length)); - mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, - &my_ctl_pointer->portals_buf_addr, - &partner_ctl_pointer->portals_buf_addr,local_sg_offset, - local_sg_offset, length); - - ready_flag++; - opal_atomic_wmb (); - my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; - - /* Block until partner is at this level of recursive-doubling stage */ - while(!IS_SG_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - /* double the length */ - length *= 2; - } - - - /* If I am source for non-power 2 children wait for them */ - /* If I am secondary root then my partner would be real root - * so no need for exchange of data with the extra partner */ - extra_partner = my_rank + pow_2 ; - if ((extra_partner < group_size) && (!secondary_root)) { - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer = NULL; - - extra_partner_ctl_pointer = ctl_structs[extra_partner]; - /* Block until extra partner has copied data */ - while(!IS_SG_DATA_READY(extra_partner_ctl_pointer, ready_flag, sequence_number)) { - opal_progress(); - } - - } - -Release: - - /* free the event queue */ - rc = PtlEQFree(allgather_eq_h); - if (rc != PTL_OK) { - BASESMUMA_VERBOSE(10,("PtlEQFree() failed: %d )\n",rc)); - } - - my_ctl_pointer->starting_flag_value++; - input_args->status = FINISHED; - - return BCOL_FN_COMPLETE; - -} - - -/* - * static sg_state_t *sg_state = NULL; - */ - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - int i; - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - int dummy_group_size; - int rc = OMPI_SUCCESS; - int buff_idx; - int count=input_args->count; - size_t pack_len = 0, dt_size =0 ; - struct ompi_datatype_t* dtype=input_args->dtype; - int completed_posts = 0; - sg_state_t *sg_state = NULL; - mca_bcol_basesmuma_module_t *bcol_module = NULL; - int extra_src_posts = -1,allgather_posts = -1, total_msg_posts = -1; - - bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - /* - sg_state = (sg_state_t*)bcol_module->sg_state; - */ - sg_state = (sg_state_t*)&(bcol_module->sg_state); - /* Re-entering the algorithm */ - switch (sg_state->phase) { - case PROBE: - if (input_args->root_flag) { - /* I became a root for this group */ - sg_state->phase = START; - goto Start; - } - goto Probe; - break; - - case SCATTER_ROOT_WAIT: - goto Scatter_root_wait; - - case SCATTER_EXTRA_ROOT_WAIT: - goto Scatter_extra_root_wait; - - case SCATTER_PARENT_WAIT: - goto Scatter_parent_wait; 
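[Editorial sketch] The switch above is the heart of the nonblocking variant: whenever a wait cannot complete within the probe budget, the phase is recorded in sg_state and BCOL_FN_STARTED is returned, and the next invocation dispatches straight back to the matching label instead of restarting. A reduced sketch of that mapping, enum values illustrative:

typedef enum {
    SG_PHASE_INIT,
    SG_PHASE_PROBE,                  /* still looking for a parent */
    SG_PHASE_SCATTER_ROOT_WAIT,      /* root waiting on children   */
    SG_PHASE_SCATTER_EXTRA_ROOT_WAIT,
    SG_PHASE_SCATTER_PARENT_WAIT
} sg_phase_sketch_t;

static const char *resume_point(sg_phase_sketch_t phase)
{
    switch (phase) {
    case SG_PHASE_PROBE:                   return "Probe";
    case SG_PHASE_SCATTER_ROOT_WAIT:       return "Scatter_root_wait";
    case SG_PHASE_SCATTER_EXTRA_ROOT_WAIT: return "Scatter_extra_root_wait";
    case SG_PHASE_SCATTER_PARENT_WAIT:     return "Scatter_parent_wait";
    default:                               return "Start";
    }
}

Keeping this state in the per-buffer sg_state rather than on the stack is what lets the progress engine drive many outstanding broadcasts concurrently.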
- - default: - break; - } - - sg_state->phase = INIT; - - BASESMUMA_VERBOSE(1,("Im entering portals_nb_bcast Unknown root ")); - /* Allocate space for algorithm state */ - /* - sg_state = (sg_state_t *) malloc(sizeof(sg_state_t)); - bcol_module->sg_state = (void *)sg_state; - - assert(NULL != sg_state); - */ - - sg_state->secondary_root = false; - sg_state->msg_posted = false; - sg_state->matched = 0; - sg_state->phase = SCATTER; - /* Copy input args to local variables */ - sg_state->my_userbuf = (void*)((unsigned char*)input_args->userbuf); - assert(sg_state->my_userbuf != NULL); - sg_state->sequence_number=input_args->sequence_num; - sg_state->cs = &mca_bcol_basesmuma_component; - sg_state->bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - /* Should this be buffer index (ML) or control buffer index ? */ - buff_idx = input_args->src_desc->buffer_index; - - /* Initialize SM group info used for control signaling */ - init_sm_group_info(sg_state, buff_idx); - - /* calculate the largest power of two that is smaller than - * or equal to the group size - */ - sg_state->pow_2_levels = pow_sm_k(2, sg_state->group_size, &(dummy_group_size)); - if( sg_state->group_size < (1 << sg_state->pow_2_levels)) { - sg_state->pow_2_levels--; - } - /* power-of-two group size */ - sg_state->pow_2 = 1 << sg_state->pow_2_levels; - - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - sg_state->fragment_size = count*dt_size; - - - /* Init portals scatter allgather info */ - rc = init_sm_portals_sg_info(sg_state); - - if (rc != OMPI_SUCCESS) { - goto Release; - } - -Start : -Extra : - /* - * My rank > pow2 groupsize - */ - if( sg_state->my_rank >= sg_state->pow_2 ) { - - if (input_args->root_flag){ - - rc = sm_portals_extra_root_scatter(sg_state); - if (rc != OMPI_SUCCESS) { - goto Release; - } - - } else { - /* - * Wait for my partner to receive bcast data, and copy from it - */ - int extra_parent_rank; - volatile mca_bcol_basesmuma_ctl_struct_t *extra_parent_ctl_pointer = NULL; /* binomial fanout */ - extra_parent_rank = sg_state->my_rank & (sg_state->pow_2-1); - extra_parent_ctl_pointer = sg_state->ctl_structs[extra_parent_rank]; - - sg_state->ready_flag = sg_state->ready_flag + sg_state->pow_2_levels; - - while(!IS_SG_DATA_READY(extra_parent_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)) { - opal_progress(); - - } - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &extra_parent_ctl_pointer->portals_buf_addr, 0, - 0, sg_state->fragment_size); - - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - } - - goto Release; - } - - if (input_args->root_flag) { - - BASESMUMA_VERBOSE(1,("Scatter : Im root (bcol_module %x,ctl_pointer %x) my ready flag %d \n", - sg_state->bcol_module, sg_state->my_ctl_pointer, sg_state->ready_flag)); - rc = sm_portals_root_scatter(sg_state); - - /* gvm Fix: Redudant - opal_atomic_wmb (); - */ - - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - if (rc != OMPI_SUCCESS) { - goto Release; - } - -Scatter_root_wait: - - BASESMUMA_VERBOSE(5,("Scatter: Im root waiting for children to complete my flag %d", - sg_state->my_ctl_pointer->flag)); - - for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, - sg_state->my_ctl_pointer->n_sends, sg_state->ctl_structs, - sg_state->ready_flag, 
sg_state->sequence_number); - - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_ROOT_WAIT; - return BCOL_FN_STARTED; - } - - goto Allgather; - } - - -Scatter: - - BASESMUMA_VERBOSE(1,("Scatter : Im non-root probing for data ")); - /* compute the list of possible sources */ - /* - sg_state->src_list = (int *) malloc(sizeof(int) * (sg_state->pow_2_levels + 1)); - */ - assert(MAX_SM_GROUP_SIZE > sg_state->pow_2_levels+1); - - for( i = 0; i < sg_state->pow_2_levels; i++) { - sg_state->src_list[i] = sg_state->my_rank ^ (1< pow_2 */ - - if ((sg_state->my_rank + sg_state->pow_2) < sg_state->group_size) { - sg_state->src_list[i] = sg_state->my_rank + sg_state->pow_2; - } else { - sg_state->src_list[i] = -1; - } - - - BASESMUMA_VERBOSE(1,("Scatter : Ready flag %d Im non-root probing for %d procs %d:%d \n", - sg_state->ready_flag,sg_state->pow_2_levels,sg_state->src_list[0],sg_state->src_list[1])); -Probe: - /* If I am not the root, then poll on possible "senders'" control structs */ - /* For portals we block for now */ - /* Shared memory iprobe */ - - - /* - SG_LARGE_MSG_NB_PROBE(sg_state->src_list, sg_state->pow_2_levels + 1, - sg_state->src_list_index, sg_state->matched, sg_state->src, - sg_state->ctl_structs, - sg_state->parent_ctl_pointer, sg_state->ready_flag, sg_state->sequence_number); - */ - - for( i = 0; i < sg_state->cs->num_to_probe && 0 == sg_state->matched; - i++) { - sg_large_msg_probe(sg_state); - } - - if (!sg_state->matched) { - sg_state->phase = PROBE; - return BCOL_FN_STARTED; - } - - BASESMUMA_VERBOSE(1,("Scatter : Im non-root match received")); - /* If I am a secondary root */ - if ((sg_state->matched) && (sg_state->src == sg_state->pow_2 + sg_state->my_rank)) { - - BASESMUMA_VERBOSE(5,("Scatter : Im secondary root \n")); - - rc = sm_portals_secondary_root_scatter(sg_state); - if (rc != OMPI_SUCCESS) { - goto Release; - } - -Scatter_extra_root_wait: - - for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, sg_state->my_ctl_pointer->n_sends, - sg_state->ctl_structs, sg_state->ready_flag, sg_state->sequence_number); - - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_EXTRA_ROOT_WAIT; - return BCOL_FN_STARTED; - } - - goto Allgather; - } - - /* we need to see whether this is really - * who we are looking for - */ - for( i = 0; i < sg_state->parent_ctl_pointer->n_sends; i++) { - uint64_t local_offset = 0; - uint64_t remote_offset = 0; - - BASESMUMA_VERBOSE(5,("%d found it from %d \n",sg_state->my_rank,sg_state->src)); - - if( sg_state->my_rank == (sg_state->src^(1<parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; - - /* we found our root within the group ... 
*/ - BASESMUMA_VERBOSE(5,("Shared memory probe was matched, the root is %d ",sg_state->src)); - - sg_state->my_ctl_pointer->n_sends = i; - - /* Am I source for other process during scatter phase */ - if ( i > 0) { - BASESMUMA_VERBOSE(1,("Scatter : Im Internal node \n")); - - rc = sm_portals_internode_scatter(sg_state); - - if (rc != OMPI_SUCCESS) { - goto Release; - } - -Scatter_parent_wait: - - for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, - sg_state->my_ctl_pointer->n_sends, - sg_state->ctl_structs, - sg_state->ready_flag, sg_state->sequence_number); - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_PARENT_WAIT; - return BCOL_FN_STARTED; - } - - } else { - - BASESMUMA_VERBOSE(1,("Scatter : Im leaf node \n")); - - /* takes care of first level recurssive double */ - sg_state->length = sg_state->parent_ctl_pointer->length/ - (1<<(sg_state->parent_ctl_pointer->n_sends - 1)); - sg_state->my_ctl_pointer->length = sg_state->length; - sg_state->my_ctl_pointer->offset = sg_state->parent_ctl_pointer->offset; - - - while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, - sg_state->ready_flag, sg_state->sequence_number)) { - opal_progress(); - } - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &sg_state->parent_ctl_pointer->portals_buf_addr, - sg_state->my_ctl_pointer->offset, - sg_state->my_ctl_pointer->offset, sg_state->length); - - /* signal that I am done reading data from parent */ - /* - opal_atomic_wmb (); - */ - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - } - - BASESMUMA_VERBOSE(1,("Completed %d found it from %d \n", - sg_state->my_rank, sg_state->src)); - - while(sg_state->ready_flag > sg_state->parent_ctl_pointer->flag); - - goto Allgather; - } - } - - { - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - sg_state->src_list[sg_state->src_list_index] = -1; - sg_state->matched = 0; - goto Probe; - } - -Allgather: - - BASESMUMA_VERBOSE(5,("Completed Scatter phase")); - - /* zip it back up - we have already taken care of first level */ - sg_state->global_sg_offset = sg_state->my_ctl_pointer->offset; - - /* first level of zip up */ - sg_state->length = 2 * sg_state->fragment_size/sg_state->pow_2; - - - /* Posting for all phases of recursive doubling */ - extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
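/* illustration, not from the sources: with group_size = 6 and pow_2 = 4,
 * only ranks 0 and 1 have an extra partner (ranks 4 and 5), so they post
 * one additional read descriptor; ranks 2 and 3 post none */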
1: 0; - allgather_posts = sg_state->pow_2_levels - 1; - total_msg_posts = allgather_posts + extra_src_posts ; - - if ((!sg_state->msg_posted) && (total_msg_posts > 0)){ - - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, - PTL_EQ_NONE, total_msg_posts, blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - sg_state->msg_posted = true; - } - - BASESMUMA_VERBOSE(5,("Done with allgather phase")); - /* I reached an allgather phase */ - sg_state->ready_flag++; - opal_atomic_wmb (); - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - rc = sm_portals_bcasts_allgather_phase(sg_state); - - if (rc != OMPI_SUCCESS) { - BASESMUMA_VERBOSE(10,("Error in Bcast's allgather phase ")); - goto Release; - } - - /* If I am source for non-power 2 children wait for them */ - /* If I am secondary root then my partner would be real root - * so no need for exchange of data with the extra partner */ - sg_state->extra_partner = sg_state->my_rank + sg_state->pow_2 ; - if ((sg_state->extra_partner < sg_state->group_size) && (!sg_state->secondary_root)) { - - sg_state->extra_partner_ctl_pointer = sg_state->ctl_structs[sg_state->extra_partner]; - /* Block until extra partner has copied data */ - while(!IS_SG_DATA_READY(sg_state->extra_partner_ctl_pointer, - sg_state->ready_flag, sg_state->sequence_number)) { - opal_progress(); - } - - } - -Release: - - BASESMUMA_VERBOSE(1,("Im done ")); - - sg_state->my_ctl_pointer->starting_flag_value++; - sg_state->phase = FINISHED; - - - return BCOL_FN_COMPLETE; - -} - - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - int i; - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - int dummy_group_size; - int rc = OMPI_SUCCESS; - int buff_idx; - int count=input_args->count; - size_t pack_len = 0, dt_size =0 ; - struct ompi_datatype_t* dtype=input_args->dtype; - int completed_posts = 0; - sg_state_t *sg_state = NULL; - mca_bcol_basesmuma_module_t *bcol_module=NULL; - int extra_src_posts = -1,allgather_posts = -1, total_msg_posts = -1; - bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - - sg_state = (sg_state_t*)(&bcol_module->sg_state); - - BASESMUMA_VERBOSE(1,("Im entering nb_knownroot_bcast bcol = %x ", - c_input_args->bcol_module)); - /* Re-entering the algorithm */ - switch (sg_state->phase) { - case PROBE: - if (input_args->root_flag) { - /* I became a root for this group */ - sg_state->phase = START; - goto Start; - } - goto Probe; - break; - - case SCATTER_ROOT_WAIT: - goto Scatter_root_wait; - - case SCATTER_EXTRA_ROOT_WAIT: - goto Scatter_extra_root_wait; - - case SCATTER_PARENT_WAIT: - goto Scatter_parent_wait; - - default: - break; - } - - /* Allocate space for algorithm state */ - /* - sg_state = (sg_state_t *) malloc(sizeof(sg_state_t)); - bcol_module->sg_state = (void*) sg_state; - */ - - /* Make sure there userbuffer is not null */ - - sg_state->phase = INIT; - sg_state->secondary_root = false; - sg_state->msg_posted = false; - sg_state->matched = 0; - /* Copy input args to local variables */ - sg_state->my_userbuf = (void*)((unsigned char*)input_args->userbuf); - assert(sg_state->my_userbuf != NULL); - sg_state->sequence_number=input_args->sequence_num; - sg_state->cs = 
&mca_bcol_basesmuma_component; - sg_state->bcol_module = bcol_module; - buff_idx = input_args->src_desc->buffer_index; - - /* Initialize SM group info used for control signaling */ - init_sm_group_info(sg_state, buff_idx); - - /* calculate the largest power of two that is smaller than - * or equal to the group size - */ - sg_state->pow_2_levels = pow_sm_k(2, sg_state->group_size, &(dummy_group_size)); - if( sg_state->group_size < (1 << sg_state->pow_2_levels)) { - sg_state->pow_2_levels--; - } - /* power-of-two group size */ - sg_state->pow_2 = 1 << sg_state->pow_2_levels; - - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - sg_state->fragment_size = count*dt_size; - - - /* Init portals scatter allgather info */ - rc = init_sm_portals_sg_info(sg_state); - - if (rc != OMPI_SUCCESS) { - goto Release; - } -Start: -Extra : - /* - * My rank > pow2 groupsize - */ - if( sg_state->my_rank >= sg_state->pow_2 ) { - - if (input_args->root_flag){ - - rc = sm_portals_extra_root_scatter(sg_state); - if (rc != OMPI_SUCCESS) { - goto Release; - } - - } else { - /* - * Wait for my partner to receive bcast data, and copy from it - */ - int extra_parent_rank; - volatile mca_bcol_basesmuma_ctl_struct_t *extra_parent_ctl_pointer = NULL; /* binomial fanout */ - extra_parent_rank = sg_state->my_rank & (sg_state->pow_2-1); - extra_parent_ctl_pointer = sg_state->ctl_structs[extra_parent_rank]; - - sg_state->ready_flag = sg_state->ready_flag + sg_state->pow_2_levels; - - while(!IS_SG_DATA_READY(extra_parent_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)) { - opal_progress(); - - } - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &extra_parent_ctl_pointer->portals_buf_addr, 0, - 0, sg_state->fragment_size); - - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - } - - goto Release; - } - - if (input_args->root_flag) { - - BASESMUMA_VERBOSE(1,("Scatter : Im root (bcol_module %x,ctl_pointer %x) my ready flag %d \n", - bcol_module, sg_state->my_ctl_pointer, sg_state->ready_flag)); - rc = sm_portals_root_scatter(sg_state); - - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - if (rc != OMPI_SUCCESS) { - goto Release; - } - -Scatter_root_wait: - - BASESMUMA_VERBOSE(5,("Scatter: Im root waiting for children to complete my flag %d", - sg_state->my_ctl_pointer->flag)); - for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, - sg_state->my_ctl_pointer->n_sends, sg_state->ctl_structs, - sg_state->ready_flag, sg_state->sequence_number); - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_ROOT_WAIT; - return BCOL_FN_STARTED; - } - - goto Allgather; - } - - -Probe: - - sg_state->src = compute_src_from_root(input_args->root_route->rank, sg_state->my_rank, - sg_state->pow_2, sg_state->group_size); - - sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; - - while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)) { - opal_progress(); - - } - sg_state->matched = true; - - /* If I am a secondary root */ - if ((sg_state->matched) && (sg_state->src == sg_state->pow_2 + sg_state->my_rank)) { - - rc = sm_portals_secondary_root_scatter(sg_state); - if (rc != OMPI_SUCCESS) { - goto Release; - } -Scatter_extra_root_wait: - - for( i = 0; i < 
sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, sg_state->my_ctl_pointer->n_sends, - sg_state->ctl_structs, sg_state->ready_flag, sg_state->sequence_number); - - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_EXTRA_ROOT_WAIT; - return BCOL_FN_STARTED; - } - - goto Allgather; - } - - /* we need to see whether this is really - * who we are looking for - */ - for( i = 0; i < sg_state->parent_ctl_pointer->n_sends; i++) { - uint64_t local_offset = 0; - uint64_t remote_offset = 0; - - BASESMUMA_VERBOSE(5,("%d found it from %d \n",sg_state->my_rank,sg_state->src)); - - if( sg_state->my_rank == (sg_state->src^(1<parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; - - /* we found our root within the group ... */ - BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d ",sg_state->src)); - - sg_state->my_ctl_pointer->n_sends = i; - - /* Am I source for other process during scatter phase */ - if ( i > 0) { - - rc = sm_portals_internode_scatter(sg_state); - - if (rc != OMPI_SUCCESS) { - goto Release; - } -Scatter_parent_wait: - - for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; - i++) { - - completed_posts = wait_for_post_complete_nb(sg_state->my_rank, - sg_state->my_ctl_pointer->n_sends, - sg_state->ctl_structs, - sg_state->ready_flag, sg_state->sequence_number); - } - - if (completed_posts < sg_state->my_ctl_pointer->n_sends) { - sg_state->phase = SCATTER_PARENT_WAIT; - return BCOL_FN_STARTED; - } - - } else { - - /* takes care of first level recursive double */ - sg_state->length = sg_state->parent_ctl_pointer->length/ - (1<<(sg_state->parent_ctl_pointer->n_sends - 1)); - sg_state->my_ctl_pointer->length = sg_state->length; - sg_state->my_ctl_pointer->offset = sg_state->parent_ctl_pointer->offset; - - - while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, - sg_state->ready_flag, sg_state->sequence_number)) { - opal_progress(); - } - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &sg_state->parent_ctl_pointer->portals_buf_addr, - sg_state->my_ctl_pointer->offset, - sg_state->my_ctl_pointer->offset, sg_state->length); - - /* signal that I am done reading data from parent */ - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - } - - BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n", - sg_state->my_rank, sg_state->src)); - - while(sg_state->ready_flag > sg_state->parent_ctl_pointer->flag); - - goto Allgather; - } - } - - { - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - sg_state->src_list[sg_state->src_list_index] = -1; - sg_state->matched = 0; - goto Probe; - } - -Allgather: - - /* zip it back up - we have already taken care of first level */ - sg_state->global_sg_offset = sg_state->my_ctl_pointer->offset; - - /* first level of zip up */ - sg_state->length = 2 * sg_state->fragment_size/sg_state->pow_2; - - /* Posting for all phases of recursive doubling */ - extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
1: 0; - allgather_posts = sg_state->pow_2_levels - 1; - total_msg_posts = allgather_posts + extra_src_posts ; - - if ((!sg_state->msg_posted) && (total_msg_posts > 0)){ - - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, - PTL_EQ_NONE, total_msg_posts, blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - ); - sg_state->msg_posted = true; - } - - sg_state->ready_flag++; - opal_atomic_wmb (); - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - rc = sm_portals_bcasts_allgather_phase(sg_state); - - if (rc != OMPI_SUCCESS) { - BASESMUMA_VERBOSE(10,("Error in Bcast's allgather phase ")); - goto Release; - } - - /* If I am source for non-power 2 children wait for them */ - /* If I am secondary root then my partner would be real root - * so no need for exchange of data with the extra partner */ - sg_state->extra_partner = sg_state->my_rank + sg_state->pow_2 ; - if ((sg_state->extra_partner < sg_state->group_size) && (!sg_state->secondary_root)) { - - sg_state->extra_partner_ctl_pointer = sg_state->ctl_structs[sg_state->extra_partner]; - /* Block until extra partner has copied data */ - while(!IS_SG_DATA_READY(sg_state->extra_partner_ctl_pointer, - sg_state->ready_flag, sg_state->sequence_number)) { - opal_progress(); - } - - } - -Release: - - BASESMUMA_VERBOSE(1,("Im done ")); - - sg_state->my_ctl_pointer->starting_flag_value++; - sg_state->phase = FINISHED; - - return BCOL_FN_COMPLETE; - -} -#endif /* __PORTALS_AVAIL__ */ diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h deleted file mode 100644 index d15851b036..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h +++ /dev/null @@ -1,626 +0,0 @@ -#ifdef __PORTALS_AVAIL__ -#define __PORTALS_ENABLE__ - -#include - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "bcol_basesmuma_utils.h" -#include "bcol_basesmuma_portals.h" -#include "bcol_basesmuma.h" - -#if 0 -struct scatter_allgather_nb_bcast_state_t -{ - /* local variables */ - uint64_t length; - int my_rank, src, matched; - int *src_list; - int group_size; - int64_t ready_flag; - int pow_2, pow_2_levels; - int src_list_index; - uint64_t fragment_size; /* user buffer size */ - - /* Input argument variables */ - void *my_userbuf; - int64_t sequence_number; - - /* Extra source variables */ - bool secondary_root; - int partner , extra_partner; - - /* Scatter Allgather offsets */ - uint64_t local_sg_offset , global_sg_offset , partner_offset ; - - /* Portals messaging relevant variables */ - ptl_handle_eq_t allgather_eq_h; - ptl_handle_eq_t read_eq; - ptl_event_t allgather_event; - bool msg_posted; - - /* OMPI module and component variables */ - mca_bcol_basesmuma_component_t *cs; - mca_bcol_basesmuma_module_t *bcol_module; - - /* Control structure and payload variables */ - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */ - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */ - - int phase; -}; - -typedef struct scatter_allgather_nb_bcast_state_t sg_state_t; -#endif - -bool 
blocked_post = false; - -#define IS_SG_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BCAST_FLAG] >= (my_flag) \ - )? true : false ) - - - -#define SG_LARGE_MSG_PROBE(src_list, n_src, src_list_index, matched, \ - src, data_buffs, data_src_ctl_pointer, \ - data_src_lmsg_ctl_pointer, ready_flag, \ - sequence_number) \ -do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - if(src_list[j] != -1) { \ - data_src_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \ - data_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) \ - data_buffs[src_list[j]].payload; \ - if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \ - src = src_list[j]; \ - matched = 1; \ - src_list_index = j; \ - break; \ - } \ - } \ - } \ -} while(0) - -#define SG_LARGE_MSG_NB_PROBE(src_list, n_src, src_list_index, matched, \ - src, ctl_structs, data_src_ctl_pointer, \ - ready_flag, sequence_number) \ -do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - if(src_list[j] != -1) { \ - data_src_ctl_pointer = ctl_structs[src_list[j]]; \ - if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \ - src = src_list[j]; \ - matched = 1; \ - src_list_index = j; \ - break; \ - } \ - } \ - } \ -} while(0) - - - - - -static inline __opal_attribute_always_inline__ -int wait_for_peers(int my_rank, int npeers, volatile mca_bcol_basesmuma_payload_t *data_buffs, - int flag_value, int sn) -{ - int *peers_list = NULL; - int counter = 0, diter = 0; - volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer = NULL; - - peers_list = (int *)malloc(sizeof(int) * npeers); - - for (diter = 0; diter < npeers; diter++ ){ - peers_list[diter] = my_rank ^ (1<pow_2_levels+1; - - - for( j = 0; j < n_src; j++) { - if(sg_state->src_list[j] != -1) { - sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src_list[j]]; - - BASESMUMA_VERBOSE(5,("Parent %d ctl pointer (parent=%x, my ctl=%x) flag %d", - sg_state->src_list[j],sg_state->parent_ctl_pointer, - sg_state->my_ctl_pointer, - sg_state->parent_ctl_pointer->flag)); - - if (IS_SG_DATA_READY(sg_state->parent_ctl_pointer, - sg_state->ready_flag, sg_state->sequence_number)) { - sg_state->src = sg_state->src_list[j]; - sg_state->matched = 1; - sg_state->src_list_index = j; - break; - } - } - } - - return 0; -} -/* - * I will post message for all the my children - */ -static inline __opal_attribute_always_inline__ -int sm_portals_root_scatter(sg_state_t *sg_state) -{ - int extra_src_posts = -1, scatter_posts = -1, allgather_posts = -1, - total_msg_posts = -1; - - BASESMUMA_VERBOSE(10,("I am the root of the data")); - sg_state->my_ctl_pointer->offset = 0; - sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels; - sg_state->my_ctl_pointer->length = sg_state->fragment_size; - - - - extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
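/* illustration, not from the sources: only pow_2_levels - 1 allgather
 * posts are needed below because the first doubling level is already
 * satisfied by the chunk the partner fetched during the scatter */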
1: 0; - scatter_posts = sg_state->my_ctl_pointer->n_sends; - allgather_posts = sg_state->pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - if ( total_msg_posts <= 0) { - BASESMUMA_VERBOSE(10,("No need to post the data ")); - return OMPI_SUCCESS; - } - - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, - &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->fragment_size, - PTL_EQ_NONE, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | - PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - - /* - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, - &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->fragment_size, - sg_state->allgather_eq_h, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | - PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - */ - - sg_state->msg_posted = true ; - - /* - opal_atomic_wmb(); - */ - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - return OMPI_SUCCESS; -} - -/* - * Im root but my rank > pow2_groupsize, so will copy to partner who - * will act as root (secondary) - */ -static inline __opal_attribute_always_inline__ -int sm_portals_extra_root_scatter(sg_state_t *sg_state) -{ - int scatter_partner = -1; - volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL; - - int total_msg_posts = 1; - - if ( total_msg_posts <= 0) { - BASESMUMA_VERBOSE(10,("No need to post the data ")); - } - else { - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, - &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->fragment_size, - PTL_EQ_NONE, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET - | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - sg_state->msg_posted = true ; - - } - - opal_atomic_wmb(); - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - - - scatter_partner = sg_state->my_rank - sg_state->pow_2; - scatter_partner_ctl_pointer = - sg_state->ctl_structs[scatter_partner]; - - while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)){ - opal_progress(); - } - - return OMPI_SUCCESS; -} - -/* - * Gets msg from the partner (> pow2_groupsize) and posts the - * message acting as root - */ -static inline __opal_attribute_always_inline__ -int sm_portals_secondary_root_scatter(sg_state_t *sg_state) -{ - - volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL; - int scatter_posts, allgather_posts, extra_src_posts, total_msg_posts; - - sg_state->secondary_root = true; - BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); - sg_state->my_ctl_pointer->offset = 0; - sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels; - sg_state->my_ctl_pointer->length = sg_state->fragment_size; - - extra_src_ctl_pointer = sg_state->ctl_structs[sg_state->src]; - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &extra_src_ctl_pointer->portals_buf_addr, 0, - 0, sg_state->fragment_size); - - - extra_src_posts = 0; - scatter_posts = sg_state->my_ctl_pointer->n_sends; - allgather_posts = sg_state->pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - if (total_msg_posts > 0) { - 
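/* illustration, not from the sources: with pow_2_levels = 3 the secondary
 * root posts 3 scatter + 2 allgather + 0 extra = 5 read descriptors; a
 * real root with an extra (>= pow_2) partner would post one more */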
mca_bcol_basesmuma_portals_post_msg(sg_state->cs, - &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->fragment_size, - PTL_EQ_NONE, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET - | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - sg_state->msg_posted = true ; - } - opal_atomic_wmb(); - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - return OMPI_SUCCESS; -} - -/* - * Internode Scatter: Get data from my parent and post for my children - */ - -static inline __opal_attribute_always_inline__ -int sm_portals_internode_scatter(sg_state_t *sg_state) -{ - - int scatter_posts, allgather_posts, extra_src_posts, - total_msg_posts; - uint64_t local_offset, remote_offset; - - /* compute the size of the chunk to copy */ - sg_state->length = (sg_state->parent_ctl_pointer->length)/ - (1<<(sg_state->parent_ctl_pointer->n_sends - sg_state->my_ctl_pointer->n_sends)); - sg_state->my_ctl_pointer->length = sg_state->length; - sg_state->my_ctl_pointer->offset = - sg_state->parent_ctl_pointer->offset + sg_state->length; - - - local_offset = sg_state->my_ctl_pointer->offset; - remote_offset = sg_state->parent_ctl_pointer->offset + - sg_state->length; - - mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &sg_state->parent_ctl_pointer->portals_buf_addr,local_offset, - remote_offset,sg_state->length); - - /* Now post the message for other children to read */ - extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < - sg_state->group_size ) ? 1: 0; - scatter_posts = sg_state->my_ctl_pointer->n_sends; - allgather_posts = sg_state->pow_2_levels - 1; - - total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; - - if (total_msg_posts > 0) { - mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, - sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, - PTL_EQ_NONE, - total_msg_posts, - blocked_post, - PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE - | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); - - sg_state->msg_posted = true; - } - /* - opal_atomic_wmb(); - */ - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - return OMPI_SUCCESS; -} - -/* - * Bcast's Allgather Phase: - * Combines data from all processes using recursive doubling algorithm - */ -static inline __opal_attribute_always_inline__ -int sm_portals_bcasts_allgather_phase(sg_state_t *sg_state) -{ - int ag_loop, partner; - volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive double */ - - - for( ag_loop = 1; ag_loop < sg_state->pow_2_levels; ag_loop++) { - /* get my partner for this level */ - partner = sg_state->my_rank^(1<ctl_structs[partner]; - - - /* Block until partner is at this level of recursive-doubling stage */ - while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)) { - opal_progress(); - } - assert(partner_ctl_pointer->flag >= sg_state->ready_flag); - - if (partner_ctl_pointer->offset < sg_state->my_ctl_pointer->offset) { - sg_state->global_sg_offset -= sg_state->length; - sg_state->local_sg_offset = sg_state->global_sg_offset; - } else { - sg_state->local_sg_offset = sg_state->global_sg_offset + sg_state->length; - } - - - BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", - partner, sg_state->length)); - 
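/*
 * Editorial sketch, not part of the deleted sources: this loop pairs ranks
 * by XOR with the current level bit; whichever partner owns the lower
 * offset prepends the incoming chunk and the other appends it, so after
 * each round the pair holds one contiguous region of twice the length.
 * Hypothetical stand-alone version of the offset bookkeeping:
 */
#include <stdint.h>

static uint64_t next_read_offset(uint64_t *global_off, uint64_t my_off,
                                 uint64_t partner_off, uint64_t length)
{
    if (partner_off < my_off) {
        *global_off -= length;    /* partner's chunk sits just below mine */
        return *global_off;
    }
    return *global_off + length;  /* partner's chunk sits just above mine */
}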
mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, - sg_state->read_eq, - &sg_state->my_ctl_pointer->portals_buf_addr, - &partner_ctl_pointer->portals_buf_addr,sg_state->local_sg_offset, - sg_state->local_sg_offset, sg_state->length); - - sg_state->ready_flag++; - opal_atomic_wmb(); - sg_state->my_ctl_pointer->flag = sg_state->ready_flag; - - /* Block until partner is at this level of recursive-doubling stage */ - while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag, - sg_state->sequence_number)) { - opal_progress(); - } - - /* double the length */ - sg_state->length *= 2; - } - - return OMPI_SUCCESS; - -} - - -static inline __opal_attribute_always_inline__ -int init_sm_group_info(sg_state_t *sg_state, int buff_idx) -{ - int idx, leading_dim; - int first_instance=0; - int flag_offset; - - /* Get addresing information */ - sg_state->group_size = sg_state->bcol_module->colls_no_user_data.size_of_group; - leading_dim = sg_state->bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - BASESMUMA_VERBOSE(1,("My buffer idx %d group size %d, leading dim %d, idx %d", - buff_idx,sg_state->group_size,leading_dim,idx)); - /* grab the ctl buffs */ - sg_state->ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **) - sg_state->bcol_module->colls_with_user_data.ctl_buffs+idx; - - sg_state->my_rank = sg_state->bcol_module->super.sbgp_partner_module->my_index; - sg_state->my_ctl_pointer = sg_state->ctl_structs[sg_state->my_rank]; - - if (sg_state->my_ctl_pointer->sequence_number < sg_state->sequence_number) { - first_instance = 1; - } - - if(first_instance) { - sg_state->my_ctl_pointer->flag = -1; - sg_state->my_ctl_pointer->index = 1; - - sg_state->my_ctl_pointer->starting_flag_value = 0; - flag_offset = 0; - - } else { - sg_state->my_ctl_pointer->index++; - } - - /* For bcast we shud have only entry to this bcol - assert(sg_state->my_ctl_pointer->flag == -1); - */ - - /* increment the starting flag by one and return */ - flag_offset = sg_state->my_ctl_pointer->starting_flag_value; - sg_state->ready_flag = flag_offset + sg_state->sequence_number + 1; - - sg_state->my_ctl_pointer->sequence_number = sg_state->sequence_number; - - return OMPI_SUCCESS; - -} - -static inline __opal_attribute_always_inline__ -int init_sm_portals_sg_info(sg_state_t *sg_state) -{ -/* Get portals info*/ - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - int rc = OMPI_SUCCESS; - int sg_matchbits; - - portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)sg_state->cs->portals_info; - - sg_matchbits = sg_state->sequence_number ; - - /* Construct my portal buffer address and copy to payload buffer */ - mca_bcol_basesmuma_construct_portal_address(&sg_state->my_ctl_pointer->portals_buf_addr, - portals_info->portal_id.nid, - portals_info->portal_id.pid, - sg_matchbits, - sg_state->bcol_module->super.sbgp_partner_module->group_comm->c_contextid); - - sg_state->my_ctl_pointer->portals_buf_addr.userbuf = sg_state->my_userbuf; - sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length = sg_state->fragment_size; - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ -int compute_src_from_root(int group_root, int my_group_rank, int pow2, int - group_size) -{ - - int root, relative_rank, src, i; - - if (group_root < pow2) { - root = group_root; - } else { - /* the source of the data is extra node, - the real root it represented by some rank from - pow2 group */ - root = group_root - pow2; - /* shortcut for the case when my rank is root for the group 
*/ - if (my_group_rank == root) { - return group_root; - } - } - - relative_rank = (my_group_rank - root) < 0 ? my_group_rank - root + pow2 : - my_group_rank - root; - - for (i = 1; i < pow2; i<<=1) { - if (relative_rank & i) { - src = my_group_rank ^ i; - if (src >= pow2) - src -= pow2; - - return src; - } - } - - return -1; -} - -int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c deleted file mode 100644 index a1454102a8..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -/* #define __PORTALS_AVAIL__ */ -#ifdef __PORTALS_AVAIL__ - -#define __PORTALS_ENABLE__ -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" -#include "bcol_basesmuma_utils.h" - -#include "bcol_basesmuma_portals.h" - -/* debug */ -#include -/* end debug */ - - -/** - * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers. - * This routine assumes that buf (the input buffer) is a single writer - * multi reader (SWMR) shared memory buffer owned by the calling rank - * which is the only rank that can write to this buffers. - * It is also assumed that the buffers are registered and fragmented - * at the ML level and that buf is sufficiently large to hold the data. - * - * - * @param buf - SWMR shared buffer within a sbgp that the - * executing rank can write to. - * @param count - the number of elements in the shared buffer. - * @param dtype - the datatype of a shared buffer element. - * @param root - the index within the sbgp of the root. - * @param module - basesmuma module. 
- */ -int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ -#if 0 - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i, matched = 0; - int src=-1; - int group_size; - int my_rank, first_instance=0, flag_offset; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int64_t sequence_number=input_args->sequence_num; - - volatile int64_t ready_flag; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char* parent_data_pointer; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - void *userbuf = (void *)((unsigned char *)input_args->userbuf); - - size_t pack_len = 0, dt_size; - - struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; - struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload; - - /* setup resource recycling */ - if( my_ctl_pointer->sequence_number < sequence_number ) { - first_instance=1; - } - - if( first_instance ) { - /* Signal arrival */ - my_ctl_pointer->flag = -1; - my_ctl_pointer->index=1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl_pointer->starting_flag_value=0; - flag_offset=0; - - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - my_ctl_pointer->sequence_number = sequence_number; - - - /* Construct my portal buffer address and copy to payload buffer */ - mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, - portals_info->portal_id.nid, - portals_info->portal_id.pid, - sequence_number, - bcol_module->super.sbgp_partner_module->group_comm->c_contextid); - - /* non-blocking broadcast algorithm */ - - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - ptl_handle_eq_t eq_h; - ptl_event_t event; - int ret; - - BASESMUMA_VERBOSE(10,("I am the root of the data")); - - /* create an event queue for the incoming buffer */ - ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h); - - if (ret != PTL_OK) { - 
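/* editorial note: this is the failure branch, so there is no queue to
 * free here; on success the matching PtlEQFree() is called once the root
 * has drained its children's get events, just before it jumps to Release */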
fprintf(stderr, "PtlEQAlloc() failed: %d \n",ret); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* Post the message using portal copy */ - - mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, - pack_len, eq_h, my_lmsg_ctl_pointer->nsends); - - /* - * signal ready flag - */ - my_ctl_pointer->flag = ready_flag; - - /* wait for a response from the client */ - mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, - &event, my_lmsg_ctl_pointer->nsends); - - /* free the event queue */ - ret = PtlEQFree(eq_h); - if (ret != PTL_OK) { - fprintf(stderr, "PtlEQFree() failed: %d )\n",ret); - } - - /* root is finished */ - goto Release; - } - - /* If I am not the root, then poll on possible "senders'" control structs */ - for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { - - /* Shared memory iprobe */ - /* - BCOL_BASESMUMA_SM_PROBE(bcol_module->src, bcol_module->src_size, - my_rank, matched, src); - */ - do { - int j, n_src, my_index; - n_src = bcol_module->src_size; - - for( j = 0; j < n_src; j++) { - parent_ctl_pointer = data_buffs[bcol_module->src[j]].ctl_struct; - parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) - data_buffs[bcol_module->src[j]].payload; - if (IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { - - src = bcol_module->src[j]; - matched = 1; - break; - } - } - } while(0); - - } - - /* If not matched, then hop out and put me on progress list */ - if(0 == matched ) { - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - return BCOL_FN_NOT_STARTED; - } - - /* else, we found our root within the group ... */ - BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src)); - - /* receive the data from sender */ - /* get the data buff */ - /* taken care of in the macro */ - /*parent_data_pointer = data_buffs[src].payload;*/ - /* copy the data */ - mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len); - - /* set the memory barrier to ensure completion */ - opal_atomic_wmb (); - /* signal that I am done */ - my_ctl_pointer->flag = ready_flag; - - /* am I the last one? 
If so, release buffer */ - -Release: - my_ctl_pointer->starting_flag_value++; - - return BCOL_FN_COMPLETE; -#endif -} - -#if 0 - -#define BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index, \ - my_group_index, group_size,sm_data_buffs,sender_ready_flag, \ - num_pending_sends) \ -{ \ - int k, rc; \ - int dst; \ - int comm_dst; \ - volatile mca_bcol_basesmuma_header_t *recv_ctl_pointer = NULL; \ - volatile mca_bcol_basesmuma_portal_buf_addr_t *recv_lmsg_ctl_pointer = NULL; \ - \ - num_pending_sends = 0; \ - while(radix_mask > 0) { \ - /* For each level of tree, do sends */ \ - for (k = 1; \ - k < radix && my_relative_index + radix_mask * k < group_size; \ - ++k) { \ - \ - dst = my_group_index + radix_mask * k; \ - if (dst >= group_size) { \ - dst -= group_size; \ - } \ - /* Signal the children to get data */ \ - recv_ctl_pointer = data_buffs[dst].ctl; \ - recv_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) \ - data_buffs[dst].payload; \ - recv_lmsg_ctl_pointer->src_index = my_group_index; \ - recv_lmsg_ctl_pointer->flag = sender_ready_flag; \ - ++num_pending_sends; \ - } \ - radix_mask /= radix; \ - } \ - \ -} - - - -int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i, matched = 0; - int src=-1; - int group_size; - int my_rank, first_instance=0, flag_offset; - int rc = OMPI_SUCCESS; - int leading_dim, buff_idx, idx; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int64_t sequence_number=input_args->sequence_num; - - volatile int64_t ready_flag; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char* parent_data_pointer; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - void *userbuf = (void *)((unsigned char *)input_args->userbuf); - - size_t pack_len = 0, dt_size; - - struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; - struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; - mca_bcol_basesmuma_portal_proc_info_t *portals_info; - portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; - - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; - buff_idx = input_args->src_desc->buffer_index; - - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload; - - /* setup resource recycling */ - if( my_ctl_pointer->sequence_number < sequence_number ) { - first_instance=1; - } - - if( first_instance ) { - /* Signal arrival */ - my_ctl_pointer->flag = -1; - my_ctl_pointer->index=1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl_pointer->starting_flag_value=0; 
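/*
 * Editorial sketch, not part of the deleted sources: the recycling logic
 * around this point derives each instance's ready flag from the buffer's
 * persistent starting_flag_value plus the collective's sequence number,
 * and a peer counts as ready once it has published at least that flag for
 * the same sequence number (cf. the IS_DATA_READY()/IS_SG_DATA_READY()
 * checks used throughout).  Hypothetical stand-alone version:
 */
#include <stdint.h>
#include <stdbool.h>

struct ctl_sketch {
    volatile int64_t sequence_number;
    volatile int64_t flag;
    int64_t starting_flag_value;
};

static int64_t compute_ready_flag(const struct ctl_sketch *my_ctl,
                                  int64_t sequence_number)
{
    /* same arithmetic as flag_offset + sequence_number + 1 above */
    return my_ctl->starting_flag_value + sequence_number + 1;
}

static bool peer_is_ready(const struct ctl_sketch *peer, int64_t ready_flag,
                          int64_t sequence_number)
{
    return peer->sequence_number == sequence_number &&
           peer->flag >= ready_flag;
}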
- flag_offset=0; - - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl_pointer->index++; - } - - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - my_ctl_pointer->sequence_number = sequence_number; - - - /* Construct my portal buffer address and copy to payload buffer */ - mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, - portals_info->portal_id.nid, - portals_info->portal_id.pid, - sequence_number, - bcol_module->super.sbgp_partner_module->group_comm->c_contextid); - - my_lmsg_ctl_pointer->userbuf = userbuff; - my_lsmg_ctl_pointer->userbuf_length = fragment_length; - /* create an event queue */ - ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h); - - /* non-blocking broadcast algorithm */ - - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - ptl_handle_eq_t eq_h; - ptl_event_t event; - int ret; - int root_radix_mask = sm_module->pow_knum; - - BASESMUMA_VERBOSE(10,("I am the root of the data")); - - - if (ret != PTL_OK) { - fprintf(stderr, "PtlEQAlloc() failed: %d \n",ret); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - BASESMUMA_K_NOMIAL_SEND_SIGNAL(root_radix_mask, radix, 0, - my_rank, group_size, data_buffs, ready_flag, nsends) ; - - mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, - pack_len, eq_h, nsends); - - /* wait for a response from the client */ - mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, - &event, nsends); - - /* root is finished */ - goto Release; - } - - /* Im not a root so wait until someone puts data and - * compute where to get data from */ - - while (my_ctl_pointer->flag != ready_flag) ; - - my_data_source_index = lmsg_ctl_pointer->src_index; - - parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) - data_buffs[my_data_source_index].payload; - - mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len); - - - - - /* I am done getting data, should I send the data to someone */ - - my_relative_index = (my_rank - my_data_source_index) < 0 ? my_rank - - my_data_source_index + group_size : my_rank - my_data_source_index; - - /* - * 2. 
Locate myself in the tree: - * calculate number of radix steps that we should to take - */ - radix_mask = 1; - while (radix_mask < group_size) { - if (0 != my_relative_index % (radix * radix_mask)) { - /* I found my level in tree */ - break; - } - radix_mask *= radix; - } - - /* go one step back */ - radix_mask /=radix; - - BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index, - my_rank, group_size,data_buffs,ready_flag,nsends) - - mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, - pack_len, eq_h, nsends); - - /* wait for childrens to read */ - mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, - &event, nsends); - - - -Release: - /* free the event queue */ - ret = PtlEQFree(eq_h); - if (ret != PTL_OK) { - fprintf(stderr, "PtlEQFree() failed: %d )\n",ret); - } - - - my_ctl_pointer->starting_flag_value++; - - return BCOL_FN_COMPLETE; -} - -#endif -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_mem_mgmt.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_mem_mgmt.c deleted file mode 100644 index eff6697ec2..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_mem_mgmt.c +++ /dev/null @@ -1,101 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "bcol_basesmuma.h" - - -/* Shared memory registration function: Calls into the "shared memory - connection manager" (aka - smcm) and registers a chunk of memory by - opening and mmaping a file. - - @input: - - void *reg_data - shared memory specific data needed by the registration - function. - - void *base - pointer to memory address. - - size_t size - size of memory chunk to be registered with sm. - - mca_mpool_base_registration_t *reg - registration data is cached here. - - @output: - - returns OMPI_SUCCESS on successful registration. - - returns OMPI_ERROR on failure. - -*/ - -int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size, - void **reg_desc) -{ - - /* local variables */ - int ret = OMPI_SUCCESS; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - bcol_basesmuma_registration_data_t *sm_reg = - (bcol_basesmuma_registration_data_t*) context_data; - - /* cache some info on sm_reg aka "context_data", you'll need it later */ - sm_reg->base_addr = base; - sm_reg->size = size; - - /* call into the shared memory registration function in smcm - * we need to be sure that the memory is page aligned in order - * to "map_fixed" - */ - sm_reg->sm_mmap = bcol_basesmuma_smcm_mem_reg(base, size, - sm_reg->data_seg_alignment, - sm_reg->file_name); - if(NULL == sm_reg->sm_mmap) { - opal_output (ompi_bcol_base_framework.framework_output, "Bcol_basesmuma memory registration error"); - return OMPI_ERROR; - } - - /* don't let other communicators re-register me! */ - cs->mpool_inited = true; - /* alias back to component */ - cs->sm_payload_structs = sm_reg->sm_mmap; - - return ret; -} - -/* Shared memory deregistration function - deregisters memory by munmapping it and removing the - shared memory file. - - Basic steps (please let me know if this is incompatible with your notion of deregistration - or if it causes problems on cleanup): - - 1. munmap the shared memory file. - 2. 
set the base pointer to the mmaped memory to NULL. - 3. permanently remove the shared memory file from the directory. - -*/ - -int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg) -{ - /* local variables */ - bcol_basesmuma_registration_data_t *sm_reg = - (bcol_basesmuma_registration_data_t*) context_data; - - if (sm_reg->sm_mmap) { - OBJ_RELEASE(sm_reg->sm_mmap); - } - - /* set the pointer to NULL */ - sm_reg->base_addr = NULL; - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c deleted file mode 100644 index 8770689ed2..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c +++ /dev/null @@ -1,687 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/patterns/net/netpatterns.h" - -#include "opal/util/show_help.h" -#include "opal/align.h" - -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h" -#include "bcol_basesmuma.h" -#include "bcol_basesmuma_utils.h" - -#ifdef __PORTALS_AVAIL__ -#include "bcol_basesmuma_portals.h" -#endif - - -/* - * Local functions - */ -static int alloc_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module) -{ - int rc = OMPI_SUCCESS, i = 0; - netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree; - int n_exchanges = k_node->n_exchanges; - - /* Precalculate the allreduce offsets */ - if (0 < k_node->n_exchanges) { - sm_module->reduce_offsets = (int **)malloc(n_exchanges * sizeof(int*)); - - if (!sm_module->reduce_offsets) { - rc = OMPI_ERROR; - return rc; - } - - for (i=0; i < n_exchanges ; i++) { - sm_module->reduce_offsets[i] = (int *)malloc (sizeof(int) * NOFFSETS); - - if (!sm_module->reduce_offsets[i]){ - rc = OMPI_ERROR; - return rc; - } - } - } - return rc; -} - -static int free_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module) -{ - int rc = OMPI_SUCCESS, i = 0; - netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree; - int n_exchanges = k_node->n_exchanges; - - if (sm_module->reduce_offsets) { - for (i=0; i < n_exchanges; i++) { - free (sm_module->reduce_offsets[i]); - } - - free(sm_module->reduce_offsets); - } - return rc; -} - -static void -mca_bcol_basesmuma_module_construct(mca_bcol_basesmuma_module_t *module) -{ - /* initialize all values to 0 */ - memset((void*)((uintptr_t) module + sizeof (module->super)), 0, sizeof (*module) - sizeof (module->super)); - module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_basesmuma_component; - module->super.list_n_connected = NULL; - module->super.hier_scather_offset = 0; -} - -static void -mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) -{ - /* local variables */ - mca_sbgp_base_module_t *sbgp_module = sm_module->super.sbgp_partner_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - - /* - * 
release allocated resrouces - */ - - /* ...but not until you're sure you have no outstanding collectives */ - while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) { - opal_progress(); - } - -#ifdef __PORTALS_AVAIL__ - /* Remove portals bcast specific resources */ - if ( PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) { - BASESMUMA_VERBOSE(10,("PtlEQFree() failed: )")); - } -#endif - - /* Remove Lmsg Reduce Offsets Array */ - free_lmsg_reduce_offsets_array(sm_module); - - /* collective topology data */ - if( sm_module->fanout_read_tree) { - for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) { - if(0 < sm_module->fanout_read_tree[i].n_children ) { - free(sm_module->fanout_read_tree[i].children_ranks); - sm_module->fanout_read_tree[i].children_ranks=NULL; - } - } - free(sm_module->fanout_read_tree); - sm_module->fanout_read_tree=NULL; - } - - /* gvm Leak FIX Reduction_tree[].children_ranks has - * to be removed. I don't how to get the size (which is - * size of subgroup) of array reduction_tree - */ - if( sm_module->reduction_tree) { - for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) { - if(0 < sm_module->reduction_tree[i].n_children ) { - free(sm_module->reduction_tree[i].children_ranks); - sm_module->reduction_tree[i].children_ranks=NULL; - } - } - free(sm_module->reduction_tree); - sm_module->reduction_tree=NULL; - } - - /* gvm Leak FIX */ - if (sm_module->fanout_node.children_ranks){ - free(sm_module->fanout_node.children_ranks); - sm_module->fanout_node.children_ranks = NULL; - } - - if (sm_module->fanin_node.children_ranks){ - free(sm_module->fanin_node.children_ranks); - sm_module->fanin_node.children_ranks = NULL; - } - - /* colls_no_user_data resrouces */ - if(sm_module->colls_no_user_data.ctl_buffs_mgmt){ - free(sm_module->colls_no_user_data.ctl_buffs_mgmt); - sm_module->colls_no_user_data.ctl_buffs_mgmt=NULL; - } - if(sm_module->colls_no_user_data.ctl_buffs){ - free(sm_module->colls_no_user_data.ctl_buffs); - sm_module->colls_no_user_data.ctl_buffs=NULL; - } - - /* return control */ - opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->no_userdata_ctl); - - /* colls_with_user_data resrouces */ - /* - *debug print */ - /* - fprintf(stderr,"AAA colls_with_user_data.ctl_buffs %p \n", - sm_module->colls_with_user_data.ctl_buffs_mgmt); - end debug */ - - if(sm_module->colls_with_user_data.ctl_buffs_mgmt){ - free(sm_module->colls_with_user_data.ctl_buffs_mgmt); - sm_module->colls_with_user_data.ctl_buffs_mgmt=NULL; - } - if(sm_module->colls_with_user_data.ctl_buffs){ - free(sm_module->colls_with_user_data.ctl_buffs); - sm_module->colls_with_user_data.ctl_buffs=NULL; - } - - if(sm_module->shared_memory_scratch_space) { - free(sm_module->shared_memory_scratch_space); - sm_module->shared_memory_scratch_space=NULL; - } - - /* return control */ - opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->userdata_ctl); - -#if 1 - if(sm_module->scatter_kary_tree) { - for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) { - if(0 < sm_module->scatter_kary_tree[i].n_children) { - free(sm_module->scatter_kary_tree[i].children_ranks); - sm_module->scatter_kary_tree[i].children_ranks=NULL; - } - } - free(sm_module->scatter_kary_tree); - } -#endif - - if(NULL != sm_module->super.list_n_connected ){ - free(sm_module->super.list_n_connected); - sm_module->super.list_n_connected = NULL; - } - - cleanup_nb_coll_buff_desc(&sm_module->ml_mem.nb_coll_desc, - sm_module->ml_mem.num_banks, - sm_module->ml_mem.num_buffers_per_bank); - - 
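/*
 * Editorial sketch, not part of the deleted sources: the destructor above
 * repeats one teardown pattern -- every per-rank tree node owns a
 * children_ranks array that must be freed before the node array itself.
 * Hypothetical helper capturing that pattern:
 */
#include <stdlib.h>

struct tree_node_sketch {
    int n_children;
    int *children_ranks;
};

static void free_tree_nodes(struct tree_node_sketch *nodes, int n_nodes)
{
    int i;

    if (NULL == nodes) {
        return;
    }
    for (i = 0; i < n_nodes; i++) {
        if (0 < nodes[i].n_children) {
            free(nodes[i].children_ranks);
            nodes[i].children_ranks = NULL;
        }
    }
    free(nodes);
}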
for (int i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++){ - /* gvm FIX: Go through the list and destroy each item */ - /* Destroy the function table object for each bcol type list */ - OPAL_LIST_DESTRUCT((&sm_module->super.bcol_fns_table[i])); - } - - if (NULL != sm_module->payload_backing_files_info) { - bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list, - &sm_module->payload_backing_files_info); - } - - if (NULL != sm_module->ctl_backing_files_info) { - bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list, - &sm_module->ctl_backing_files_info); - } - - if (NULL != sm_module->ml_mem.bank_release_counter) { - free(sm_module->ml_mem.bank_release_counter); - sm_module->ml_mem.bank_release_counter = NULL; - } - - if (NULL != sm_module->colls_with_user_data.data_buffs) { - free((void *)sm_module->colls_with_user_data.data_buffs); - sm_module->colls_with_user_data.data_buffs = NULL; - } - - /* free the k-nomial allgather tree here */ - netpatterns_cleanup_recursive_knomial_allgather_tree_node(&sm_module->knomial_allgather_tree); - netpatterns_cleanup_recursive_doubling_tree_node(&sm_module->recursive_doubling_tree); - netpatterns_cleanup_recursive_knomial_tree_node(&sm_module->knomial_exchange_tree); - - /* done */ -} - -static void bcol_basesmuma_set_small_msg_thresholds(struct mca_bcol_base_module_t *super) -{ - mca_bcol_basesmuma_module_t *basesmuma_module = - (mca_bcol_basesmuma_module_t *) super; - - size_t basesmuma_offset = bcol_basesmuma_data_offset_calc(basesmuma_module); - - /* Set the Allreduce threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_ALLREDUCE] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; - - /* Set the Bcast threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_BCAST] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; - - /* Set the Gather threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_GATHER] = - (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / - ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); - - /* Set the ALLgather threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_ALLGATHER] = - (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / - ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); - - /* Set the Reduce threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_REDUCE] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; - - /* Set the Scatter threshold, for Basesmuma it equals to ML buffer size - data offset */ - super->small_message_thresholds[BCOL_SCATTER] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; -} - -/* setup memory management and collective routines */ - -static void load_func(mca_bcol_base_module_t *super) -{ - int fnc; - - /* Loading memory management and collective functions */ - - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - super->bcol_function_table[fnc] = NULL; - } - - /*super->bcol_function_table[BCOL_BARRIER] = bcol_basesmuma_recursive_double_barrier;*/ - -#ifdef __PORTALS_AVAIL__ - super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_lmsg_scatter_allgather_portals_bcast; - /* 
super->bcol_function_table[BCOL_BCAST] = - bcol_basesmuma_lmsg_bcast_k_nomial_anyroot; */ -#endif - - /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;*/ - /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_binary_scatter_allgather_segment;*/ - /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast_k_nomial_anyroot;*/ - super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast; -#ifdef __PORTALS_AVAIL__ - super->bcol_function_table[BCOL_BCAST] = - bcol_basesmuma_lmsg_scatter_allgather_portals_bcast; -#endif - /* super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_fanin_fanout; */ - super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_recursive_doubling; - super->bcol_function_table[BCOL_REDUCE] = bcol_basesmuma_reduce_intra_fanin_old; - /* memory management */ - super->bcol_memory_init = bcol_basesmuma_bank_init_opti; - - super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree; - - /* Set thresholds */ - super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds; -} - -static void load_func_with_choices(mca_bcol_base_module_t *super) -{ - int fnc; - - /* Loading memory management and collective functions */ - - for (fnc=0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - super->bcol_function_init_table[fnc] = NULL; - } - - super->bcol_function_init_table[BCOL_FANIN] = bcol_basesmuma_fanin_init; - super->bcol_function_init_table[BCOL_FANOUT] = bcol_basesmuma_fanout_init; - super->bcol_function_init_table[BCOL_BARRIER] = bcol_basesmuma_barrier_init; - - super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init; - super->bcol_function_init_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_init; - super->bcol_function_init_table[BCOL_REDUCE] = bcol_basesmuma_reduce_init; - super->bcol_function_init_table[BCOL_GATHER] = bcol_basesmuma_gather_init; - super->bcol_function_init_table[BCOL_ALLGATHER] = bcol_basesmuma_allgather_init; - super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init; - /* memory management */ - super->bcol_memory_init = bcol_basesmuma_bank_init_opti; - - super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree; - -} - -static int load_recursive_knomial_info(mca_bcol_basesmuma_module_t - *sm_module) -{ - int rc = OMPI_SUCCESS; - rc = netpatterns_setup_recursive_knomial_tree_node(sm_module->super.sbgp_partner_module->group_size, - sm_module->super.sbgp_partner_module->my_index, - mca_bcol_basesmuma_component.k_nomial_radix, - &sm_module->knomial_exchange_tree); - return rc; -} - - -int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super) -{ - mca_bcol_basesmuma_module_t *sm_module = (mca_bcol_basesmuma_module_t *) super; - - return netpatterns_setup_recursive_knomial_allgather_tree_node(sm_module->super.sbgp_partner_module->group_size, - sm_module->super.sbgp_partner_module->my_index, - mca_bcol_basesmuma_component.k_nomial_radix, - super->list_n_connected, - &sm_module->knomial_allgather_tree); -} - - - - -/* query to see if the module is available for use on the given - * communicator, and if so, what its priority is. This is where - * the backing shared-memory file is created.
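 *
 * A minimal caller-side sketch, assuming only the signature below, with
 * error handling elided:
 *
 *   int num_modules = 0;
 *   mca_bcol_base_module_t **modules =
 *       mca_bcol_basesmuma_comm_query(sbgp_module, &num_modules);
 *   if (NULL == modules) {
 *       return NULL;                (module unavailable for this group)
 *   }
 *   mca_bcol_base_module_t *bcol = modules[0];  (always one module here)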
- */ -mca_bcol_base_module_t ** -mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) -{ - /* local variables */ - mca_bcol_base_module_t **sm_modules = NULL; - mca_bcol_basesmuma_module_t *sm_module; - bcol_basesmuma_registration_data_t *sm_reg_data; - int ret, my_rank, name_length; - char *name; - int i; - - int bcast_radix; - - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - /*mca_base_component_list_item_t *hdl_cli = NULL;*/ - /*int hdl_num;*/ - - /* at this point I think there is only a sinle shared - memory bcol that we need to be concerned with */ - - /* No group, no modules */ - if (OPAL_UNLIKELY(NULL == module)) { - return NULL; - } - - /* allocate and initialize an sm_bcol module */ - sm_module = OBJ_NEW(mca_bcol_basesmuma_module_t); - - /* set the subgroup */ - sm_module->super.sbgp_partner_module=module; - - (*num_modules)=1; - cs->super.n_net_contexts = *num_modules; - sm_module->reduction_tree = NULL; - sm_module->fanout_read_tree = NULL; - - ret=netpatterns_setup_recursive_doubling_tree_node( - module->group_size,module->my_index, - &(sm_module->recursive_doubling_tree)); - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "Error setting up recursive_doubling_tree \n"); - return NULL; - } - - /* setup the fanin tree - this is used only as part of a hierarchical - * barrier, so will set this up with rank 0 as the root */ - my_rank=module->my_index; - ret=netpatterns_setup_narray_tree(cs->radix_fanin, - my_rank,module->group_size,&(sm_module->fanin_node)); - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanin tree \n"); - return NULL; - } - - /* setup the fanout tree - this is used only as part of a hierarchical - * barrier, so will set this up with rank 0 as the root */ - ret=netpatterns_setup_narray_tree(cs->radix_fanout, - my_rank,module->group_size,&(sm_module->fanout_node)); - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanout tree \n"); - return NULL; - } - - /* - * Setup the broadcast tree - this is used only as part of a hierarchical - * bcast, so will set this up with rank 0 as the root. 
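 *
 * Note that the loop below actually builds one n-ary tree per group
 * member, not only the rank-0 tree: a broadcast rooted at any rank i
 * can then use the precomputed fanout_read_tree[i] without further
 * setup.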
- */ - - /* set the radix of the bcast tree */ - bcast_radix = cs->radix_read_tree; - - /* initialize fan-out read tree */ - sm_module->fanout_read_tree=(netpatterns_tree_node_t*) malloc( - sizeof(netpatterns_tree_node_t)*module->group_size); - if( NULL == sm_module->fanout_read_tree ) { - goto Error; - } - - for(i = 0; i < module->group_size; i++){ - ret = netpatterns_setup_narray_tree(bcast_radix, - i, module->group_size, &(sm_module->fanout_read_tree[i])); - if(OMPI_SUCCESS != ret) { - goto Error; - } - } - - ret = load_recursive_knomial_info(sm_module); - if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree")); - goto Error; - } - - /* Allocate offsets array for lmsg reduce */ - ret = alloc_lmsg_reduce_offsets_array(sm_module); - if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array")); - goto Error; - } - - /* initialize reduction tree */ - sm_module->reduction_tree=(netpatterns_tree_node_t *) malloc( - sizeof(netpatterns_tree_node_t )*module->group_size); - if( NULL == sm_module->reduction_tree ) { - goto Error; - } - - ret=netpatterns_setup_multinomial_tree( - cs->order_reduction_tree,module->group_size, - sm_module->reduction_tree); - if( MPI_SUCCESS != ret ) { - goto Error; - } - - /* get largest power of k for given group size */ - sm_module->pow_k_levels = pow_sm_k(cs->k_nomial_radix, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->pow_k)); - - /* get largest power of 2 for a given group size - * used in scatter allgather - */ - sm_module->pow_2_levels = pow_sm_k(2, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->pow_2)); - - /* - * setup scatter data - */ - sm_module->scatter_kary_radix=cs->scatter_kary_radix; - sm_module->scatter_kary_tree=NULL; - ret=netpatterns_setup_narray_tree_contigous_ranks( - sm_module->scatter_kary_radix, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->scatter_kary_tree)); - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_masesmuma_setup_library_buffers and scatter k-ary tree setup failed \n"); - return NULL; - } - - /* setup the module shared memory management */ - ret=base_bcol_basesmuma_setup_library_buffers(sm_module, cs); - - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_masesmuma_setup_library_buffers and mpool was not successfully setup!\n"); - return NULL; - } - - /* setup the collectives and memory management */ - - /* check to see whether or not the mpool has been inited */ - /* allocate some space for the network contexts */ - if(!cs->mpool_inited) { - /* if it's empty, then fill it for first time */ - cs->super.network_contexts = (bcol_base_network_context_t **) - malloc((cs->super.n_net_contexts)* - sizeof(bcol_base_network_context_t *)); - /* you need to do some basic setup - define the file name, - * set data seg alignment and size of cntl structure in sm - * file. 
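 *
 * Schematically, the payload file name assembled below is
 *
 *   <job_session_dir>/0<payload_base_fname><pid>
 *
 * so distinct jobs (different session directories) and distinct
 * processes within a job (different pids) never collide on a backing
 * file.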
- */ - /* give the payload sm file a name */ - name_length=asprintf(&name, - "%s"OPAL_PATH_SEP"0%s%0d", - ompi_process_info.job_session_dir, - cs->payload_base_fname, - (int)getpid()); - if( 0 > name_length ) { - opal_output (ompi_bcol_base_framework.framework_output, "Failed to assign the shared memory payload file a name\n"); - return NULL; - } - /* make sure name is not too long */ - if ( OPAL_PATH_MAX < (name_length-1) ) { - opal_output (ompi_bcol_base_framework.framework_output, "Shared memory file name is too long!\n"); - return NULL; - } - /* set the name and alignment characteristics */ - sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc( - sizeof(bcol_basesmuma_registration_data_t)); - sm_reg_data->file_name = name; - - sm_reg_data->data_seg_alignment = getpagesize(); - sm_reg_data->size_ctl_structure = 0; - cs->super.network_contexts[0] = (bcol_base_network_context_t *) - malloc(sizeof(bcol_base_network_context_t)); - cs->super.network_contexts[0]->context_data = - (void *) sm_reg_data; - cs->super.network_contexts[0]-> - register_memory_fn = mca_bcol_basesmuma_register_sm; - cs->super.network_contexts[0]-> - deregister_memory_fn = mca_bcol_basesmuma_deregister_sm; - sm_module->super.network_context = cs->super.network_contexts[0]; - } else { - - sm_module->super.network_context = cs->super.network_contexts[0]; - } - - /* Set the header size */ - sm_module->super.header_size = sizeof(mca_bcol_basesmuma_header_t); - - /*initialize the hdl module if it's to be enabled*/ -#if 0 - if (module->use_hdl) { - sm_module->super.use_hdl = module->use_hdl; - hdl_cli = (mca_base_component_list_item_t *) - opal_list_get_first(&mca_hdl_base_components_in_use); - sm_module->hdl_module = ((mca_hdl_base_component_t*) - hdl_cli->cli_component)->hdl_comm_query(sm_module, &hdl_num); - if (1 != hdl_num || sm_module->hdl_module == NULL) { - ML_ERROR(("hdl modules are not successfully initialized!\n")); - goto Error; - } - } else { - sm_module->hdl_module = NULL; - } -#else - sm_module->hdl_module = NULL; -#endif - - - /* collective setup */ - load_func(&(sm_module->super)); - load_func_with_choices(&(sm_module->super)); - - /* - * This initializes all collective algorithms - */ - - ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super)); - - if (OMPI_SUCCESS != ret) { - - goto Error; - } - - sm_module->super.supported_mode = 0; - - /* NTH: this is not set anywhere on the trunk as of 08/13/13 */ -#if 0 - if (module->use_hdl) { - sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY; - } -#endif - - /* Initializes portals library required for basesmuma large message */ -#ifdef __PORTALS_AVAIL__ - /* Enable zero copy mode */ - sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY; - - ret = mca_bcol_basesmuma_portals_init(cs); - if (OMPI_SUCCESS != ret) { - return NULL; - } - - sm_module->sg_state.phase = INIT; - - ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq); - - if (ret != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d",ret)); - return NULL; - } - -#endif - /* blocking recursive double barrier test */ - /* - { - opal_output (ompi_bcol_base_framework.framework_output, "BBB About to hit the barrier test\n"); - int rc; - bcol_function_args_t bogus; - rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super)); - rc = bcol_basesmuma_recursive_double_barrier( - &bogus, &(sm_module->super)); - } - */ - - /* in this case we only expect a single network context. 
- in the future we should loop around this */ - sm_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *)); - if( !sm_modules ) { - opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_masesmuma_setup_library_buffers failed to allocate memory for sm_modules\n"); - return NULL; - } - - sm_modules[0] = &(sm_module->super); - - return sm_modules; - - Error: - - /* cleanup */ - if( sm_module->reduction_tree ) { - free(sm_module->reduction_tree); - sm_module->reduction_tree=NULL; - } - - return NULL; -} - -OBJ_CLASS_INSTANCE(mca_bcol_basesmuma_module_t, - mca_bcol_base_module_t, - mca_bcol_basesmuma_module_construct, - mca_bcol_basesmuma_module_destruct); diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_progress.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_progress.c deleted file mode 100644 index 7029c251ab..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_progress.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include "ompi/constants.h" - -#include "bcol_basesmuma.h" - -/* the progress function to be called from the opal progress function - */ -int bcol_basesmuma_progress(void) -{ - /* local variables */ - volatile int32_t *cntr; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - - /* check to see if release of memory blocks needs to be done */ - if( opal_list_get_size(&(cs->nb_admin_barriers)) ) { - sm_nbbar_desc_t *item_ptr; - opal_list_t *list=&(cs->nb_admin_barriers); - /* process only if the list is non-empty */ - if( !OPAL_THREAD_TRYLOCK(&cs->nb_admin_barriers_mutex)) { - - for (item_ptr = (sm_nbbar_desc_t*) opal_list_get_first(list); - item_ptr != (sm_nbbar_desc_t*) opal_list_get_end(list); - item_ptr = (sm_nbbar_desc_t*) opal_list_get_next(item_ptr) ) - { - bcol_basesmuma_rd_nb_barrier_progress_admin(item_ptr); - /* check to see if an complete */ - if( NB_BARRIER_DONE == item_ptr->collective_phase ) { - /* barrier is complete - remove from the list. No need - * to put it on another list, as it is part of the memory - * bank control structure, and will be picked up - * again when needed. - */ - int index= - item_ptr->pool_index; - /* old way - ctl_struct specific */ - /* - volatile uint64_t *cntr= (volatile uint64_t *) - &(item_ptr->sm_module->colls_no_user_data. - ctl_buffs_mgmt[index].bank_gen_counter); - */ - - cntr= (volatile int32_t *) &(item_ptr->coll_buff-> - ctl_buffs_mgmt[index].bank_gen_counter); - item_ptr=(sm_nbbar_desc_t*)opal_list_remove_item((opal_list_t *)list, - ( opal_list_item_t *)item_ptr); - /* increment the generation number */ - OPAL_THREAD_ADD32(cntr,1); - } - } - - OPAL_THREAD_UNLOCK(&cs->nb_admin_barriers_mutex); - } - - } - return OMPI_SUCCESS; - -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c deleted file mode 100644 index 9749491e9f..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. 
All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* Recursive doubling blocking barrier */ - -#include "ompi_config.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/patterns/net/netpatterns.h" - -#include "opal/sys/atomic.h" - -#include "bcol_basesmuma.h" - -#if 0 -int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - - /* local variables */ - int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange, flag_to_set; - int pair_rank, flag_offset; - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - netpatterns_pair_exchange_node_t *my_exchange_node; - int extra_rank, my_rank, pow_2; - volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl; - volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl; - int64_t sequence_number; - bool found; - int buff_index, first_instance=0; - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; -#if 0 - fprintf(stderr,"Entering the sm rd barrier\n"); - fflush(stderr); -#endif - - /* get the pointer to the segment of control structures */ - my_exchange_node=&(bcol_module->recursive_doubling_tree); - my_rank=bcol_module->super.sbgp_partner_module->my_index; - pow_2=bcol_module->super.sbgp_partner_module->pow_2; - - /* figure out what instance of the basesmuma bcol I am */ - leading_dim=bcol_module->colls_no_user_data.size_of_group; - sequence_number=input_args->sequence_num - c_input_args->bcol_module->squence_number_offset; - - buff_index=sequence_number & (bcol_module->colls_no_user_data.mask); - - idx=SM_ARRAY_INDEX(leading_dim,buff_index,0); - ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_no_user_data.ctl_buffs+idx; - my_ctl=ctl_structs[my_rank]; - if( my_ctl->sequence_number < sequence_number ) { - first_instance=1; - } - - /* get the pool index */ - if( first_instance ) { - idx = -1; - while( idx == -1 ) { - - idx=bcol_basesmuma_get_buff_index( - &(bcol_module->colls_no_user_data),sequence_number); - } - if( -1 == idx ){ - return ORTE_ERR_TEMP_OUT_OF_RESOURCE; - } - my_ctl->index=1; - /* this does not need to use any flag values , so only need to - * set the value for subsequent values that may need this */ - my_ctl->starting_flag_value=0; - flag_offset=0; - } else { - /* only one thread at a time will be making progress on this - * collective, so no need to make this atomic */ - my_ctl->index++; - flag_offset=my_ctl->starting_flag_value; - } - - /* signal that I have arrived */ - my_ctl->flag = -1; - /* don't need to set this flag anymore */ - my_ctl->sequence_number = sequence_number; - /* opal_atomic_wmb ();*/ - - if(0 < my_exchange_node->n_extra_sources) { - if (EXCHANGE_NODE == my_exchange_node->node_type) { - volatile int64_t *partner_sn; - int cnt=0; - - /* I will participate in the exchange - wait for signal from extra - ** process */ - extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=(volatile mca_bcol_basesmuma_ctl_struct_t *)ctl_structs[extra_rank]; - - /*partner_ctl=ctl_structs[extra_rank];*/ - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( !found ) - { - if( *partner_sn >= sequence_number ) { - found=true; - } - cnt++; - if( cnt == 1000 ) { - opal_progress(); - cnt=0; - } - } - - } else { - - /* Nothing to do, already registared that I am here */ - } - } - - 
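/*
 * Main recursive-doubling exchange. The pow_2 ranks that participate
 * finish in log_2(pow_2) rounds; in round `exchange` each rank pairs
 * with the rank whose address bit `exchange` differs. A minimal sketch
 * of one round, assuming SHIFT_UP expands to the left-shift operator:
 *
 *   pair_rank = my_rank ^ (1 << exchange);      (partner for this round)
 *   my_ctl->flag = flag_offset + exchange;      (announce my arrival)
 *   then spin on ctl_structs[pair_rank] until its sequence number and
 *   flag show it has reached this round, calling opal_progress() every
 *   1000 polls, exactly as the loop below does.
 */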
for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) { - - volatile int64_t *partner_sn; - volatile int *partner_flag; - int cnt=0; - - /* rank of exchange partner */ - pair_rank = my_rank ^ ( 1 SHIFT_UP exchange ); - partner_ctl=ctl_structs[pair_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - partner_flag=(volatile int *)&(partner_ctl->flag); - - /* signal that I am at iteration exchange of the algorithm */ - flag_to_set=flag_offset+exchange; - my_ctl->flag = flag_to_set; - - /* check to see if the partner has arrived */ - - /* spin n iterations until partner registers */ - found=false; - while( !found ) - { - if( (*partner_sn > sequence_number) || - ( *partner_sn == sequence_number && - *partner_flag >= flag_to_set ) ) { - found=true; - } else { - cnt++; - if( cnt == 1000 ) { - opal_progress(); - cnt=0; - } - } - } - } - - if(0 < my_exchange_node->n_extra_sources) { - if ( EXTRA_NODE == my_exchange_node->node_type ) { - int cnt=0; - - /* I will not participate in the exchange - - * wait for signal from extra partner */ - extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=ctl_structs[extra_rank]; - flag_to_set=flag_offset+my_exchange_node->log_2; - - /* spin n iterations until partner registers */ - found=false; - while( !found ) - { - if (IS_PEER_READY(partner_ctl, flag_to_set, sequence_number)){ - found=true; - } else { - cnt++; - if( cnt == 1000 ) { - opal_progress(); - cnt=0; - } - } - } - - } else { - - /* signal the extra rank that I am done with the recursive - * doubling phase. - */ - flag_to_set=flag_offset+my_exchange_node->log_2; - my_ctl->flag = flag_to_set; - - } - } - - /* if I am the last instance of a basesmuma function in this collective, - * release the resources */ - if (IS_LAST_BCOL_FUNC(c_input_args)){ - idx=bcol_basesmuma_free_buff( - &(bcol_module->colls_no_user_data), - sequence_number); - } else { - /* increment flag value - so next sm collective in the hierarchy - * will not collide with the current one, as they share the - * control structure */ - my_ctl->starting_flag_value+=(my_exchange_node->log_2+1); - } - - /* return */ - return ret; -} -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c deleted file mode 100644 index 60be1a4364..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Copyright (c) 2009-2012 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -/* we need to clean up all of these includes START */ -#include -#include - -#include "ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_basesmuma.h" -#include "opal/sys/atomic.h" -#include "ompi/patterns/net/netpatterns.h" -#include "ompi/mca/bcol/base/base.h" - -/* - * Initialize nonblocking barrier. This code is specific to handling - * the recycling of data, and uses only a single set of control buffers. - * It also assumes that for a given process, only a single outstanding - * barrier operation will occur for a given control structure, - * with the sequence number being used for potential overlap in time - * between successive barrier calls on different processes.
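 *
 * Unlike the blocking variant above, none of the waits here can spin
 * forever: each polls at most n_poll_loops times and, on a miss, caches
 * its position in the descriptor so the progress function can resume
 * where it left off. The descriptor steps through the phases used below:
 *
 *   NB_PRE_PHASE           waiting on the extra (non-power-of-two) rank
 *   NB_RECURSIVE_DOUBLING  mid-exchange; round cached in recursive_dbl_iteration
 *   NB_POST_PHASE          final handshake with the extra rank
 *   NB_BARRIER_DONE        complete; the bank generation counter may be bumped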
- */ -int bcol_basesmuma_rd_nb_barrier_init_admin( - sm_nbbar_desc_t *sm_desc) - -{ - /* local variables */ - int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange; - int pair_rank; - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - netpatterns_pair_exchange_node_t *my_exchange_node; - int extra_rank, my_rank; - mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl; - mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl; - int64_t bank_genaration; - bool found; - int pool_index=sm_desc->pool_index; - mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module; - - /* get the pointer to the segment of control structures */ - idx=sm_desc->coll_buff->number_of_buffs+pool_index; - leading_dim=sm_desc->coll_buff->size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,idx,0); - ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) - sm_desc->coll_buff->ctl_buffs+idx; - bank_genaration= sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter; - - my_exchange_node=&(bcol_module->recursive_doubling_tree); - my_rank=bcol_module->super.sbgp_partner_module->my_index; - my_ctl=ctl_structs[my_rank]; - /* debug print */ - /* - { - int ii; - for(ii = 0; ii < 6; ii++) { - fprintf(stderr,"UUU ctl_struct[%d] := %p\n",ii, - bcol_module->colls_no_user_data.ctl_buffs[ii]); - fflush(stderr); - } - } - */ - /* end debug */ - - /* signal that I have arrived */ - my_ctl->flag = -1; - - opal_atomic_wmb (); - - /* don't need to set this flag anymore */ - my_ctl->sequence_number = bank_genaration; - - if(0 < my_exchange_node->n_extra_sources) { - if (EXCHANGE_NODE == my_exchange_node->node_type) { - volatile int64_t *partner_sn; - /* I will participate in the exchange - wait for signal from extra - ** process */ - extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=ctl_structs[extra_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - if( *partner_sn >= bank_genaration ) { - found=true; - break; - } - loop_cnt++; - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_PRE_PHASE; - return OMPI_SUCCESS; - } - - } else { - - /* Nothing to do, already registared that I am here */ - } - } - - for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) { - - volatile int64_t *partner_sn; - volatile int *partner_flag; - - /* rank of exchange partner */ - pair_rank = my_rank ^ ( 1 SHIFT_UP exchange ); - partner_ctl=ctl_structs[pair_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - partner_flag=(volatile int *)&(partner_ctl->flag); - - /* signal that I am at iteration exchange of the algorithm */ - my_ctl->flag = exchange; - - /* check to see if the partner has arrived */ - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - if( (*partner_sn > bank_genaration) || - ( *partner_sn == bank_genaration && - *partner_flag >= exchange ) ) { - found=true; - break; - } - - loop_cnt++; - - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_RECURSIVE_DOUBLING; - sm_desc->recursive_dbl_iteration=exchange; - return OMPI_SUCCESS; - } - - } - - if(0 < my_exchange_node->n_extra_sources) { - if ( EXTRA_NODE == my_exchange_node->node_type ) { - volatile int64_t *partner_sn; - volatile int *partner_flag; - - /* I will not participate in the exchange - - * wait for signal from extra partner */ - 
extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=ctl_structs[extra_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - partner_flag=(volatile int *)&(partner_ctl->flag); - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - if( (*partner_sn > bank_genaration) || - ( (*partner_sn == bank_genaration) && - (*partner_flag == (my_exchange_node->log_2)) ) ) { - found=true; - break; - } - loop_cnt++; - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_POST_PHASE; - return OMPI_SUCCESS; - } - - } else { - - /* signal the extra rank that I am done with the recursive - * doubling phase. - */ - my_ctl->flag = my_exchange_node->n_exchanges; - - } - } - - /* set the barrier as complete */ - sm_desc->collective_phase=NB_BARRIER_DONE; - /* return */ - return ret; -} - -/* admin nonblocking barrier - progress function */ -int bcol_basesmuma_rd_nb_barrier_progress_admin( - sm_nbbar_desc_t *sm_desc) - -{ - /* local variables */ - int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange; - int pair_rank, start_index, restart_phase; - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - netpatterns_pair_exchange_node_t *my_exchange_node; - int extra_rank, my_rank; - mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl; - mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl; - int64_t bank_genaration; - int pool_index=sm_desc->pool_index; - bool found; - mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module; - - /* get the pointer to the segment of control structures */ - idx = sm_desc->coll_buff->number_of_buffs+pool_index; - leading_dim = sm_desc->coll_buff->size_of_group; - idx = SM_ARRAY_INDEX(leading_dim,idx,0); - ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) - sm_desc->coll_buff->ctl_buffs+idx; - bank_genaration = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter; - - my_exchange_node=&(bcol_module->recursive_doubling_tree); - my_rank=bcol_module->super.sbgp_partner_module->my_index; - my_ctl=ctl_structs[my_rank]; - - /* check to make sure that this should be progressed */ - if( ( sm_desc->collective_phase == NB_BARRIER_INACTIVE ) || - ( sm_desc->collective_phase == NB_BARRIER_DONE ) ) - { - return OMPI_SUCCESS; - } - - /* set the restart up - and jump to the correct place in the algorithm */ - restart_phase=sm_desc->collective_phase; - if ( NB_PRE_PHASE == restart_phase ) { - start_index=0; - } else if ( NB_RECURSIVE_DOUBLING == restart_phase ) { - start_index=sm_desc->recursive_dbl_iteration; - goto Exchange_phase; - } else { - goto Post_phase; - } - - if(0 < my_exchange_node->n_extra_sources) { - if (EXCHANGE_NODE == my_exchange_node->node_type) { - volatile int64_t *partner_sn; - /* I will participate in the exchange - wait for signal from extra - ** process */ - extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=ctl_structs[extra_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - - /* spin n iterations until partner registers */ - loop_cnt=0; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - found=false; - if( *partner_sn >= bank_genaration ) { - found=true; - break; - } - loop_cnt++; - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_PRE_PHASE; - return OMPI_SUCCESS; - } - - } else { - - /* Nothing to do, already registared that I am here */ - } - } - -Exchange_phase: - - for(exchange = start_index; - exchange < my_exchange_node->n_exchanges; 
exchange++) { - - volatile int64_t *partner_sn; - volatile int *partner_flag; - - /* rank of exchange partner */ - pair_rank = my_rank ^ ( 1 SHIFT_UP exchange ); - partner_ctl=ctl_structs[pair_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - partner_flag=(volatile int *)&(partner_ctl->flag); - - /* signal that I am at iteration exchange of the algorithm */ - my_ctl->flag = exchange; - - /* check to see if the partner has arrived */ - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - if( (*partner_sn > bank_genaration) || - ( (*partner_sn == bank_genaration) && - (*partner_flag >= exchange) ) ) { - found=true; - break; - } - loop_cnt++; - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_RECURSIVE_DOUBLING; - sm_desc->recursive_dbl_iteration=exchange; - return OMPI_SUCCESS; - } - - } - -Post_phase: - if(0 < my_exchange_node->n_extra_sources) { - if ( EXTRA_NODE == my_exchange_node->node_type ) { - volatile int64_t *partner_sn; - volatile int *partner_flag; - - /* I will not participate in the exchange - - * wait for signal from extra partner */ - extra_rank = my_exchange_node->rank_extra_source; - partner_ctl=ctl_structs[extra_rank]; - partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); - partner_flag=(volatile int *)&(partner_ctl->flag); - - /* spin n iterations until partner registers */ - loop_cnt=0; - found=false; - while( loop_cnt < bcol_module->super.n_poll_loops ) - { - if( (*partner_sn > bank_genaration) || - ( *partner_sn == bank_genaration && - *partner_flag == (my_exchange_node->log_2) ) ) { - found=true; - break; - } - loop_cnt++; - } - if( !found ) { - /* set restart parameters */ - sm_desc->collective_phase=NB_POST_PHASE; - return OMPI_SUCCESS; - } - - } else { - - /* signal the extra rank that I am done with the recursive - * doubling phase. 
- */ - my_ctl->flag = my_exchange_node->n_exchanges; - - } - } - - /* set the barrier as complete */ - sm_desc->collective_phase=NB_BARRIER_DONE; - - /* return */ - return ret; -} - -static int bcol_basesmuma_memsync(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - int rc; - int memory_bank = input_args->root; - - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data); - sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); - - sm_desc->coll_buff = buff_block; - /* - printf("XXX SYNC call\n"); - */ - - rc = bcol_basesmuma_rd_nb_barrier_init_admin( - sm_desc); - if (OMPI_SUCCESS != rc) { - return rc; - } - - if (NB_BARRIER_DONE != sm_desc->collective_phase) { - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - opal_list_t *list=&(cs->nb_admin_barriers); - opal_list_item_t *append_item; - - /* put this onto the progression list */ - OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex)); - append_item=(opal_list_item_t *) - &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); - opal_list_append(list,append_item); - OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex)); - /* progress communications so that resources can be freed up */ - return BCOL_FN_STARTED; - } - - /* Done - bump the counter */ - (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++; - /* - printf("XXX SYNC call done \n"); - */ - return BCOL_FN_COMPLETE; -} - -static int bcol_basesmuma_memsync_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - int memory_bank = input_args->root; - - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data); - sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); - - /* I do not have to do anything, since the - progress done by basesmuma progress engine */ - - if (NB_BARRIER_DONE != sm_desc->collective_phase) { - return BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_SYNC; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - bcol_basesmuma_memsync, - bcol_basesmuma_memsync_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c deleted file mode 100644 index 570280d084..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c +++ /dev/null @@ -1,382 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "ompi/op/op.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/bcol/bcol.h" - -#include "opal/include/opal_stdint.h" - -#include "bcol_basesmuma.h" -#include "bcol_basesmuma_reduce.h" -/** - * gvm - Shared memory reduce - */ - -static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_REDUCE; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1048576; - comm_attribs.data_src = DATA_SRC_KNOWN; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; - inv_attribs.datatype_bitmap = 0x11111111; - inv_attribs.op_types_bitmap = 0x11111111; - - - /* Set attributes for fanin fanout algorithm */ - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_reduce_intra_fanin, - bcol_basesmuma_reduce_intra_fanin_progress); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL); - - return OMPI_SUCCESS; -} - -/* - * Small data fanin reduce - * ML buffers are used for both payload and control structures - * This functions works with hierarchical allreduce and - * progress engine - */ -static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node, - int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype, - volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) { - volatile mca_bcol_basesmuma_header_t * child_ctl_pointer; - int bcol_id = (int) bcol_module->super.bcol_id; - int64_t sequence_number = my_ctl_pointer->sequence_number; - int8_t ready_flag = my_ctl_pointer->ready_flag; - int group_size = bcol_module->colls_no_user_data.size_of_group; - - if (LEAF_NODE != my_reduction_node->my_node_type) { - volatile char *child_data_pointer; - volatile void *child_rbuf; - - /* for each child */ - /* my_result_data = child_result_data (op) my_source_data */ - - for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) { - int child_rank = my_reduction_node->children_ranks[child] + process_shift; - - if (group_size <= child_rank){ - child_rank -= group_size; - } - - child_ctl_pointer = data_buffs[child_rank].ctl_struct; - child_data_pointer = data_buffs[child_rank].payload; - - if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) { - *iteration = child; - return BCOL_FN_STARTED; - } - - child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id]; - - ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count, dtype); - } /* end child loop */ - } - - if (ROOT_NODE != my_reduction_node->my_node_type) { - opal_atomic_wmb (); - my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag; - } - - return BCOL_FN_COMPLETE; -} - -static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) 
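/*
 * Progress side of the nonblocking fan-in reduce: it reloads the child
 * index that was cached in ml_mem.nb_coll_desc[buffer_index].iteration
 * and re-enters reduce_children() until every child's contribution has
 * been reduced into rbuf.
 */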
-{ - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - - netpatterns_tree_node_t *my_reduction_node; - int my_rank, my_node_index; - struct ompi_datatype_t *dtype = input_args->dtype; - int leading_dim, idx; - - /* Buffer index */ - int buff_idx = input_args->src_desc->buffer_index; - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - void *data_addr = (void *)input_args->src_desc->data_addr; - volatile void *rbuf; - - /* get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0); - - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; - - /* Get control structure and payload buffer */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - my_node_index = my_rank - input_args->root; - if (0 > my_node_index) { - int group_size = bcol_module->colls_no_user_data.size_of_group; - my_node_index += group_size; - } - - my_reduction_node = bcol_module->reduction_tree + my_node_index; - rbuf = (volatile void *)((uintptr_t) data_addr + input_args->rbuf_offset); - - return reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype, - data_buffs, input_args->count, input_args->op, input_args->root); -} - -int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int rc=BCOL_FN_COMPLETE; - int my_rank,group_size,my_node_index; - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - - netpatterns_tree_node_t *my_reduction_node; - volatile int8_t ready_flag; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile void *sbuf,*rbuf; - int sbuf_offset,rbuf_offset; - int root,count; - int64_t sequence_number=input_args->sequence_num; - struct ompi_datatype_t *dtype; - int leading_dim,idx; - - /* Buffer index */ - int buff_idx = input_args->src_desc->buffer_index; - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char * my_data_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - void *data_addr = (void *)input_args->src_desc->data_addr; - -#if 0 - fprintf(stderr,"777 entering sm reduce \n"); -#endif - - /* get addressing information */ - my_rank=bcol_module->super.sbgp_partner_module->my_index; - group_size=bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - /* fprintf(stderr,"AAA the devil!!\n"); */ - /* Get control structure and payload buffer */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_data_pointer = (volatile char *)data_addr; - - /* Align node index to around sbgp root */ - root = input_args->root; - my_node_index = my_rank - root; - if (0 > my_node_index) { - my_node_index += group_size; - } - - /* get arguments */ - sbuf_offset = input_args->sbuf_offset; - rbuf_offset = input_args->rbuf_offset; - sbuf = (volatile void *)(my_data_pointer + sbuf_offset); - data_buffs[my_rank].payload = (void*)sbuf; - rbuf = (volatile void 
*)(my_data_pointer + rbuf_offset); - count = input_args->count; - dtype = input_args->dtype; - - /* Cache my rbuf_offset */ - my_ctl_pointer->roffsets[bcol_id] = rbuf_offset; - - /* get my node for the reduction tree */ - my_reduction_node=&(bcol_module->reduction_tree[my_node_index]); - - /* init the header */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type); - - /* set starting point for progress loop */ - *iteration = 0; - my_ctl_pointer->ready_flag = ready_flag; - - if (sbuf != rbuf) { - rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf, - (char *)sbuf); - if( 0 != rc ) { - return OMPI_ERROR; - } - } - - rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype, - data_buffs, count, input_args->op, root); - - /* Flag value if other bcols are called */ - my_ctl_pointer->starting_flag_value[bcol_id]++; - - /* Recycle payload buffers */ - - return rc; -} - -/* Small data fanin reduce - * Uses SM buffer (backed by SM file) for both control structures and - * payload - * - * NTH: How does this differ from the new one? Can we replace this - * with a call to the new init then a call the new progress until - * complete? - */ -int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args) -{ - /* local variables */ - int rc=OMPI_SUCCESS; - int my_rank,group_size,process_shift,my_node_index; - int n_children,child; - mca_bcol_basesmuma_module_t* bcol_module = - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - - netpatterns_tree_node_t *my_reduction_node; - volatile int8_t ready_flag; - volatile void *sbuf,*rbuf; - int sbuf_offset,rbuf_offset; - int root,count; - struct ompi_op_t *op; - int64_t sequence_number=input_args->sequence_num; - struct ompi_datatype_t *dtype; - int leading_dim,idx; - int buff_idx; - int child_rank; - int bcol_id = (int) bcol_module->super.bcol_id; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char * my_data_pointer; - volatile char * child_data_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t * child_ctl_pointer; - -#if 0 - fprintf(stderr,"Entering fanin reduce \n"); -#endif - - /* Buffer index */ - buff_idx = input_args->src_desc->buffer_index; - /* get addressing information */ - my_rank=bcol_module->super.sbgp_partner_module->my_index; - group_size=bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx;*/ - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Get control structure and payload buffer */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - my_data_pointer = (volatile char *) data_buffs[my_rank].payload; - - /* Align node index to around sbgp root */ - root = input_args->root; - process_shift = root; - my_node_index = my_rank - root; - if (0 > my_node_index ) { - my_node_index += group_size; - } - - /* get arguments */ - sbuf_offset = input_args->sbuf_offset; - rbuf_offset = input_args->rbuf_offset; - sbuf = (volatile void *)(my_data_pointer + sbuf_offset); - rbuf = (volatile void *)(my_data_pointer + rbuf_offset); - op = input_args->op; - count = input_args->count; - dtype = input_args->dtype; - 
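/*
 * The fan-in proper begins here. Schematically (a restatement of the
 * code below, not new logic):
 *
 *   copy sbuf into rbuf                            (own contribution)
 *   if I am not a leaf:
 *       for each child c of my reduction-tree node:
 *           spin until c's REDUCE_FLAG reaches ready_flag for this
 *           sequence number, calling opal_progress() while waiting
 *           ompi_op_reduce(op, child_rbuf, rbuf, count, dtype)
 *   if I am not the root:
 *       publish my own REDUCE_FLAG so my parent can consume rbuf
 */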
- /* get my node for the reduction tree */ - my_reduction_node=&(bcol_module->reduction_tree[my_node_index]); - n_children=my_reduction_node->n_children; - - /* init the header */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - - input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type); - - rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf, - (char *)sbuf); - if (0 != rc) { - return OMPI_ERROR; - } - - if (LEAF_NODE != my_reduction_node->my_node_type) { - volatile void *child_rbuf; - /* for each child */ - /* my_result_data = child_result_data (op) my_source_data */ - - for (child = 0 ; child < n_children ; ++child) { - child_rank = my_reduction_node->children_ranks[child]; - child_rank += process_shift; - - /* wrap around */ - if( group_size <= child_rank ){ - child_rank-=group_size; - } - - /*child_ctl_pointer = ctl_structs[child_rank];*/ - child_ctl_pointer = data_buffs[child_rank].ctl_struct; - child_data_pointer = data_buffs[child_rank].payload; - - child_rbuf = child_data_pointer + rbuf_offset; - /* wait until the child's data is ready for use */ - while (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) { - opal_progress(); - } - - /* apply collective operation */ - ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count,dtype); - } /* end child loop */ - } - - if (ROOT_NODE != my_reduction_node->my_node_type) { - opal_atomic_wmb (); - my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag; - } - - my_ctl_pointer->starting_flag_value[bcol_id]++; - - return rc; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h deleted file mode 100644 index 3d6f209446..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef __BASESMUMA_REDUCE_H_ - -#define __BASESMUMA_REDUCE_H_ - -#include "ompi_config.h" -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" -#include "ompi/constants.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" -#include "bcol_basesmuma_utils.h" -#include - -enum { - BLOCK_OFFSET = 0, - LOCAL_REDUCE_SEG_OFFSET, - BLOCK_COUNT, - SEG_SIZE, - NOFFSETS -}; - -int compute_knomial_reduce_offsets(int group_index, int count, struct - ompi_datatype_t *dtype,int k_radix,int n_exchanges, - int **offsets); - -int compute_knomial_reduce_offsets_reverse(int group_index, int count, struct - ompi_datatype_t *dtype,int k_radix,int n_exchanges, - int **offsets); - -int bcol_basesmuma_lmsg_reduce_recursivek_scatter_reduce(mca_bcol_basesmuma_module_t *sm_module, - const int buffer_index, void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte, - volatile int8_t ready_flag, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - -int bcol_basesmuma_lmsg_reduce_knomial_gather(mca_bcol_basesmuma_module_t *basesmuma_module, - const int buffer_index, - void *sbuf,void *rbuf, int count, struct - ompi_datatype_t *dtype, - const int my_group_index, - const int padded_start_byte, - volatile int8_t rflag, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - -int bcol_basesmuma_lmsg_reduce_extra_root(mca_bcol_basesmuma_module_t *sm_module, - const int buffer_index, void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte, - volatile
int8_t rflag, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - - - -int bcol_basesmuma_lmsg_reduce_extra_non_root(mca_bcol_basesmuma_module_t *sm_module, - const int buffer_index, void *sbuf, - void *rbuf, - int root, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, - const int relative_group_index, - const int group_size, - const int padded_start_byte, - volatile int8_t rflag, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - -int bcol_basesmuma_lmsg_reduce(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -int bcol_basesmuma_lmsg_reduce_extra(bcol_function_args_t *input_args, - mca_bcol_base_function_t *c_input_args); - -void basesmuma_reduce_recv(int my_group_index, int peer, - void *recv_buffer, - int recv_size, - volatile int8_t ready_flag_val, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - -void basesmuma_reduce_send(int my_group_index, - int peer, - void *send_buffer, - int snd_size, - int send_offset, - volatile int8_t ready_flag_val, - volatile mca_bcol_basesmuma_payload_t *data_buffs); - -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rk_barrier.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rk_barrier.c deleted file mode 100644 index bd8e1ad2d0..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rk_barrier.c +++ /dev/null @@ -1,442 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" - -/* -#define IS_BARRIER_READY(peer, my_flag, my_sequence_number)\ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BARRIER_RKING_FLAG][bcol_id] >= (my_flag) \ - )? 
true : false ) -*/ - -#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \ -do{ \ - for( j = 0; j < (tree_order - 1); j++){ \ - if( 0 > peers[j] ) { \ - /* set the bit */ \ - *active_requests ^= (1<bcol_module; - netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - int leading_dim, buff_idx, idx; - int bcol_id = (int) bcol_module->super.bcol_id; - - int i, j, probe; - int src; - - int pow_k, tree_order; - int max_requests = 0; /* important to initialize this */ - - bool matched; - int64_t sequence_number=input_args->sequence_num; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; -#if 0 - fprintf(stderr,"entering sm barrier sn = %d buff index = %d\n",sequence_number,input_args->buffer_index); -#endif - /* initialize the iteration counter */ - buff_idx = input_args->buffer_index; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* init the header */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - /* initialize these */ - *iteration = 0; - *active_requests = 0; - *status = 0; - - /* k-nomial parameters */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - - /* calculate the maximum number of requests - * at each level each rank communicates with - * at most (k - 1) peers - * so if we set k - 1 bit fields in "max_requests", then - * we have max_request == 2^(k - 1) -1 - */ - for(i = 0; i < (tree_order - 1); i++){ - max_requests ^= (1<node_type ) { - - /* then I will signal to my proxy rank*/ - - my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; - ready_flag = flag_offset + 1 + pow_k + 2; - /* now, poll for completion */ - - src = exchange_node->rank_extra_sources_array[0]; - peer_ctl_pointer = data_buffs[src].ctl_struct; - - for( i = 0; i < cm->num_to_probe ; i++ ) { - if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - goto FINISHED; - } - - } - - /* cache state and bail */ - *iteration = -1; - return BCOL_FN_STARTED; - - }else if ( 0 < exchange_node->n_extra_sources ) { - - /* I am a proxy for someone */ - src = exchange_node->rank_extra_sources_array[0]; - peer_ctl_pointer = data_buffs[src].ctl_struct; - - /* probe for extra rank's arrival */ - for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) { - if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - /* copy it in */ - matched = true; - break; - } - } - - if (!matched) { - *status = ready_flag; - *iteration = -1; - return BCOL_FN_STARTED; - } - } - - /* bump the ready flag */ - ready_flag++; - - /* we start the recursive k - ing phase */ - for( *iteration = 0; *iteration < 
pow_k; (*iteration)++) { - /* announce my arrival */ - my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; - /* calculate the number of active requests */ - CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order); - /* Now post the recv's */ - for( j = 0; j < (tree_order - 1); j++ ) { - - /* recv phase */ - src = exchange_node->rank_exchanges[*iteration][j]; - if( src < 0 ) { - /* then not a valid rank, continue */ - continue; - } - - peer_ctl_pointer = data_buffs[src].ctl_struct; - if( !(*active_requests&(1<num_to_probe ; probe++){ - if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - /* set this request's bit */ - *active_requests ^= (1<flags[BARRIER_RKING_FLAG][bcol_id]; - return BCOL_FN_STARTED; - } - } - - /* bump the flag one more time for the extra rank */ - ready_flag = flag_offset + 1 + pow_k + 2; - - /* finish off the last piece, send the data back to the extra */ - if( 0 < exchange_node->n_extra_sources ) { - /* simply announce my arrival */ - my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; - - } - -FINISHED: - - - my_ctl_pointer->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; -} - - -/* allgather progress function */ - -int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - - - /* local variables */ - int flag_offset; - volatile int8_t ready_flag; - mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module; - netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; - mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); - - int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; - int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; - int *iter = iteration; /* double alias */ - int leading_dim, idx, buff_idx; - - int i, j, probe; - int src; - int max_requests = 0; /* critical to set this */ - int pow_k, tree_order; - int bcol_id = (int) bcol_module->super.bcol_id; - - bool matched; - int64_t sequence_number=input_args->sequence_num; - int my_rank = bcol_module->super.sbgp_partner_module->my_index; - - volatile mca_bcol_basesmuma_payload_t *data_buffs; - - /* control structures */ - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; -#if 0 - fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank, - *active_requests,*iter,*status); -#endif - buff_idx = buffer_index; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; - ready_flag = *status; - /* k-nomial parameters */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - - /* calculate the maximum number of requests - * at each level each rank communicates with - * at most (k - 1) peers - * so if we set k - 1 bit fields in "max_requests", then - * we have max_request == 2^(k - 1) -1 - */ - for(i = 0; i < 
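/* How the cached per-buffer state drives re-entry, judging from the init
 * function above and the progress function below (an illustrative summary,
 * not part of the original file):
 *
 *   init bails out with:                 progress resumes with:
 *     *status    = ready_flag;             ready_flag = *status;
 *     *iteration = -1;  (handshake)        if (-1 == *iteration) redo handshake
 *     *iteration = i;   (k-ing level)      else restart the k-ing loop at *iter
 *     return BCOL_FN_STARTED;              ... returning BCOL_FN_COMPLETE only
 *                                          after all pow_k levels finish
 */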
(tree_order - 1); i++){ - max_requests ^= (1<node_type ) { - - /* If I'm in here, then I must be looking for data */ - ready_flag = flag_offset + 1 + pow_k + 2; - - src = exchange_node->rank_extra_sources_array[0]; - peer_ctl_pointer = data_buffs[src].ctl_struct; - - for( i = 0; i < cm->num_to_probe ; i++ ) { - if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - goto FINISHED; - } - - } - - /* haven't found it, state is cached, bail out */ - return BCOL_FN_STARTED; - - }else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) { - - /* I am a proxy for someone */ - src = exchange_node->rank_extra_sources_array[0]; - peer_ctl_pointer = data_buffs[src].ctl_struct; - - /* probe for extra rank's arrival */ - for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) { - if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - matched = true; - /* bump the flag */ - ready_flag++; - *iteration = 0; - break; - } - } - - if (!matched) { - return BCOL_FN_STARTED; - } - } - - /* start the recursive k - ing phase */ - for( *iter=*iteration; *iter < pow_k; (*iter)++) { - /* I am ready at this level */ - my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; - if( 0 == *active_requests ) { - /* flip some bits, if we don't have active requests from a previous visit */ - CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order); - } - for( j = 0; j < (tree_order - 1); j++ ) { - - /* recv phase */ - src = exchange_node->rank_exchanges[*iter][j]; - if( src < 0 ) { - /* then not a valid rank, continue - */ - continue; - } - - peer_ctl_pointer = data_buffs[src].ctl_struct; - if( !(*active_requests&(1<num_to_probe ; probe++){ - if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){ - /* flip the request's bit */ - *active_requests ^= (1<flags[BARRIER_RKING_FLAG][bcol_id]; - return BCOL_FN_STARTED; - } - } - /* bump the flag one more time for the extra rank */ - ready_flag = flag_offset + 1 + pow_k + 2; - - /* finish off the last piece, send the data back to the extra */ - if( 0 < exchange_node->n_extra_sources ) { - /* simply announce my arrival */ - my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag; - - } - -FINISHED: - - my_ctl_pointer->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; -} - -/* Register k-nomial barrier functions to the BCOL function table, - * so they can be selected - */ -int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super) -{ -mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_BARRIER; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_basesmuma_k_nomial_barrier_init, - bcol_basesmuma_k_nomial_barrier_progress); - - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c deleted file mode 100644 index 435d6a6983..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c +++ /dev/null @@ -1,588 +0,0 @@ -/* -*- Mode: C; 
c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include "mpi.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "opal/mca/mpool/base/base.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/patterns/comm/coll_ops.h" - -#include "opal/class/opal_object.h" -#include "opal/dss/dss.h" - -#include "bcol_basesmuma.h" - -int base_bcol_basesmuma_setup_ctl_struct( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *cs, - sm_buffer_mgmt *ctl_mgmt); - -/* this is the new version, which uses the pml allgather */ -int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, - int leading_dim) -{ - int ret=OMPI_SUCCESS,i; - int count; - int index_in_group; - char *send_buff; - char *recv_buff; - uint64_t rem_mem_offset; - - /* malloc some memory */ - count = sizeof(uint64_t) + sizeof(int); - send_buff = (char *) malloc(count); - recv_buff = (char *) malloc(count * - sm_bcol_module->super.sbgp_partner_module->group_size); - /* exchange the base pointer for the control structures - gather - * everyone else's information. - */ - - - /* pack the offset of the allocated region */ - memcpy((void *) send_buff, (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index), sizeof(int)); - memcpy((void *) (send_buff+ sizeof(int)), (void *) &(mem_offset), sizeof(uint64_t)); - - /* get the offsets from all procs, so we can set up the control data - * structures.
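Since every process maps the backing file at a different virtual address, raw pointers are meaningless across ranks; only file-relative offsets are exchanged, and each receiver rebases them against its own mapping. A sketch of that translation on the receiving side (hypothetical variable names, not the module's code):

    // base address at which *this* process mapped peer i's backing file
    unsigned char *peer_base = ctl_backing_files_info[i]->sm_mmap->map_addr;
    // rem_mem_offset is the uint64_t that peer i contributed to the allgather
    void *peer_ctl = (void *)(peer_base + (uintptr_t) rem_mem_offset);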
- */ - - ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count, - MPI_BYTE, - sm_bcol_module->super.sbgp_partner_module->my_index, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->super.sbgp_partner_module->group_list, - sm_bcol_module->super.sbgp_partner_module->group_comm); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } - - /* get the control stucture offsets within the shared memory - * region and populate the control structures - we do not assume - * any symmetry in memory layout of each process - */ - - /* loop over the procs in the group */ - for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){ - int array_id; - /* get this peer's index in the group */ - memcpy((void *) &index_in_group, (void *) (recv_buff + i*count) , sizeof(int)); - - /* get the offset */ - memcpy((void *) &rem_mem_offset, (void *) (recv_buff + i*count + sizeof(int)), sizeof(uint64_t)); - - array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group); - result_array[array_id]=(void *)(uintptr_t)rem_mem_offset; - - } - -exit_ERROR: - /* clean up */ - if( NULL != send_buff ) { - free(send_buff); - send_buff = NULL; - } - if( NULL != recv_buff ) { - free(recv_buff); - recv_buff = NULL; - } - - return ret; - - -} - -#if 0 -int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, - int leading_dim) -{ - int ret=OMPI_SUCCESS,i,dummy; - int index_in_group, pcnt; - opal_list_t peers; - ompi_namelist_t *peer; - ompi_proc_t *proc_temp, *my_id; - opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t); - opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t); - uint64_t rem_mem_offset; - - /* exchange the base pointer for the controls structures - gather - * every one else's infromation. - */ - /* get list of procs that will participate in the communication */ - OBJ_CONSTRUCT(&peers, opal_list_t); - for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) { - /* get the proc info */ - proc_temp = ompi_comm_peer_lookup( - sm_bcol_module->super.sbgp_partner_module->group_comm, - sm_bcol_module->super.sbgp_partner_module->group_list[i]); - peer = OBJ_NEW(ompi_namelist_t); - peer->name.jobid = proc_temp->proc_name.jobid; - peer->name.vpid = proc_temp->proc_name.vpid; - opal_list_append(&peers,&peer->super); /* this is with the new field called "super" in ompi_namelist_t struct */ - } - /* pack up the data into the allgather send buffer */ - if (NULL == send_buffer || NULL == recv_buffer) { - opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n"); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - /* get my proc information */ - my_id = ompi_proc_local(); - - /* pack my information */ - ret = opal_dss.pack(send_buffer, - &(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32); - - if (OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n"); - goto exit_ERROR; - } - - /* pack the offset of the allocated region */ - ret = opal_dss.pack(send_buffer,&(mem_offset),1,OPAL_UINT64); - if (OMPI_SUCCESS != ret) { - goto exit_ERROR; - } - - /* get the offsets from all procs, so can setup the control data - * structures. 
- */ - if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) { - opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret); - goto exit_ERROR; - } - - /* unpack the dummy */ - pcnt=1; - ret = opal_dss.unpack(recv_buffer,&dummy, &pcnt, OPAL_INT32); - if (OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret); - goto exit_ERROR; - } - - /* get the control stucture offsets within the shared memory - * region and populate the control structures - we do not assume - * any symmetry in memory layout of each process - */ - - /* loop over the procs in the group */ - for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){ - int array_id; - pcnt=1; - ret = opal_dss.unpack(recv_buffer,&index_in_group, &pcnt, OPAL_UINT32); - if (OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret); - goto exit_ERROR; - } - - /* get the offset */ - pcnt=1; - ret = opal_dss.unpack(recv_buffer,&rem_mem_offset, &pcnt, OPAL_UINT64); - if (OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret); - goto exit_ERROR; - } - - array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group); - result_array[array_id]=(void *)rem_mem_offset; - - } - - /* clean up */ - peer=(ompi_namelist_t *)opal_list_remove_first(&peers); - while( NULL !=peer) { - OBJ_RELEASE(peer); - peer=(ompi_namelist_t *)opal_list_remove_first(&peers); - } - OBJ_DESTRUCT(&peers); - if( send_buffer ) { - OBJ_RELEASE(send_buffer); - } - if( recv_buffer ) { - OBJ_RELEASE(recv_buffer); - } - - return ret; - -exit_ERROR: - - /* free peer list */ - peer=(ompi_namelist_t *)opal_list_remove_first(&peers); - while( NULL !=peer) { - OBJ_RELEASE(peer); - peer=(ompi_namelist_t *)opal_list_remove_first(&peers); - } - OBJ_DESTRUCT(&peers); - if( send_buffer ) { - OBJ_RELEASE(send_buffer); - } - if( recv_buffer ) { - OBJ_RELEASE(recv_buffer); - } - return ret; -} -#endif - - -static int base_bcol_basesmuma_exchange_ctl_params( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *cs, - sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk) -{ - int ret=OMPI_SUCCESS,i,loop_limit; - int leading_dim, buf_id; - void *mem_offset; - unsigned char *base_ptr; - mca_bcol_basesmuma_ctl_struct_t *ctl_ptr; - - /* data block base offset in the mapped file */ - mem_offset = (void *)((uintptr_t)data_blk->data - - (uintptr_t)cs->sm_ctl_structs->data_addr); - - /* number of buffers in data block */ - loop_limit=cs->basesmuma_num_mem_banks+ctl_mgmt->number_of_buffs; - leading_dim=ctl_mgmt->size_of_group; - ret=comm_allgather_pml(&mem_offset, ctl_mgmt->ctl_buffs, sizeof(void *), - MPI_BYTE, sm_bcol_module->super.sbgp_partner_module->my_index, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->super.sbgp_partner_module->group_list, - sm_bcol_module->super.sbgp_partner_module->group_comm); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } - -#if 0 - ret=base_bcol_basesmuma_exchange_offsets( sm_bcol_module, - (void **)ctl_mgmt->ctl_buffs, mem_offset, loop_limit, leading_dim); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } -#endif - - /* convert memory offset to virtual address in current rank */ - for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) { - - /* get the base pointer */ - int 
array_id=SM_ARRAY_INDEX(leading_dim,0,i); - if( i == sm_bcol_module->super.sbgp_partner_module->my_index) { - /* me */ - base_ptr=cs->sm_ctl_structs->map_addr; - } else { - base_ptr=sm_bcol_module->ctl_backing_files_info[i]->sm_mmap->map_addr; - } - ctl_mgmt->ctl_buffs[array_id]=(void *) - (uintptr_t)(((uint64_t)(uintptr_t)ctl_mgmt->ctl_buffs[array_id])+(uint64_t)(uintptr_t)base_ptr); - for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) { - int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i); - array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i); - ctl_mgmt->ctl_buffs[array_id]=(void *) (uintptr_t)((uint64_t)(uintptr_t)(ctl_mgmt->ctl_buffs[array_id_m1])+ - (uint64_t)(uintptr_t)sizeof(mca_bcol_basesmuma_ctl_struct_t)); - } - } - /* initialize my control structures */ - for( buf_id = 0 ; buf_id < loop_limit ; buf_id++ ) { - - int my_idx=sm_bcol_module->super.sbgp_partner_module->my_index; - int array_id=SM_ARRAY_INDEX(leading_dim,buf_id,my_idx); - ctl_ptr = (mca_bcol_basesmuma_ctl_struct_t *) - ctl_mgmt->ctl_buffs[array_id]; - - /* initialize the data structures - RLG, this is only one data - * structure that needs to be initialized, more are missing */ - ctl_ptr->sequence_number=-1; - ctl_ptr->flag=-1; - ctl_ptr->index=0; - ctl_ptr->src_ptr = NULL; - } - - return ret; - -exit_ERROR: - - return ret; -} - -static int base_bcol_basesmuma_setup_ctl (mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *cs) -{ - const int my_index = sm_bcol_module->super.sbgp_partner_module->my_index; - bcol_basesmuma_smcm_file_t input_file; - int ret; - - /* exchange remote addressing information if it has not already been done */ - if (NULL == sm_bcol_module->ctl_backing_files_info) { - input_file.file_name=cs->sm_ctl_structs->map_path; - input_file.size=cs->sm_ctl_structs->map_size; - input_file.size_ctl_structure=0; - input_file.data_seg_alignment=BASESMUMA_CACHE_LINE_SIZE; - input_file.mpool_size=cs->sm_ctl_structs->map_size; - ret = bcol_basesmuma_smcm_allgather_connection(sm_bcol_module, - sm_bcol_module->super.sbgp_partner_module, - &(cs->sm_connections_list), - &(sm_bcol_module->ctl_backing_files_info), - sm_bcol_module->super.sbgp_partner_module->group_comm, - input_file, cs->clt_base_fname, - false); - if (OMPI_SUCCESS != ret) { - return ret; - } - } - - /* fill in the pointers to the other ranks' scratch shared memory */ - if (NULL == sm_bcol_module->shared_memory_scratch_space) { - sm_bcol_module->shared_memory_scratch_space = - calloc (sm_bcol_module->super.sbgp_partner_module->group_size, sizeof (void *)); - if (!sm_bcol_module->shared_memory_scratch_space) { - opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for shared_memory_scratch_space."); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for (int i = 0 ; i < sm_bcol_module->super.sbgp_partner_module->group_size ; ++i) { - if (i == my_index) { - /* local file data is not cached in this list */ - continue; - } - - sm_bcol_module->shared_memory_scratch_space[i] = - (void *)((intptr_t) sm_bcol_module->ctl_backing_files_info[i]->sm_mmap + - cs->scratch_offset_from_base_ctl_file); - } - - sm_bcol_module->shared_memory_scratch_space[my_index] = - (void *)((intptr_t) cs->sm_ctl_structs->map_addr + cs->scratch_offset_from_base_ctl_file); - } - - return OMPI_SUCCESS; -} - -int base_bcol_basesmuma_setup_ctl_struct( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *cs, - sm_buffer_mgmt *ctl_mgmt) -{ - int n_ctl, n_levels; - int n_ctl_structs; - size_t malloc_size; - - /* - * set my
no user-data control structures - */ - /* the number of banks and regions per bank are already a power of 2 */ - n_ctl_structs=cs->basesmuma_num_mem_banks* - cs->basesmuma_num_regions_per_bank; - - /* initialize the control structure management struct - - * for collectives without user data - *--------------------------------------------------------------- - */ - - ctl_mgmt->number_of_buffs=n_ctl_structs; - ctl_mgmt->num_mem_banks= - cs->basesmuma_num_mem_banks; - - ctl_mgmt->num_buffs_per_mem_bank= - cs->basesmuma_num_regions_per_bank; - ctl_mgmt->size_of_group= - sm_bcol_module->super.sbgp_partner_module->group_size; - ompi_roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank,&n_levels); - ctl_mgmt->log2_num_buffs_per_mem_bank=n_levels; - - ompi_roundup_to_power_radix(2,n_ctl_structs,&n_levels); - ctl_mgmt->log2_number_of_buffs=n_levels; - ctl_mgmt->mask=n_ctl_structs-1; - sm_bcol_module->super.n_poll_loops=cs->n_poll_loops; - - malloc_size= - (ctl_mgmt->number_of_buffs + - ctl_mgmt->num_mem_banks ) * - ctl_mgmt->size_of_group * - sizeof(void *); - ctl_mgmt->ctl_buffs = malloc(malloc_size); - if (!ctl_mgmt->ctl_buffs) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* - * set up the no-data buffer management data - */ - n_ctl = ctl_mgmt->num_mem_banks; - ctl_mgmt->ctl_buffs_mgmt = (mem_bank_management_t *) calloc (n_ctl, sizeof (mem_bank_management_t)); - if (!ctl_mgmt->ctl_buffs_mgmt) { - opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for ctl_buffs_mgmt"); - free (ctl_mgmt->ctl_buffs); - ctl_mgmt->ctl_buffs = NULL; - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* initialize each individual element */ - for (int i = 0 ; i < n_ctl ; ++i) { - opal_list_item_t *item; - opal_mutex_t *mutex_ptr; - - ctl_mgmt->ctl_buffs_mgmt[i].available_buffers= - ctl_mgmt->num_buffs_per_mem_bank; - ctl_mgmt->ctl_buffs_mgmt[i].number_of_buffers= - ctl_mgmt->num_buffs_per_mem_bank; - mutex_ptr = &(ctl_mgmt->ctl_buffs_mgmt[i].mutex); - OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t); - ctl_mgmt->ctl_buffs_mgmt[i].index_shared_mem_ctl_structs=i; - - item = (opal_list_item_t *)&(ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc); - OBJ_CONSTRUCT(item, opal_list_item_t); - ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.sm_module = - sm_bcol_module; - ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.pool_index = i; - /* get the sm_buffer_mgmt pointer for the control structures */ - ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.coll_buff = ctl_mgmt; - } - - return OMPI_SUCCESS; -} - -/* - * This function initializes the internal scratch buffers and control - * structures that will be used by the module. It also initializes - * the payload buffer management structures.
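Keeping both the bank count and the buffers-per-bank a power of two is what makes the mask and log2 fields above useful: a running buffer counter can be reduced to a slot and a bank with shifts and masks instead of division. A sketch of the arithmetic this enables (hypothetical counter name, not the module's code):

    int n_buffs = num_mem_banks * num_buffs_per_mem_bank;   // power of two
    int mask    = n_buffs - 1;                              // e.g. 8 -> 0x7
    int slot    = (int)(buffer_counter & mask);             // cheap modulo
    int bank    = slot >> log2_num_buffs_per_mem_bank;      // owning bank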
- */ -int base_bcol_basesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *cs) -{ - int ret=OMPI_SUCCESS,i; - int n_ctl_structs; - size_t ctl_segement_size,total_memory; - int max_elements; - unsigned char *data_ptr; - - /* set up the control struct memory */ - if(!cs->sm_ctl_structs) { - ret = mca_bcol_basesmuma_allocate_sm_ctl_memory(cs); - if(OMPI_SUCCESS != ret) { - opal_output (ompi_bcol_base_framework.framework_output, "In bcol_comm_query mca_bcol_basesmuma_allocate_sm_ctl_memory failed\n"); - return ret; - } - /* - * put the memory onto the free list - we have worried about - * alignment in the mpool allocation, and assume that the - * ctl structures have the appropriate size to maintain alignment - */ - - /* figure out the segment size */ - n_ctl_structs=cs->basesmuma_num_mem_banks* - cs->basesmuma_num_regions_per_bank; - - /* add memory for the control structure used for recycling the banks */ - n_ctl_structs+=cs->basesmuma_num_mem_banks; - - ctl_segement_size=n_ctl_structs* - sizeof(mca_bcol_basesmuma_ctl_struct_t); - - total_memory=cs->sm_ctl_structs->map_size - ( - (char *)(cs->sm_ctl_structs->data_addr)- - (char *)(cs->sm_ctl_structs->map_addr)); - total_memory-=cs->my_scratch_shared_memory_size; - max_elements=total_memory/ctl_segement_size; - - /* populate the free list */ - data_ptr=cs->sm_ctl_structs->data_addr; - - for( i=0 ; i < max_elements ; i++ ) { - list_data_t *item = OBJ_NEW(list_data_t); - if( !item ) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - item->data=(void *)data_ptr; - opal_list_append(&(cs->ctl_structures),(opal_list_item_t *)item); - data_ptr+=ctl_segement_size; - } - /* set the scratch memory pointer and offset */ - cs->my_scratch_shared_memory=(char *)data_ptr; - cs->scratch_offset_from_base_ctl_file=(size_t) - ((char *)data_ptr-(char *)cs->sm_ctl_structs->map_addr); - - - /* At this stage the memory is mapped and ready for use by the local rank. - * However, the memory of other processes has not yet been mmapped into the - * memory of this process. - */ - } - - /* initialize no_userdata_ctl */ - sm_bcol_module->no_userdata_ctl=(list_data_t *) - opal_list_remove_last(&(cs->ctl_structures)); - if (!sm_bcol_module->no_userdata_ctl) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* initialize userdata_ctl */ - sm_bcol_module->userdata_ctl = (list_data_t *) - opal_list_remove_last(&(cs->ctl_structures)); - if (!sm_bcol_module->userdata_ctl) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - ret = base_bcol_basesmuma_setup_ctl (sm_bcol_module, cs); - if (OMPI_SUCCESS != ret) { - return ret; - } - - ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_no_user_data)); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_with_user_data)); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* used for the blocking recursive doubling barrier */ - sm_bcol_module->index_blocking_barrier_memory_bank=0; - - /* gather the offsets of the control structs relative to the base - * of the shared memory file, and fill in the table with the - * addresses of all the control structures.
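The free-list population above is plain pointer carving: the data segment left after reserving the scratch area is cut into max_elements equal blocks, each holding n_ctl_structs control structures. A generic sketch of the pattern (free_list_append is a hypothetical helper):

    size_t elem_size = n_ctl_structs * sizeof(mca_bcol_basesmuma_ctl_struct_t);
    size_t max_elems = usable_bytes / elem_size;     // whole blocks only
    unsigned char *p = data_segment_start;
    for (size_t e = 0; e < max_elems; e++, p += elem_size) {
        free_list_append(p);                         // hypothetical list helper
    }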
- */ - ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs, - &(sm_bcol_module->colls_no_user_data),sm_bcol_module->no_userdata_ctl); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs, - &(sm_bcol_module->colls_with_user_data),sm_bcol_module->userdata_ctl); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - return OMPI_SUCCESS; -} - -OBJ_CLASS_INSTANCE(list_data_t, - opal_list_item_t, NULL, NULL); diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c deleted file mode 100644 index e0c23cae62..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c +++ /dev/null @@ -1,460 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#ifdef HAVE_STRINGS_H -#include -#endif - -#include "ompi/proc/proc.h" -#include "ompi/patterns/comm/coll_ops.h" -#include "opal/align.h" - -#include "opal/dss/dss.h" -#include "opal/util/error.h" -#include "opal/util/output.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_hash_table.h" - -#include "bcol_basesmuma.h" - - - -#define SM_BACKING_FILE_NAME_MAX_LEN 256 - -static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length, - size_t addr_offset, size_t alignment, - char *file_name); - -struct file_info_t { - uint32_t vpid; - uint32_t jobid; - uint64_t file_size; - uint64_t size_ctl_structure; - uint64_t data_seg_alignment; - char file_name[SM_BACKING_FILE_NAME_MAX_LEN]; -}; - -/* need to allocate space for the peer */ -static void bcol_basesmuma_smcm_proc_item_t_construct (bcol_basesmuma_smcm_proc_item_t * item) -{ - memset ((char *) item + sizeof (item->item), 0, sizeof (*item) - sizeof (item->item)); -} - -/* need to free the space for the peer */ -static void bcol_basesmuma_smcm_proc_item_t_destruct (bcol_basesmuma_smcm_proc_item_t * item) -{ - if (item->sm_mmap) { - OBJ_RELEASE(item->sm_mmap); - } - - if (item->sm_file.file_name) { - free (item->sm_file.file_name); - item->sm_file.file_name = NULL; - } -} - -OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_proc_item_t, - opal_list_item_t, - bcol_basesmuma_smcm_proc_item_t_construct, - bcol_basesmuma_smcm_proc_item_t_destruct); - -static void bcol_basesmuma_smcm_mmap_construct (bcol_basesmuma_smcm_mmap_t *smcm_mmap) -{ - memset ((char *) smcm_mmap + sizeof (smcm_mmap->super), 0, sizeof (*smcm_mmap) - sizeof (smcm_mmap->super)); -} - -static void bcol_basesmuma_smcm_mmap_destruct (bcol_basesmuma_smcm_mmap_t *smcm_mmap) -{ - if (smcm_mmap->map_seg) { - munmap ((void *)smcm_mmap->map_seg, smcm_mmap->map_size); - smcm_mmap->map_seg = NULL; - } - - if (smcm_mmap->map_path) { - free (smcm_mmap->map_path); - smcm_mmap->map_path = NULL; - } -} - -OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_mmap_t, opal_list_item_t, - bcol_basesmuma_smcm_mmap_construct, - bcol_basesmuma_smcm_mmap_destruct); - - -/* smcm_allgather_connection: - This function is called when 
a shared memory subgroup wants to establish shared memory "connections" among - a group of processes. - - This function DOES NOT create any shared memory backing files, it only mmaps already existing files. Shared - memory files are created by the shared memory registration function - ----------------------------------------------------------------------------------------------------------- - Input params: - - - sbgp module The subgrouping module contains the list of ranks to wire up. - - - peer_list An opal list containing a list of bcol_basesmuma_smcm_proc_item_t types. This - contains a list of peers whose shared memory files I have already mapped. - Upon completion of the allgather exchange with all members of the group and depending on the - value of "map_all", my peers' shared memory files are mapped into my local virtual memory - space, with all pertinent information being stored in an bcol_basesmuma_smcm_proc_item_t which is - subsequently appended onto the "peer_list". - - - comm The ompi_communicator_t communicator. - - - input A data struct that caches the information about my shared memory file. - - - map_all Bool that determines whether or not to go ahead and map the files from all of the peers - defined in the sbgp-ing module. If map_all == true, then go ahead and mmap all of the files - obtained in the exchange and append the information to the "peer_list". If map_all == false - then make a check and only mmap those peers' files whose vpid/jobid/filename combination do - not already exist in the "peer_list". Once mapping is completed, append this peer's information - to the "peer_list". - ----------------------------------------------------------------------------------------------------------- - * - */ - - -int bcol_basesmuma_smcm_allgather_connection( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_sbgp_base_module_t *module, - opal_list_t *peer_list, - bcol_basesmuma_smcm_proc_item_t ***back_files, - ompi_communicator_t *comm, - bcol_basesmuma_smcm_file_t input, - char *base_fname, - bool map_all) -{ - - /* define local variables */ - - int rc, i, fd; - ptrdiff_t mem_offset; - ompi_proc_t *proc_temp, *my_id; - bcol_basesmuma_smcm_proc_item_t *temp; - bcol_basesmuma_smcm_proc_item_t *item_ptr; - bcol_basesmuma_smcm_proc_item_t **backing_files; - struct file_info_t local_file; - struct file_info_t *all_files=NULL; - - /* sanity check */ - if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) { - opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long: %s len :: %d", - input.file_name, (int) strlen(input.file_name)); - return OMPI_ERR_BAD_PARAM; - } - - backing_files = (bcol_basesmuma_smcm_proc_item_t **) - calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *)); - if (!backing_files) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* FIXME *back_files might have been already allocated - * so free it in order to avoid a memory leak */ - if (NULL != *back_files) { - free (*back_files); - } - *back_files = backing_files; - - my_id = ompi_proc_local(); - - /* Phase One: - gather a list of processes that will participate in the allgather - I'm - preparing this list from the sbgp-ing module that was passed into the function */ - - /* fill in local file information */ - local_file.vpid = ((orte_process_name_t*)&my_id->super.proc_name)->vpid; - local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid; - local_file.file_size=input.size; - local_file.size_ctl_structure=input.size_ctl_structure; - 
local_file.data_seg_alignment=input.data_seg_alignment; - - strcpy (local_file.file_name, input.file_name); - - /* we will exchange this data type as a string of characters - - * this routine is first called before MPI_init() completes - * and before error handling is set up, so we can't use the - * MPI data types to send this data */ - all_files = (struct file_info_t *) calloc(module->group_size, - sizeof (struct file_info_t)); - if (!all_files) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* exchange data */ - rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR, - sm_bcol_module->super.sbgp_partner_module->my_index, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->super.sbgp_partner_module->group_list, - sm_bcol_module->super.sbgp_partner_module->group_comm); - if( OMPI_SUCCESS != rc ) { - opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml. Error code: %d", rc); - goto Error; - } - - /* Phase Four: - loop through the receive buffer, unpack the data received from remote peers */ - - for (i = 0; i < module->group_size; i++) { - struct file_info_t *rem_file = all_files + i; - - /* check if this is my index or if the file is already mapped (set above). there - * is no reason to look through the peer list again because no two members of - * the group will have the same vpid/jobid pair. ignore this previously found - * mapping if map_all was requested (NTH: not sure why exactly, since we re-map - * an already mapped file) */ - if (sm_bcol_module->super.sbgp_partner_module->my_index == i) { - continue; - } - - proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]); - - OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) { - /* if the vpid/jobid/filename combination already exists in the list, - then do not map this peer's file --- because you already have */ - if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, - OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name), - &item_ptr->peer) && - 0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) { - ++item_ptr->refcnt; - /* record file data */ - backing_files[i] = item_ptr; - break; - } - } - - if (!map_all && backing_files[i]) { - continue; - } - - temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t); - if (!temp) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - temp->peer.vpid = rem_file->vpid; - temp->peer.jobid = rem_file->jobid; - - temp->sm_file.file_name = strdup (rem_file->file_name); - if (!temp->sm_file.file_name) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - OBJ_RELEASE(temp); - goto Error; - } - - temp->sm_file.size = (size_t) rem_file->file_size; - temp->sm_file.mpool_size = (size_t) rem_file->file_size; - temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure; - temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment; - temp->refcnt = 1; - - /* Phase Five: - If map_all == true, then we map every peer's file; - else we check to see if I have already mapped this - vpid/jobid/filename combination, and if I have, then - I do not mmap this peer's file. - * - */ - fd = open(temp->sm_file.file_name, O_RDWR, 0600); - if (0 > fd) { - opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s.
errno = %d", - temp->sm_file.file_name, errno); - rc = OMPI_ERROR; - goto Error; - } - - /* map the file */ - temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size, - temp->sm_file.size_ctl_structure, - temp->sm_file.data_seg_alignment, - temp->sm_file.file_name); - close (fd); - if (NULL == temp->sm_mmap) { - opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file"); - OBJ_RELEASE(temp); - rc = OMPI_ERROR; - goto Error; - } - - /* compute memory offset */ - mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr - - (ptrdiff_t) temp->sm_mmap->map_seg; - temp->sm_mmap->map_seg->seg_offset = mem_offset; - temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset; - /* more stuff to follow */ - - /* append this peer's info, including shared memory map addr, onto the - peer_list */ - - /* record file data */ - backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp; - - opal_list_append(peer_list, (opal_list_item_t*) temp); - } - - rc = OMPI_SUCCESS; - - Error: - - /* error clean-up and return */ - if (NULL != all_files) { - free(all_files); - } - - return rc; -} - -int bcol_basesmuma_smcm_release_connections (mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_sbgp_base_module_t *sbgp_module, opal_list_t *peer_list, - bcol_basesmuma_smcm_proc_item_t ***back_files) -{ - bcol_basesmuma_smcm_proc_item_t **smcm_procs = *back_files; - - for (int i = 0 ; i < sbgp_module->group_size ; ++i) { - if (smcm_procs[i] && 0 == --smcm_procs[i]->refcnt) { - opal_list_remove_item (peer_list, (opal_list_item_t *) smcm_procs[i]); - OBJ_RELEASE(smcm_procs[i]); - } - } - - free (smcm_procs); - *back_files = NULL; - - return OMPI_SUCCESS; - } - - -/* - * mmap the specified file as a shared file. No information exchange with other - * processes takes place within this routine. - * This function assumes that the memory has already been allocated, and only the - * mmap needs to be done. - */ -bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr, - size_t length, - size_t alignment, - char* file_name) -{ - /* local variables */ - int fd = -1; - bcol_basesmuma_smcm_mmap_t *map = NULL; - int rc; - - /* if pointer is not allocated - return error. We have no clue how the user will allocate or - * free this memory. - */ - - /* open the shared memory backing file */ - - fd = open(file_name, O_CREAT|O_RDWR,0600); - if (fd < 0) { - opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation open failed with errno: %d", - errno); - return NULL; - } - - if (0 != ftruncate(fd,length)) { - opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation ftruncate failed with errno: %d", - errno); - } else { - /* ensure there is enough space for the backing store */ - rc = ftruncate (fd, length); - if (0 > rc) { - opal_output (ompi_bcol_base_framework.framework_output, "failed to truncate the file to be mapped. errno: %d", errno); - close(fd); - return NULL; - } - - map = bcol_basesmuma_smcm_reg_mmap(in_ptr, fd, length, 0, alignment, file_name); - if (NULL == map) { - close(fd); - return NULL; - } - } - /* no longer need this file descriptor. 
close it */ - close (fd); - - /* takes us to the top of the control structure */ - - return map; - -} - -static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length, - size_t addr_offset, size_t alignment, - char *file_name) -{ - - /* local variables */ - bcol_basesmuma_smcm_mmap_t *map; - bcol_basesmuma_smcm_file_header_t *seg; - unsigned char* myaddr = NULL; - int flags = MAP_SHARED; - - /* set up the map object */ - map = OBJ_NEW(bcol_basesmuma_smcm_mmap_t); - if (OPAL_UNLIKELY(NULL == map)) { - return NULL; - } - - /* map the file and initialize the segment state */ - if (NULL != in_ptr) { - flags |= MAP_FIXED; - } - seg = (bcol_basesmuma_smcm_file_header_t *) - mmap(in_ptr, length, PROT_READ|PROT_WRITE, flags, fd, 0); - if((void*)-1 == seg) { - OBJ_RELEASE(map); - return NULL; - } - - map->map_path = strdup (file_name); - - /* the first entry in the file is the control structure. the first entry - in the control structure is an mca_common_sm_file_header_t element */ - map->map_seg = seg; - - myaddr = (unsigned char *) seg + addr_offset; - /* if we have a data segment (i.e. if 0 != data_seg_alignement) */ - - if (alignment) { - myaddr = OPAL_ALIGN_PTR(myaddr, alignment, unsigned char*); - - /* is addr past the end of the file? */ - if ((unsigned char *) seg+length < myaddr) { - opal_output (ompi_bcol_base_framework.framework_output, "mca_bcol_basesmuma_sm_alloc_mmap: memory region too small len %lu add %p", - (unsigned long) length, (void*)myaddr); - OBJ_RELEASE(map); - munmap ((void *)seg, length); - return NULL; - } - - } - - map->data_addr = (unsigned char*) myaddr; - map->map_addr = (unsigned char*) seg; - map->map_size = length; - - return map; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.h deleted file mode 100644 index db0edd6e78..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.h +++ /dev/null @@ -1,105 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BCOL_BASESMUMA_SMCM_H -#define BCOL_BASESMUMA_SMCM_H - -#include -#include - -#include "ompi_config.h" -#include "ompi/proc/proc.h" - -#include "opal/class/opal_object.h" -#include "opal/class/opal_list.h" -#include "opal/sys/atomic.h" - - - -typedef struct bcol_basesmuma_smcm_file_header_t { - /* lock to control atomic access */ - opal_atomic_lock_t seg_lock; - - /* is the segment ready for use */ - volatile int32_t seg_inited; - - /* offset to the next memory location available for allocation */ - size_t seg_offset; - - /* total size of the segment */ - size_t seg_size; -} bcol_basesmuma_smcm_file_header_t; - - -typedef struct bcol_basesmuma_smcm_mmap_t { - /* doubly linked list element */ - opal_list_item_t super; - /* pointer to the header embedded in the shared memory file */ - bcol_basesmuma_smcm_file_header_t *map_seg; - /* base address of the mmap'ed file */ - unsigned char *map_addr; - /* base address of data segment */ - unsigned char *data_addr; - /* How big it is (in bytes) */ - size_t map_size; - /* Filename */ - char *map_path; -} bcol_basesmuma_smcm_mmap_t; - -OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_mmap_t); - - -/* Struct that characterizes a shared memory file */ -struct bcol_basesmuma_smcm_file_t { - - char *file_name; - size_t size; - size_t size_ctl_structure; - size_t data_seg_alignment; - size_t mpool_size; - -}; -typedef struct bcol_basesmuma_smcm_file_t bcol_basesmuma_smcm_file_t; - - -struct bcol_basesmuma_smcm_proc_item_t { - opal_list_item_t item; /* can put me on a free list */ - int refcnt; - ompi_process_name_t peer; - bcol_basesmuma_smcm_file_t sm_file; - bcol_basesmuma_smcm_mmap_t *sm_mmap; /* Pointer to peer's sm file */ - -}; -typedef struct bcol_basesmuma_smcm_proc_item_t bcol_basesmuma_smcm_proc_item_t; - -OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_proc_item_t); - - -/* allocate shared memory file - * in_ptr - pointer to preallocated memory (if NULL, this will be mmapped) - * alignment - region memory alignment - * file_name - fully qualified backing file name -*/ - -OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr, - size_t length, - size_t alignment, - char* file_name); - -OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t* bcol_basesmuma_smcm_create_mmap(int fd, - size_t size, char *file_name, - size_t size_ctl_structure, - size_t data_seg_alignment); - -#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.c deleted file mode 100644 index debe081913..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "bcol_basesmuma_utils.h" - -/* - * Return the closest power of K that is either greater than - * or equal to the group size.
- */ -int pow_sm_k(int k, int number, int *pow_k) -{ - int power = 0; - int n = 1; - - if( 2 == k){ - while(n <= number){ - power++; - n <<= 1; - } - *pow_k = n >> 1; - - } else { - while (n <= number) { - n *= k; - power++; - } - *pow_k = n/k; - } - - - return (power-1); -} - - - -int get_k_nomial_src_list(int group_size, - int radix, int my_index, - int *src_list) { - - /* local variables */ - int radix_power; - int offset; - int kount = 0; - int src_temp; - - radix_power = 1; - offset = 1; - while(offset < group_size) { - if( offset % (radix * radix_power) ) { - src_temp = my_index - offset; - /* wrap around */ - if ( src_temp < 0 ) { - src_temp += group_size; - } - /* don't probe ghost nodes */ - if( src_temp < group_size ) { - src_list[kount] = src_temp; - kount++; - } - offset+=radix_power; - } else { - - radix_power *= radix; - } - - } - /* return the actual number of nodes to poll on */ - return kount; -} - -int get_k_nomial_dst_size(int group_size, int radix, int my_index) -{ - int dst_count = 0; - int radix_mask; - int k; - radix_mask = 1; - while (radix_mask < group_size) { - if (0 != my_index % (radix * radix_mask)) { - /* I found my level in the tree */ - break; - } - radix_mask *= radix; - } - radix_mask /= radix; - - while(radix_mask > 0) { - /* For each level of the tree, do sends */ - for (k = 1; - k < radix && my_index + radix_mask * k < group_size; - ++k) { - dst_count += 1 ; - } - radix_mask /= radix; - } - - return dst_count; -} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.h deleted file mode 100644 index 738c6c62ed..0000000000 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_utils.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_BASESMUMA_UTILS_H -#define MCA_BCOL_BASESMUMA_UTILS_H - -#include "ompi_config.h" - -BEGIN_C_DECLS - -#define BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,radix,relative_index, \ - my_group_index, group_size, ready_flag) \ -do { \ - int k, child; \ - while(radix_mask > 0){ \ - for(k = 1; k < radix && relative_index+radix_mask*k < group_size; k++){ \ - child = my_group_index+radix_mask*k; \ - if (child >= group_size) { \ - child -= group_size; \ - } \ - /*fprintf(stderr,"I am %d sending to child %d\n",my_group_index,child);*/ \ - child_ctl_pointer = data_buffs[child].ctl_struct; \ - child_ctl_pointer->src = my_group_index; \ - /* this can be improved to make better asynchronous progress, but it's - * fine for now. - */ \ - while(child_ctl_pointer->sequence_number != sequence_number ); \ - child_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; \ - } \ - radix_mask = radix_mask/radix; \ - } \ -} while( 0 ) - - - - -/* - * Return the closest power of K that is greater than or equal to "number". - */ -int pow_sm_k(int radix_k, int group_size, int *pow_k_group_size); - -/* - * Get the list of possible sources from which data may arrive, based on a K-nomial tree fan-out.
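Hand-tracing get_k_nomial_src_list() above gives a concrete picture of the fan-in; a usage sketch with worked values (computed by tracing the loop above, array size is illustrative):

    int src_list[8];   // worst case is roughly (radix - 1) * number_of_levels
    int n_src = get_k_nomial_src_list(12, 2, 5, src_list);
    // for group_size = 12, radix = 2, my_index = 5 this yields
    // n_src == 4 and src_list == {4, 3, 1, 9}: only these ranks can be
    // sources for rank 5, so only their control structures need polling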
- */ - -int get_k_nomial_src_list(int group_size, int radix, - int my_index, int *src_list); - - -int get_k_nomial_dst_size(int group_size, int radix, int my_index); - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/basesmuma/owner.txt b/ompi/mca/bcol/basesmuma/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/bcol/basesmuma/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/bcol/bcol.h b/ompi/mca/bcol/bcol.h deleted file mode 100644 index c06f9eb44c..0000000000 --- a/ompi/mca/bcol/bcol.h +++ /dev/null @@ -1,805 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_H -#define MCA_BCOL_H - -#include "ompi_config.h" -#include "opal/class/opal_list.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/op/op.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/patterns/net/netpatterns_knomial_tree.h" - -#include "opal/util/show_help.h" - -#include - -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif - -/* Forward declaration - please do not remove it */ -struct ml_buffers_t; - -struct mca_bcol_base_coll_fn_comm_attributes_t; -struct mca_bcol_base_coll_fn_invoke_attributes_t; -struct mca_bcol_base_coll_fn_desc_t; - -#define NUM_MSG_RANGES 5 -#define MSG_RANGE_INITIAL (1024)*12 -#define MSG_RANGE_INC 10 -#define BCOL_THRESHOLD_UNLIMITED (INT_MAX) -/* Maximum size of a bcol's header. This allows us to correctly calculate the message - * thresholds. If the header of any bcol exceeds this value then increase this one - * to match. 
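The two header constants defined below exist so message thresholds can be computed against the worst-case bcol header; a sketch of the presumed accounting (buffer_size is a hypothetical ML payload buffer size):

    // reserve the largest possible bcol header, rounded up to the
    // header alignment, and treat the remainder as usable payload
    size_t hdr    = (BCOL_HEADER_MAX + BCOL_HEAD_ALIGN - 1) & ~((size_t) BCOL_HEAD_ALIGN - 1);
    size_t usable = buffer_size - hdr;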
*/ -#define BCOL_HEADER_MAX 96 - -#define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */ - -/* - * Functions supported - */ -enum bcol_coll { - /* blocking functions */ - BCOL_ALLGATHER, - BCOL_ALLGATHERV, - BCOL_ALLREDUCE, - BCOL_ALLTOALL, - BCOL_ALLTOALLV, - BCOL_ALLTOALLW, - BCOL_BARRIER, - BCOL_BCAST, - BCOL_EXSCAN, - BCOL_GATHER, - BCOL_GATHERV, - BCOL_REDUCE, - BCOL_REDUCE_SCATTER, - BCOL_SCAN, - BCOL_SCATTER, - BCOL_SCATTERV, - BCOL_FANIN, - BCOL_FANOUT, - - /* nonblocking functions */ - BCOL_IALLGATHER, - BCOL_IALLGATHERV, - BCOL_IALLREDUCE, - BCOL_IALLTOALL, - BCOL_IALLTOALLV, - BCOL_IALLTOALLW, - BCOL_IBARRIER, - BCOL_IBCAST, - BCOL_IEXSCAN, - BCOL_IGATHER, - BCOL_IGATHERV, - BCOL_IREDUCE, - BCOL_IREDUCE_SCATTER, - BCOL_ISCAN, - BCOL_ISCATTER, - BCOL_ISCATTERV, - BCOL_IFANIN, - BCOL_IFANOUT, - - BCOL_SYNC, - /* New function - needed for intermediate steps */ - BCOL_REDUCE_TO_LEADER, - BCOL_NUM_OF_FUNCTIONS -}; -typedef enum bcol_coll bcol_coll; - -typedef enum bcol_elem_type { - BCOL_SINGLE_ELEM_TYPE, - BCOL_MULTI_ELEM_TYPE, - BCOL_NUM_OF_ELEM_TYPES -} bcol_elem_type; - -typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name); -typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num); - -/* - * Collective function status - */ -enum { - BCOL_FN_NOT_STARTED = (OMPI_ERR_MAX - 1), - BCOL_FN_STARTED = (OMPI_ERR_MAX - 2), - BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3) -}; - - - -/** - * Collective component initialization - * - * Initialize the given collective component. This function should - * initialize any component-level. data. It will be called exactly - * once during MPI_INIT. - * - * @note The component framework is not lazily opened, so attempts - * should be made to minimze the amount of memory allocated during - * this function. - * - * @param[in] enable_progress_threads True if the component needs to - * support progress threads - * @param[in] enable_mpi_threads True if the component needs to - * support MPI_THREAD_MULTIPLE - * - * @retval OMPI_SUCCESS Component successfully initialized - * @retval ORTE_ERROR An unspecified error occurred - */ -typedef int (*mca_bcol_base_component_init_query_fn_t) - (bool enable_progress_threads, bool enable_mpi_threads); - -/** - * Query whether a component is available for the given sub-group - * - * Query whether the component is available for the given - * sub-group. If the component is available, an array of pointers should be - * allocated and returned (with refcount at 1). The module will not - * be used for collective operations until module_enable() is called - * on the module, but may be destroyed (via OBJ_RELEASE) either before - * or after module_enable() is called. If the module needs to release - * resources obtained during query(), it should do so in the module - * destructor. - * - * A component may provide NULL to this function to indicate it does - * not wish to run or return an error during module_enable(). - * - * @note The communicator is available for point-to-point - * communication, but other functionality is not available during this - * phase of initialization. - * - * @param[in] sbgp Pointer to sub-group module. - * @param[out] priority Priority setting for component on - * this communicator - * @param[out] num_modules Number of modules that where generated - * for the sub-group module. 
- * - * @returns An array of pointers to initialized module structures if the component can - * provide modules with the requested functionality, or NULL if the - * component should not be used on the given communicator. - */ -typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t) - (mca_sbgp_base_module_t *sbgp, int *num_modules); - - -typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module, - mca_sbgp_base_module_t *sbgp_module); - - - -/* - * Macro for use in modules that are of type bcol v2.0.0 - */ -#define MCA_BCOL_BASE_VERSION_2_0_0 \ - OMPI_MCA_BASE_VERSION_2_1_0("bcol", 2, 0, 0) - - -/* This is really an abstraction violation, but it is the easiest way to get - * started. For memory management we need to know which bcol components - * have compatible memory management schemes. Such compatibility can - * be used to eliminate memory copies between levels in the collective - * operation hierarchy, by having the output buffer of one level be the - * input buffer to the next level - */ - -enum { - BCOL_SHARED_MEMORY_UMA=0, - BCOL_SHARED_MEMORY_SOCKET, - BCOL_POINT_TO_POINT, - BCOL_IB_OFFLOAD, - BCOL_SIZE -}; - -OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE]; -OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE]; - -/* what are the input parameters? too many void * pointers here */ -typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base, - size_t size, void **reg_desc); -/* deregistration function */ -typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc); - -/* Bcol network context definition */ -struct bcol_base_network_context_t { - opal_object_t super; - /* Context id - defined by the upper layer, ML */ - int context_id; - /* Any context information that the bcol wants to use */ - void *context_data; - - /* registration function */ - bcol_register_mem_fn_t register_memory_fn; - /* deregistration function */ - bcol_deregister_mem_fn_t deregister_memory_fn; -}; -typedef struct bcol_base_network_context_t bcol_base_network_context_t; -OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t); - -/* - * primitive function types - */ - -/* bcast */ -enum { - /* small data function */ - BCOL_BCAST_SMALL_DATA, - - /* small data - dynamic decision making supported */ - BCOL_BCAST_SMALL_DATA_DYNAMIC, - - /* number of functions */ - BCOL_NUM_BCAST_FUNCTIONS -}; - - -/** - * BCOL instance.
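A minimal sketch of what a comm_query hook returns under the contract documented above (hypothetical component, not from the original tree): allocate and return the module array, or return NULL to decline the subgroup.

    static struct mca_bcol_base_module_t **
    hypothetical_bcol_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules)
    {
        struct mca_bcol_base_module_t **modules = calloc(1, sizeof(*modules));
        if (NULL == modules) {
            *num_modules = 0;
            return NULL;                          // decline: component unused
        }
        modules[0] = OBJ_NEW(mca_bcol_base_module_t);
        modules[0]->sbgp_partner_module = sbgp;   // remember the subgroup
        *num_modules = 1;
        return modules;                           // module refcount starts at 1
    }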
- */ - -/* no limit on fragment size - this supports using user buffers rather - * than library buffers - */ -#define FRAG_SIZE_NO_LIMIT -1 - -/* forward declaration */ -struct coll_bcol_collective_description_t; - -struct mca_bcol_base_component_2_0_0_t { - - /** Base component description */ - mca_base_component_t bcol_version; - - /** Component initialization function */ - mca_bcol_base_component_init_query_fn_t collm_init_query; - - /** Query whether the component is usable for a given communicator */ - mca_bcol_base_component_comm_query_fn_t collm_comm_query; - - /** If the bcol supports all possible data types */ - mca_bcol_base_module_coll_support_fn_t coll_support; - - /** If the bcol supports all possible data types for a given collective operation */ - mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types; - - /** Use this flag to prevent multiple init_query calls - in case we have the same bcol on more than a single level */ - bool init_done; - - /** If collective calls with bcols of this type need to be ordered */ - bool need_ordering; - - /** MCA parameter: Priority of this component */ - int priority; - - /** Bcast function pointers */ - struct coll_bcol_collective_description_t * - bcast_functions[BCOL_NUM_BCAST_FUNCTIONS]; - - /** Number of network contexts - need this for resource management */ - int n_net_contexts; - - /** List of network contexts */ - bcol_base_network_context_t **network_contexts; - - /* - * Fragmentation support - */ - - /** Minimum fragment size */ - int min_frag_size; - - /** Maximum fragment size */ - int max_frag_size; - - /** Supports direct use of user buffers */ - bool can_use_user_buffers; -}; -typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t; -typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t; -OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t); - -/* forward declarations */ -struct mca_coll_ml_descriptor_t; -struct mca_bcol_base_payload_buffer_desc_t; -struct mca_bcol_base_route_info_t; - -typedef struct { - int order_num; /* Seq num of the collective fragment */ - int bcols_started; /* How many bcols that need ordering have been started */ - int n_fns_need_ordering; /* Number of functions called for bcols that need ordering */ -} mca_bcol_base_order_info_t; - -/* structure that encapsulates information propagated amongst multiple - * fragments, whereby completing the entire ensemble of fragments is - * necessary in order to complete the entire collective - */ -struct bcol_fragment_descriptor_t { - /* start iterator */ - int head; - /* end iterator */ - int tail; - /* current iteration */ - int start_iter; - /* number of full iterations in this frag */ - int num_iter; - /* end iter */ - int end_iter; -}; -typedef struct bcol_fragment_descriptor_t bcol_fragment_descriptor_t; - -struct bcol_function_args_t { - /* full message sequence number */ - int64_t sequence_num; - /* full message descriptor - single copy of fragment invariant - * parameters */ - /* Pasha: We don't need this one for the new flow - remove it */ - struct mca_coll_ml_descriptor_t *full_message_descriptor; - struct mca_bcol_base_route_info_t *root_route; - /* function status */ - int function_status; - /* root, for rooted operations */ - int root; - /* input buffer */ - const void *sbuf; - void *rbuf; - const void *userbuf; - struct mca_bcol_base_payload_buffer_desc_t *src_desc; - struct mca_bcol_base_payload_buffer_desc_t *dst_desc; - /* ml buffer size */ - uint32_t buffer_size; - /* index of the buffer in the ml payload cache */ -
-    int buffer_index;
-    int count;
-    struct ompi_datatype_t *dtype;
-    struct ompi_op_t *op;
-    int sbuf_offset;
-    int rbuf_offset;
-    /* for bcol opaque data */
-    void *bcol_opaque_data;
-    /* An output argument used by the BCOL function to tell ML that the result of the BCOL is in rbuf */
-    bool result_in_rbuf;
-    bool root_flag;       /* True if the rank is the root of the operation */
-    bool need_dt_support; /* will trigger alternate code path for some colls */
-    int status;           /* Used for non-blocking collective completion */
-    uint32_t frag_size;   /* fragment size for large messages */
-    int hier_factor;      /* factor used when bcast is invoked as a service function back down
-                           * the tree, in allgather for example; the pack_len is not the actual
-                           * length of the data needing bcasting
-                           */
-    mca_bcol_base_order_info_t order_info;
-    bcol_fragment_descriptor_t frag_info;
-
-};
-
-struct mca_bcol_base_route_info_t {
-    int level;
-    int rank;
-};
-typedef struct mca_bcol_base_route_info_t mca_bcol_base_route_info_t;
-
-struct mca_bcol_base_lmngr_block_t {
-    opal_list_item_t super;
-    struct mca_coll_ml_lmngr_t *lmngr;
-    void* base_addr;
-};
-typedef struct mca_bcol_base_lmngr_block_t mca_bcol_base_lmngr_block_t;
-OBJ_CLASS_DECLARATION(mca_bcol_base_lmngr_block_t);
-
-struct mca_bcol_base_memory_block_desc_t {
-
-    /* memory block for payload buffers */
-    struct mca_bcol_base_lmngr_block_t *block;
-
-    /* Address offset in bytes -- Indicates free memory in the block */
-    uint64_t block_addr_offset;
-
-    /* size of the memory block */
-    size_t size_block;
-
-    /* number of memory banks */
-    uint32_t num_banks;
-
-    /* number of buffers per bank */
-    uint32_t num_buffers_per_bank;
-
-    /* size of a payload buffer */
-    uint32_t size_buffer;
-
-    /* pointer to the initialized buffer descriptors */
-    struct mca_bcol_base_payload_buffer_desc_t *buffer_descs;
-
-    /* index of the next free buffer in the block */
-    uint64_t next_free_buffer;
-
-    uint32_t *bank_release_counters;
-
-    /* Counter that defines which bank should be synchronized next;
-     * since collectives could complete out of order, we have to make
-     * sure that memory synchronization collectives are started in order! */
-    int memsync_counter;
-
-    /* This array of flags is used to signal that the bank is ready for recycling */
-    bool *ready_for_memsync;
-
-    /* This flag monitors whether the bank is open for use. Usually we expect that
-     * the user will do the check only on buffer-zero allocation */
-    bool *bank_is_busy;
-
-};
-
-/* convenience typedef */
-typedef struct mca_bcol_base_memory_block_desc_t mca_bcol_base_memory_block_desc_t;
-
-typedef void (*mca_bcol_base_release_buff_fn_t)(struct mca_bcol_base_memory_block_desc_t *ml_memblock, uint32_t buff_id);
-
-struct mca_bcol_base_payload_buffer_desc_t {
-    void     *base_data_addr;   /* buffer address */
-    void     *data_addr;        /* buffer address + header offset */
-    uint64_t generation_number; /* my generation */
-    uint64_t bank_index;        /* my bank */
-    uint64_t buffer_index;      /* my buff index */
-};
-/* convenience typedef */
-typedef struct mca_bcol_base_payload_buffer_desc_t mca_bcol_base_payload_buffer_desc_t;
-
-typedef struct bcol_function_args_t bcol_function_args_t;
-
-
-/* The collective operation is defined by a series of collective operations
- * invoked through a function pointer. Each function may be different,
- * so we store the arguments in a struct, pass a pointer to the struct,
- * and use this as a way to hide the different function signatures.
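/* Illustrative sketch (editor's addition): one plausible way a flat buffer id
 * could map onto the (bank, buffer) layout described by
 * mca_bcol_base_memory_block_desc_t above. The row-major layout and the
 * function name are assumptions for illustration, not a statement about the
 * actual ML implementation; the caller is assumed to keep `flat` below
 * num_banks * num_buffers_per_bank. */
static inline mca_bcol_base_payload_buffer_desc_t *
example_desc_for_flat_index(mca_bcol_base_memory_block_desc_t *block,
                            uint64_t flat)
{
    uint64_t bank   = flat / block->num_buffers_per_bank; /* which bank */
    uint64_t buffer = flat % block->num_buffers_per_bank; /* slot in bank */

    return &block->buffer_descs[bank * block->num_buffers_per_bank + buffer];
}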
- *
- * @param[in]  input_args  Structure with function arguments
- * @param[in]  bcol_desc   Component-specific parameters
- * @param[out] status      Return status of the function
- *                         MCA_BCOL_COMPLETE    - function completed
- *                         MCA_BCOL_IN_PROGRESS - function incomplete
- *
- * @retval OMPI_SUCCESS successful completion
- * @retval OMPI_ERROR   function returned error
- */
-/* forward declaration */
-struct mca_bcol_base_module_t;
-
-/* collective function prototype - all functions have the same interface
- * so that we can call them via a function pointer */
-struct mca_bcol_base_function_t;
-typedef int (*mca_bcol_base_module_collective_fn_primitives_t)
-    (bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args);
-
-typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t)
-    (struct mca_bcol_base_module_t *bcol_module);
-
- /**
-  * function to query for collective function attributes
-  *
-  * @param attribute (IN)             the attribute of interest
-  * @param algorithm_parameters (OUT) the value of the attribute for this
-  *                                   function. If this attribute is not supported,
-  *                                   OMPI_ERR_NOT_FOUND is returned.
-  */
- typedef int (*mca_bcol_get_collective_attributes)(int attribute,
-                                                   void *algorithm_parameters);
-
-/* data structure for tracking the relevant data needed for ml level
- * algorithm construction (e.g., function selection), initialization, and
- * usage.
- */
-struct coll_bcol_collective_description_t {
-    /* collective initiation function - first function called */
-    mca_bcol_base_module_collective_fn_primitives_t coll_fn;
-
-    /* collective progress function - called to advance a started collective */
-    mca_bcol_base_module_collective_fn_primitives_t progress_fn;
-
-    /* attribute query function */
-    mca_bcol_get_collective_attributes get_attributes;
-
-    /* attributes supported - bit map */
-    uint64_t attribute;
-
-};
-typedef struct coll_bcol_collective_description_t
-coll_bcol_collective_description_t;
-
-/* collective operation attributes */
-enum {
-    /* supports dynamic decisions - e.g., the collective operation does not
-     * need to be fully defined before it can be started
-     */
-    BCOL_ATTRIBUTE_DYNAMIC,
-
-    /* number of attributes */
-    BCOL_NUM_ATTRIBUTES
-};
-
-/* For rooted collectives,
- * does the algorithm know its data source?
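/* Illustrative sketch (editor's addition): the shape of a collective entry
 * point matching mca_bcol_base_module_collective_fn_primitives_t. The body is
 * a hypothetical no-op, and BCOL_FN_COMPLETE is assumed to be the completion
 * status code from this header; real implementations post work and typically
 * return a started/in-progress status instead. */
static int example_bcast_small(bcol_function_args_t *input_args,
                               struct mca_bcol_base_function_t *const_args)
{
    /* the module that registered this function */
    struct mca_bcol_base_module_t *module = const_args->bcol_module;
    (void) module;

    if (input_args->root_flag) {
        /* the root would push input_args->sbuf toward the group here */
    }

    input_args->result_in_rbuf = false; /* nothing was produced in rbuf */
    return BCOL_FN_COMPLETE;            /* assumed completion code */
}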
- */ -enum { - DATA_SRC_KNOWN=0, - DATA_SRC_UNKNOWN, - DATA_SRC_TYPES -}; - -enum { - BLOCKING, - NON_BLOCKING -}; -/* gvm For selection logic */ -struct mca_bcol_base_coll_fn_comm_attributes_t { - int bcoll_type; - int comm_size_min; - int comm_size_max; - int data_src; - int waiting_semantics; -}; - -typedef struct mca_bcol_base_coll_fn_comm_attributes_t - mca_bcol_base_coll_fn_comm_attributes_t; - -struct mca_bcol_base_coll_fn_invoke_attributes_t { - int bcol_msg_min; - int bcol_msg_max; - uint64_t datatype_bitmap; /* Max is OMPI_DATATYPE_MAX_PREDEFINED defined to be 45 */ - uint32_t op_types_bitmap; /* bit map of optypes supported */ -}; - -typedef struct mca_bcol_base_coll_fn_invoke_attributes_t - mca_bcol_base_coll_fn_invoke_attributes_t; - -struct mca_bcol_base_coll_fn_desc_t { - opal_list_item_t super; - struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr; - struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr; - mca_bcol_base_module_collective_fn_primitives_t coll_fn; - mca_bcol_base_module_collective_fn_primitives_t progress_fn; -}; - -typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t; -OBJ_CLASS_DECLARATION(mca_bcol_base_coll_fn_desc_t); - -/* end selection logic */ - -typedef int (*mca_bcol_base_module_collective_init_fn_t) - (struct mca_bcol_base_module_t *bcol_module, - mca_sbgp_base_module_t *sbgp_module); - - /* per communicator memory initialization function */ -typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers, - mca_bcol_base_component_t *module); - -/* Initialize memory block - ml_memory_block initialization interface function - * - * Invoked at the ml level, used to pass bcol specific registration information - * for the "ml_memory_block" - * - * @param[in] ml_memory_block Pointer to the ml_memory_block. This struct - * contains bcol specific registration information and a call back function - * used for resource recycling. - * - * @param[in] reg_data bcol specific registration data. 
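/* Illustrative sketch (editor's addition): how the selection-logic attribute
 * structures above might be filled for a blocking small-message function.
 * BCOL_BCAST is assumed to be the bcast collective id from this framework's
 * function enum; the numeric limits and bitmaps are invented thresholds. */
static void example_fill_attributes(mca_bcol_base_coll_fn_comm_attributes_t *comm_attr,
                                    mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr)
{
    comm_attr->bcoll_type        = BCOL_BCAST;       /* assumed collective id */
    comm_attr->comm_size_min     = 0;
    comm_attr->comm_size_max     = 1024;
    comm_attr->data_src          = DATA_SRC_KNOWN;
    comm_attr->waiting_semantics = BLOCKING;

    inv_attr->bcol_msg_min    = 0;
    inv_attr->bcol_msg_max    = 1024;          /* bytes, invented threshold */
    inv_attr->datatype_bitmap = 0xffffffffULL; /* all predefined datatypes */
    inv_attr->op_types_bitmap = 0xffffffff;    /* all op types */
}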
- *
- * @returns On Success: OMPI_SUCCESS
- *          On Failure: OMPI_ERROR
- *
- */
-/*typedef int (*mca_bcol_base_init_memory_fn_t)
-    (struct mca_bcol_base_memory_block_desc_t *ml_block, void *reg_data);*/
-
-typedef int (*mca_bcol_base_init_memory_fn_t)
-    (struct mca_bcol_base_memory_block_desc_t *payload_block,
-     uint32_t data_offset,
-     struct mca_bcol_base_module_t *bcol,
-     void *reg_data);
-
-typedef int (*mca_common_allgather_init_fn_t)
-    (struct mca_bcol_base_module_t *bcol_module);
-
-typedef void (*mca_bcol_base_set_thresholds_fn_t)
-    (struct mca_bcol_base_module_t *bcol_module);
-
-enum {
-    MCA_BCOL_BASE_ZERO_COPY = 1,
-    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG = 1 << 1,
-    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER = 1 << 2
-};
-
-/* base module */
-struct mca_bcol_base_module_t {
-    /* base coll component */
-    opal_object_t super;
-
-    /* bcol component (Pasha: Do we really need to cache the component?) */
-    mca_bcol_base_component_t *bcol_component;
-
-    /* network context that is used by this bcol - only one context per bcol is allowed */
-    bcol_base_network_context_t *network_context;
-
-    /* We are going to use the context index a lot;
-       in order to decrease the number of dereferences
-       bcol->network_context->index
-       we cache the value on the bcol */
-    int context_index;
-
-    /* Set of flags that describe features supported by the bcol */
-    uint64_t supported_mode;
-
-    /* per communicator memory initialization function */
-    mca_bcol_module_mem_init init_module;
-
-    /* sub-grouping module partner */
-    mca_sbgp_base_module_t *sbgp_partner_module;
-
-    /* size of subgroup - cache this, so we still have access when
-     * sbgp_partner_module no longer exists */
-    int size_of_subgroup;
-
-    /* sequence number offset - want to make sure that we start
-     * id'ing collectives with id 0, so we can have simple
-     * resource management.
-     */
-    int64_t squence_number_offset;
-
-
-    /* number of times to poll for operation completion before
-     * breaking out of a non-blocking collective operation
-     */
-    int n_poll_loops;
-
-    /* size of header that will go in the data buff; should not include
-     * any info regarding alignment - let the ml level handle this
-     */
-    uint32_t header_size;
-
-
-    /* Each bcol is assigned a unique value;
-     * see if we can get away with a 16-bit id
-     */
-    int16_t bcol_id;
-
-    /* FIXME:
-     * Since mca_bcol_base_module_t is the only parameter which will be passed
-     * into bcol_basesmuma_bcast_init(), add the flag to indicate whether
-     * the hdl-based algorithms will get enabled.
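/* Illustrative sketch (editor's addition): supported_mode above is a bit mask
 * of the MCA_BCOL_BASE_* feature flags just defined; a caller could test it
 * as shown. The function name is hypothetical; bool is assumed to come from
 * ompi_config.h as elsewhere in this header. */
static inline bool
example_module_is_zero_copy(struct mca_bcol_base_module_t *module)
{
    return 0 != (module->supported_mode & MCA_BCOL_BASE_ZERO_COPY);
}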
- */
-    bool use_hdl;
-    /*
-     * Collective function pointers
-     */
-    /* changing function signature - will replace bcol_functions */
-    mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS];
-
-    /* Tables hold pointers to functions */
-    mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS];
-    opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS];
-    struct mca_bcol_base_coll_fn_desc_t*
-        filtered_fns_table[DATA_SRC_TYPES][2][BCOL_NUM_OF_FUNCTIONS][NUM_MSG_RANGES+1][OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED];
-
-    /*
-     * Bcol interface function to pass bcol specific
-     * info and the memory recycling call back
-     */
-    mca_bcol_base_init_memory_fn_t bcol_memory_init;
-
-    /*
-     * netpatterns interface function; we would like to invoke this
-     * on the ml level
-     */
-    mca_common_allgather_init_fn_t k_nomial_tree;
-    /* Each bcol caches a list which describes how many ranks
-     * are "below" each rank in this bcol
-     */
-    int *list_n_connected;
-
-    /* offsets for scatter/gather */
-    int hier_scather_offset;
-
-    /* Small message threshold for each collective */
-    int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];
-
-    /* Set small_message_thresholds array */
-    mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds;
-
-    /* Pointer to the order counter on the upper layer,
-       used if the bcol needs to be ordered */
-    int *next_inorder;
-};
-typedef struct mca_bcol_base_module_t mca_bcol_base_module_t;
-OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_module_t);
-
-/* function description */
-struct mca_bcol_base_function_t {
-    int fn_idx;
-    /* module */
-    struct mca_bcol_base_module_t *bcol_module;
-
-    /*
-     * The following two parameters are used by bcol modules
-     * that want to do some optimizations based on the fact that
-     * n functions from the same bcol module are called in a row.
-     * For example, in the iboffload case, on the first call one
-     * will want to initialize the MWR and start to instantiate
-     * it, but only post it at the end of the last call.
-     * The index of this function in a sequence of consecutive
-     * functions from the same bcol
-     */
-    int index_in_consecutive_same_bcol_calls;
-
-    /* number of times functions from this bcol are
-     * called in order
-     */
-    int n_of_this_type_in_a_row;
-
-    /*
-     * number of times functions from this module are called in the
-     * collective operation.
- */
-    int n_of_this_type_in_collective;
-    int index_of_this_type_in_collective;
-};
-typedef struct mca_bcol_base_function_t mca_bcol_base_function_t;
-
-
-struct mca_bcol_base_descriptor_t {
-    opal_free_list_item_t super;
-/* Vasily: will be described in the future */
-};
-typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t;
-
-static inline __opal_attribute_always_inline__ size_t
-    mca_bcol_base_get_buff_length(ompi_datatype_t *dtype, int count)
-{
-    ptrdiff_t lb, extent;
-    ompi_datatype_get_extent(dtype, &lb, &extent);
-
-    return (size_t) (extent * count);
-}
-
-#define MCA_BCOL_CHECK_ORDER(module, bcol_function_args)      \
-    do {                                                      \
-        if (*((module)->next_inorder) !=                      \
-            (bcol_function_args)->order_info.order_num) {     \
-            return BCOL_FN_NOT_STARTED;                       \
-        }                                                     \
-    } while (0)
-
-#define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info)     \
-    do {                                                      \
-        (order_info)->bcols_started++;                        \
-        if ((order_info)->n_fns_need_ordering ==              \
-            (order_info)->bcols_started) {                    \
-            ++(*((module)->next_inorder));                    \
-        }                                                     \
-    } while (0)
-
-#if defined(c_plusplus) || defined(__cplusplus)
-}
-#endif
-#endif /* MCA_BCOL_H */
diff --git a/ompi/mca/bcol/iboffload/.opal_ignore b/ompi/mca/bcol/iboffload/.opal_ignore
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ompi/mca/bcol/iboffload/Makefile.am b/ompi/mca/bcol/iboffload/Makefile.am
deleted file mode 100644
index 4e9dd0c966..0000000000
--- a/ompi/mca/bcol/iboffload/Makefile.am
+++ /dev/null
@@ -1,66 +0,0 @@
-#
-# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
-# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
-# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
-# $COPYRIGHT$
-#
-# Additional copyrights may follow
-#
-# $HEADER$
-#
-
-AM_CPPFLAGS = $(bcol_iboffload_CPPFLAGS) $(btl_openib_CPPFLAGS)
-
-sources = \
-    bcol_iboffload.h \
-    bcol_iboffload_device.h \
-    bcol_iboffload_module.c \
-    bcol_iboffload_mca.h \
-    bcol_iboffload_mca.c \
-    bcol_iboffload_endpoint.h \
-    bcol_iboffload_endpoint.c \
-    bcol_iboffload_frag.h \
-    bcol_iboffload_frag.c \
-    bcol_iboffload_collfrag.h \
-    bcol_iboffload_collfrag.c \
-    bcol_iboffload_task.h \
-    bcol_iboffload_task.c \
-    bcol_iboffload_component.c \
-    bcol_iboffload_barrier.c \
-    bcol_iboffload_bcast.h \
-    bcol_iboffload_bcast.c \
-    bcol_iboffload_allgather.c \
-    bcol_iboffload_collreq.h \
-    bcol_iboffload_collreq.c \
-    bcol_iboffload_qp_info.c \
-    bcol_iboffload_qp_info.h \
-    bcol_iboffload_fanin.c \
-    bcol_iboffload_fanout.c \
-    bcol_iboffload_allreduce.c
-
-# Make the output library in this directory, and name it either
-# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
-# (for static builds).
-
-component_noinst =
-component_install =
-if MCA_BUILD_ompi_bcol_iboffload_DSO
-component_install += mca_bcol_iboffload.la
-else
-component_noinst += libmca_bcol_iboffload.la
-endif
-
-# See ompi/mca/btl/sm/Makefile.am for an explanation of
-# libmca_common_sm.la.
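/* Illustrative sketch (editor's addition, referring back to the ordering
 * macros defined in bcol.h above, not to this Makefile): the intended call
 * pattern inside a collective entry point of a bcol that requires ordering.
 * The function name is hypothetical; BCOL_FN_STARTED is the started status
 * used elsewhere in this framework. */
static int example_ordered_collective(bcol_function_args_t *args,
                                      struct mca_bcol_base_function_t *const_args)
{
    struct mca_bcol_base_module_t *module = const_args->bcol_module;

    /* bail out (to be retried later) until it is this fragment's turn */
    MCA_BCOL_CHECK_ORDER(module, args);

    /* ... post the actual collective work here ... */

    /* advance the shared order counter once all ordered bcols have started */
    MCA_BCOL_UPDATE_ORDER_COUNTER(module, &args->order_info);
    return BCOL_FN_STARTED;
}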
- -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_bcol_iboffload_la_SOURCES = $(sources) -mca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS) -mca_bcol_iboffload_la_LIBADD = $(btl_openib_LIBS) $(bcol_iboffload_LIBS) \ - $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm/libmca_common_ofacm.la \ - $(OMPI_TOP_BUILDDIR)/ompi/mca/common/verbs/libmca_common_verbs.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_bcol_iboffload_la_SOURCES =$(sources) -libmca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS) diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload.h b/ompi/mca/bcol/iboffload/bcol_iboffload.h deleted file mode 100644 index 38f8ba3a31..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload.h +++ /dev/null @@ -1,765 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_H -#define MCA_BCOL_IBOFFLOAD_H - -#include "ompi_config.h" - -#include -#include - -#include -#include -#include - -#include "ompi/mca/mca.h" - -#include "ompi/op/op.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/datatype/ompi_datatype_internal.h" - -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" - -#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h" - -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" - -#include "ompi/mca/common/ofacm/connect.h" - -#include "bcol_iboffload_qp_info.h" - -BEGIN_C_DECLS - -#define IMM_RDMA 1 -#define INLINE 1 -#define NO_INLINE 0 - -#define MCA_IBOFFLOAD_CALC_SIZE_EXT 8 -#define MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE 8 -#define MCA_IBOFFLOAD_CACHE_LINE_SIZE 128 - -#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA -#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC_SEND -#else -#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC -#endif - - -/* 0 - barrier rdma info - 1 - ML rdma info */ -#define MAX_REMOTE_RDMA_INFO 2 - -/* forward declarations */ -struct mca_bcol_iboffload_module_t; -struct mca_bcol_iboffload_collreq_t; -struct mca_bcol_iboffload_endpoint_t; -struct mca_bcol_iboffload_frag_t; -struct mca_bcol_iboffload_task_t; -struct mca_bcol_iboffload_qp_info_t; -struct mca_bcol_iboffload_collfrag_t; -struct mca_bcol_iboffload_algth_lst_t; -struct mca_bcol_iboffload_device_t; - -typedef int (*mca_bcol_iboffload_coll_algth_fn_t) ( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -struct mca_bcol_iboffload_rdma_info_t { - uint64_t addr; - uint32_t rkey; - uint32_t lkey; -}; -typedef struct mca_bcol_iboffload_rdma_info_t mca_bcol_iboffload_rdma_info_t; - -struct mca_bcol_iboffload_rdma_buffer_desc_t { - void *data_addr; /* buffer address */ - uint64_t generation_number; /* my generation */ - uint64_t bank_index; /* my bank */ - uint64_t buffer_index; /* my buff index */ -}; -typedef struct mca_bcol_iboffload_rdma_buffer_desc_t mca_bcol_iboffload_rdma_buffer_desc_t; - -struct mca_bcol_iboffload_rdma_block_desc_t { - /* number of memory banks */ - uint32_t num_banks; - /* number of buffers per bank */ - uint32_t num_buffers_per_bank; - /* size of a payload buffer */ - uint32_t size_buffer; - /* data offset from ML */ - uint32_t data_offset; - /* pointer to buffer descriptors 
initialized */
-    mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
-};
-typedef struct mca_bcol_iboffload_rdma_block_desc_t mca_bcol_iboffload_rdma_block_desc_t;
-
-/* Information that we need to keep in order to access remote
-   memory. For each remote peer (endpoint) we will keep this
-   structure */
-struct mca_bcol_iboffload_rem_rdma_block_t {
-    /* IB related information first */
-    mca_bcol_iboffload_rdma_info_t ib_info;
-
-    mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
-};
-typedef struct mca_bcol_iboffload_rem_rdma_block_t mca_bcol_iboffload_rem_rdma_block_t;
-
-enum {
-    MCA_BCOL_IBOFFLOAD_BK_COUNTER_INDEX = 0,
-    MCA_BCOL_IBOFFLOAD_BK_SYNC_INDEX,
-    MCA_BCOL_IBOFFLOAD_BK_LAST
-};
-
-/* Information that we need to keep in order to access and
-   track local memory that is used as source and destination
-   for RDMA operations */
-struct mca_bcol_iboffload_local_rdma_block_t {
-    /* sync counter; keeps the id of the next bank to start */
-    int sync_counter;
-    /* Counter for released ml buffers */
-    int *bank_buffer_counter[MCA_BCOL_IBOFFLOAD_BK_LAST];
-    /* IB related information first */
-    struct mca_bcol_iboffload_rdma_info_t ib_info;
-    /* back pointer to original ML memory descriptor */
-    struct mca_bcol_base_memory_block_desc_t *ml_mem_desc;
-    /* Pasha: do we really need this one? */
-    /* caching ml memory descriptor configurations locally */
-    mca_bcol_iboffload_rdma_block_desc_t bdesc;
-};
-typedef struct mca_bcol_iboffload_local_rdma_block_t mca_bcol_iboffload_local_rdma_block_t;
-
-struct mca_bcol_iboffload_recv_wr_manager {
-    opal_mutex_t lock;
-    /** Array of ready-to-use receive work requests.
-      * It is a 2-dimensional array, since for each
-      * qp size we want to keep separate recv wr */
-    struct ibv_recv_wr **recv_work_requests;
-};
-typedef struct mca_bcol_iboffload_recv_wr_manager mca_bcol_iboffload_recv_wr_manager;
-
-/**
- * Structure to hold the iboffload coll component. First it holds the
- * base coll component, and then holds a bunch of
- * component-specific stuff (e.g., current MCA param
- * values).
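/* Illustrative sketch (editor's addition): computing the remote address and
 * rkey for an RDMA operation targeting one buffer of a peer's block, showing
 * how the ib_info and rdma_desc members of mca_bcol_iboffload_rem_rdma_block_t
 * above fit together. The function name is hypothetical, not code from the
 * original file. */
static inline void
example_remote_target(mca_bcol_iboffload_rem_rdma_block_t *rem,
                      uint64_t buffer_index,
                      uint64_t *remote_addr, uint32_t *rkey)
{
    /* per-buffer virtual address advertised by the peer */
    *remote_addr = (uint64_t) (uintptr_t) rem->rdma_desc[buffer_index].data_addr;
    /* one rkey covers the whole remote block */
    *rkey = rem->ib_info.rkey;
}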
- */
-struct mca_bcol_iboffload_component_t {
-    /** Base coll component */
-    mca_bcol_base_component_2_0_0_t super;
-    /** Enable/disable verbose mode */
-    int verbose;
-    int num_qps;
-    /** Whether we want a warning if a non-default GID prefix is not configured
-        on a multiport setup */
-    bool warn_default_gid_prefix;
-    /** Whether we want a warning if the user specifies a non-existent
-        device and/or port via bcol_iboffload_if_[in|ex]clude MCA params */
-    bool warn_nonexistent_if;
-    /** initial size of free lists */
-    int free_list_num;
-    /** maximum size of free lists */
-    int free_list_max;
-    /** number of elements to alloc when growing free lists */
-    int free_list_inc;
-    /** name of ib memory pool */
-    char* mpool_name;
-    /** max outstanding CQE on the CQ */
-    int cq_size;
-    /** Max size of inline data */
-    unsigned int max_inline_data;
-    /** IB partition definition */
-    uint32_t pkey_val;
-    /** Outstanding atomic reads */
-    unsigned int qp_ous_rd_atom;
-    /** IB MTU */
-    int mtu;
-    /** Recv not ready timer */
-    int min_rnr_timer;
-    /** IB timeout */
-    int timeout;
-    /** IB retry count */
-    int retry_count;
-    /** Recv not ready retry count */
-    int rnr_retry;
-    /** IB maximum pending RDMA */
-    int max_rdma_dst_ops;
-    /** IB Service level (QOS) */
-    int service_level;
-    /** Preferred communication buffer alignment in bytes (must be a power of two) */
-    int buffer_alignment;
-    /** Max number of tasks for the MQ */
-    int max_mqe_tasks;
-    /** Max MQ size */
-    int max_mq_size;
-    /** HCA/Port include/exclude lists */
-    char *if_include;
-    char **if_include_list;
-    char *if_exclude;
-    char **if_exclude_list;
-    /** Dummy argv-style list; a copy of names from the
-        if_[in|ex]clude list that we use for error checking (to ensure
-        that they all exist) */
-    char **if_list;
-    /** Array of ibv devices */
-    struct ibv_device **ib_devs;
-    /** devices count */
-    int num_devs;
-    /** MCA param bcol_iboffload_receive_queues */
-    char *receive_queues;
-    /** Common info about all kinds of QPs on each iboffload module */
-    struct mca_bcol_iboffload_qp_info_t qp_infos[MCA_BCOL_IBOFFLOAD_QP_LAST];
-    /** Array of iboffload devices */
-    opal_pointer_array_t devices;
-    /** Free list of collfrag descriptors */
-    ompi_free_list_t collfrags_free;
-    /** Free list of outstanding collective operations */
-    ompi_free_list_t collreqs_free;
-    /** Free list for free task operations */
-    ompi_free_list_t tasks_free;
-    /** Free list for free calc task operations */
-    ompi_free_list_t calc_tasks_free;
-    /** Free list of empty frags that do not keep any
-        registration information */
-    ompi_free_list_t ml_frags_free;
-    /** Recv work request manager */
-    mca_bcol_iboffload_recv_wr_manager recv_wrs;
-    /** We allocate some resources on the component
-     *  with the creation of the first iboffload module
-     *  and set this flag to true */
-    bool init_done;
-    /** Maximal number of fragments of the same collective request that can be sent in parallel */
-    unsigned int max_pipeline_depth;
-    /** array mapping Open MPI reduction operators to MVerbs reduction operators */
-    enum ibv_m_wr_calc_op map_ompi_to_ib_calcs[OMPI_OP_NUM_OF_TYPES];
-    /** array mapping Open MPI data types to MVerbs data types */
-    enum ibv_m_wr_data_type map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
-    /** The order of the exchange tree */
-    int exchange_tree_order;
-    /** Knomial tree order */
-    int knomial_tree_order;
-    /** K-nomial radix */
-    int k_nomial_radix;
-    /** Maximum number of pulls for completion check */
-    int max_progress_pull;
-    /** Barrier function selector */
-    int barrier_mode;
-    /** MCA params for selecting Bruck's alltoall algorithms */
-    int use_brucks_smsg_alltoall_rdma;
-    int use_brucks_smsg_alltoall_sr;
-    /** radix of the small-data alltoall Bruck-like algorithm */
-    int k_alltoall_bruck_radix;
-    /** alltoall small data buffer alignment */
-    int tmp_buf_alignment;
-};
-
-/**
- * Convenience typedef
- */
-typedef struct mca_bcol_iboffload_component_t mca_bcol_iboffload_component_t;
-
-/* List of all algorithms that we use */
-enum {
-    FANIN_ALG,
-    FANOUT_ALG,
-    RECURSIVE_DOUBLING_BARRIER_ALG,
-    RECURSIVE_KNOMIAL_BARRIER_ALG,
-    RECURSIVE_DOUBLING_ALLREDUCE_ALG,
-    RECURSIVE_DOUBLING_REDUCE_ALG,
-    RECURSIVE_DOUBLING_TREE_BCAST,
-    ALL_ENDPOINTS, /* connected to all peers */
-    ALLGATHER_KNOMIAL_ALG,
-    ALLGATHER_NEIGHBOR_ALG,
-    REMOTE_EXCHANGE_ALG,
-    LAST_ALG
-};
-
-struct mca_bcol_iboffload_port_t {
-    int id;             /** Port number on device: 1 or 2 */
-    int stat;           /** Port status - Active, Init, etc. */
-    enum ibv_mtu mtu;   /** MTU on this port */
-    uint64_t subnet_id; /** Subnet id for the port */
-    uint16_t lid;
-    uint16_t lmc;
-};
-typedef struct mca_bcol_iboffload_port_t mca_bcol_iboffload_port_t;
-
-enum {
-    COLL_MQ = 0,
-    SERVICE_MQ,
-    BCOL_IBOFFLOAD_MQ_NUM
-};
-
-struct mca_bcol_iboffload_module_t {
-    /* base structure */
-    mca_bcol_base_module_t super;
-
-    /* size */
-    int group_size;
-    int log_group_size;
-
-    /* size of each memory segment */
-    size_t segment_size;
-
-    /* collective tag */
-    long long collective_tag;
-
-    /* pointer to device */
-    struct mca_bcol_iboffload_device_t *device;
-
-    /* caching port number */
-    uint32_t port;
-
-    /* Connecting iboffload with ibnet module information */
-    /* pointer to sbgp ibnet */
-    mca_sbgp_ibnet_module_t *ibnet;
-
-    /* connection group index for the ibnet */
-    int cgroup_index;
-
-    /* array of endpoints */
-    struct mca_bcol_iboffload_endpoint_t **endpoints;
-
-    /* Size of the endpoints array */
-    int num_endpoints;
-
-    /* caching port subnet id and lid -
-     * the same information we have on the device */
-    uint64_t subnet_id;
-    uint16_t lid;
-
-    /* Pointer to management queue */
-    struct mqe_context *mq[BCOL_IBOFFLOAD_MQ_NUM];
-    int mq_credit[BCOL_IBOFFLOAD_MQ_NUM];
-
-    /* pending list of collfrags */
-    opal_list_t collfrag_pending;
-
-    /* recursive-doubling tree node */
-    netpatterns_pair_exchange_node_t recursive_doubling_tree;
-
-    /* N exchange tree */
-    netpatterns_pair_exchange_node_t n_exchange_tree;
-
-    /* Knomial exchange tree */
-    netpatterns_k_exchange_node_t knomial_exchange_tree;
-
-    /* Knomial allgather tree */
-    netpatterns_k_exchange_node_t knomial_allgather_tree;
-
-    /* This array keeps pre-calculated task consumption per
-     * algorithm
-     */
-    uint32_t alg_task_consump[LAST_ALG];
-
-    /* Pointer to a function that implements a barrier algorithm */
-    mca_bcol_iboffload_coll_algth_fn_t barrier_algth;
-
-    /* Pointer to a function that implements a fanin algorithm */
-    mca_bcol_iboffload_coll_algth_fn_t fanin_algth;
-
-    /* Pointer to a function that implements a fanout algorithm */
-    mca_bcol_iboffload_coll_algth_fn_t fanout_algth;
-
-    /* Pointer to a function that implements an allreduce algorithm */
-    mca_bcol_iboffload_coll_algth_fn_t allreduce_algth;
-
-    /* Pointer to a function that implements a non-blocking memory synchronization algorithm */
-    mca_bcol_iboffload_coll_algth_fn_t memsync_algth;
-
-    /* rdma block memory information */
-    mca_bcol_iboffload_local_rdma_block_t rdma_block;
-
-    /* The largest power of two for which 1 << power_of_2
-       is not larger than the group size */
-
int power_of_2; - - /* The largest power of two number which is not larger than the group size */ - int power_of_2_ranks; - - /* Connection status array */ - bool connection_status[LAST_ALG]; - - /* map from communicator ranks to ibsubnet */ - int *comm_to_ibnet_map; - - /* order preserving value */ - int64_t prev_sequence_num; - - /* Temp iovec to send the data fragments -- alltoall Brucks */ - struct iovec *alltoall_iovec; - struct iovec *alltoall_recv_iovec; - - /* tree radix for the knomial bruck small data alltoall */ - int k_alltoall_bruck_radix; - - /* Temp buffer alignment for knomial bruck small data alltoall */ - int tmp_buf_alignment; - - /* Free task list with sge's array */ - ompi_free_list_t iovec_tasks_free; -}; - -typedef struct mca_bcol_iboffload_module_t mca_bcol_iboffload_module_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_module_t); - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC - extern mca_bcol_iboffload_component_t mca_bcol_iboffload_component; - -static inline int mca_bcol_iboffload_err(const char* fmt, ...) -{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - -#define MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(ompi_op, c_type, l_operand, r_operand, result) \ -do { \ - switch (ompi_op) { \ - case OMPI_OP_MAX: \ - *((c_type *)&result) = ((*(c_type *)&(l_operand) > *(c_type *)&(r_operand)) ? \ - *(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \ - break; \ - case OMPI_OP_MIN: \ - *((c_type *)&result) = ((*(c_type *)&(l_operand) < *(c_type *)&(r_operand)) ? \ - *(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \ - break; \ - case OMPI_OP_SUM: \ - *((c_type *)&result) = (*((c_type *)&(l_operand)) + *((c_type *)&(r_operand))); \ - break; \ - default: \ - break; \ - } \ -} while (0); - -#define MCA_BCOL_IBOFFLOAD_PKEY_MASK 0x7fff -#define MCA_BCOL_IBOFFLOAD_DEFAULT_GID_PREFIX 0xfe80000000000000ll - -#define IBOFFLOAD_ERROR(args) \ - do { \ - mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_iboffload_err args; \ - mca_bcol_iboffload_err("\n"); \ - } while(0) - -#if OPAL_ENABLE_DEBUG -#define IBOFFLOAD_VERBOSE(level, args) \ - do { \ - if (mca_bcol_iboffload_component.verbose >= level) { \ - mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_iboffload_err args; \ - mca_bcol_iboffload_err("\n"); \ - } \ - } while(0) -#else -#define IBOFFLOAD_VERBOSE(level, args) -#endif - -#define MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(coll_req, coll_work_req) \ - do { \ - opal_list_append(&(coll_req)->work_requests, \ - (opal_list_item_t*) (coll_work_req)); \ - (coll_work_req)->coll_full_req = (coll_req); \ - } while(0) -/* Vasily: will be removed soon */ -#define APPEND_TO_TASKLIST(task_ptr_to_set, event, last_event_type) \ - do { \ - *task_ptr_to_set = &(event)->element; \ - last_event_type = &(event)->element; \ - task_ptr_to_set = &((event)->element.next); \ - } while(0) - -#define MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(task_ptr_to_set, task) \ - do { \ - *task_ptr_to_set = (task); \ - task_ptr_to_set = &((task)->next_task); \ - } while(0) - -#define MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(task_ptr_to_set, event) \ - do { \ - *task_ptr_to_set = &(event)->element; \ - task_ptr_to_set = &((event)->element.next); \ - } while(0) - -#define 
BCOL_IS_COMPLETED(req) (((req)->n_frag_mpi_complete == (req)->n_fragments) && \
-                                ((req)->n_fragments > 0))
-
-#define BCOL_AND_NET_ARE_COMPLETED(req) (BCOL_IS_COMPLETED(req) && \
-                                ((req)->n_frag_net_complete == (req)->n_fragments))
-
-/* Pasha: Need to add locks here */
-#define BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(module, mq_index, num_of_credits) \
-    (((module)->mq_credit[mq_index] -= (num_of_credits)) < 0 ? false : true)
-/* Pasha: Need to add locks here */
-#define BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(module, mq_index, num_of_credits) \
-    ((module)->mq_credit[mq_index] += (num_of_credits))
-
-#define BCOL_IBOFFLOAD_IS_FIRST_CALL(args) (0 == (args)->index_in_consecutive_same_bcol_calls)
-
-#define BCOL_IBOFFLOAD_IS_LAST_CALL(args) (((args)->n_of_this_type_in_collective - 1) == \
-                                            (args)->index_of_this_type_in_collective)
-
-#define BCOL_IBOFFLOAD_READY_TO_POST(args) (((args)->n_of_this_type_in_a_row - 1) == \
-                                             (args)->index_in_consecutive_same_bcol_calls)
-/*
- * bcol module functions
- */
-
-int mca_bcol_iboffload_rec_doubling_start_connections(struct mca_bcol_iboffload_module_t *iboffload);
-
-/* RDMA addr exchange with a remote proc */
-int mca_bcol_iboffload_exchange_rem_addr(struct mca_bcol_iboffload_endpoint_t *ep);
-
-/* Progress function */
-int mca_bcol_iboffload_component_progress(void);
-
-/* Register memory */
-int mca_bcol_iboffload_register_mr(void *reg_data, void * base, size_t size,
-                                   mca_mpool_base_registration_t *reg);
-
-/* Deregister memory */
-int mca_bcol_iboffload_deregister_mr(void *reg_data, mca_mpool_base_registration_t *reg);
-
-/*
- * This function is used to create a CQ for this module.
- */
-int mca_bcol_iboffload_adjust_cq(struct mca_bcol_iboffload_device_t *device,
-                                 struct ibv_cq **ib_cq);
-/*
- * Query to see if the component is available for use,
- * and can satisfy the thread and progress requirements
- */
-int mca_bcol_iboffload_init_query(bool enable_progress_threads,
-                                  bool enable_mpi_threads);
-
-
-/* Interface to setup the allgather tree */
-int mca_bcol_iboffload_setup_knomial_tree(mca_bcol_base_module_t *super);
-
-/*
- * Query to see if the module is available for use on
- * the given communicator, and if so, what its priority is.
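/* Illustrative sketch (editor's addition): the intended usage pattern of the
 * MQ credit macros above. BCOL_IBOFFLOAD_MQ_HAVE_CREDITS debits the credits
 * as a side effect, so on failure the same amount has to be returned before
 * the fragment is queued for retry - the same net effect the pending path in
 * the real code must achieve. The function name and credit count are
 * hypothetical. */
static int example_try_post(mca_bcol_iboffload_module_t *module, int mq_credits)
{
    if (false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(module, COLL_MQ, mq_credits)) {
        /* HAVE_CREDITS already subtracted, so give the credits back */
        BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(module, COLL_MQ, mq_credits);
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* caller moves frag to pending */
    }

    /* ... build and post the MQE task list here ... */
    return OMPI_SUCCESS;
}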
- */ -mca_bcol_base_module_t ** -mca_bcol_iboffload_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules); - -int -mca_bcol_iboffload_free_tasks_frags_resources( - struct mca_bcol_iboffload_collfrag_t *collfrag, - ompi_free_list_t *frags_free); - -/** - * Shared memory blocking barrier - */ - -int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t - *const_args); - -int mca_bcol_iboffload_barrier_intra_recursive_doubling_start( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_barrier_intra_recursive_knomial_start( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_barrier_intra_recursive_doubling( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_nb_memory_service_barrier_start( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super); -int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super); -int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super); -int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super); -int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super); - -int mca_bcol_iboffload_new_style_fanin_first_call( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_new_style_fanout_first_call( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request); - -int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int mca_bcol_iboffload_coll_support_all_types(bcol_coll coll_name); -int mca_bcol_iboffload_coll_supported(int op, int dtype, bcol_elem_type elem_type); - -static inline __opal_attribute_always_inline__ int - mca_bcol_iboffload_fls(int num) -{ - int i = 1; - int j = 0; - - if (0 == num) { - return 0; - } - - while (i < num) { - i <<= 1; - j++; - } - - if (i > num) { - j--; - } - - return j; -} - -#define BCOL_IBOFFLOAD_IS_EVEN(num) (!((num) & 1)) -static inline __opal_attribute_always_inline__ int - mca_bcol_iboffload_ffs(int num) -{ - int j = 0; - - if (0 == num) { - return 0; - } - - while (BCOL_IBOFFLOAD_IS_EVEN(num)) { - num >>= 1; - j++; - } - - return j; -} - -#if OPAL_ENABLE_DEBUG - -/* Post task list MQ */ -#define IS_IMM(a) (a & MQE_WR_FLAG_IMM_EXE) -#define IS_SIG(a) (a & MQE_WR_FLAG_SIGNAL) -#define IS_BLK(a) (a & MQE_WR_FLAG_BLOCK) - -int task_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task); -int wait_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task); - -#endif - -/* MQ posting function */ -static inline __opal_attribute_always_inline__ int - mca_bcol_iboffload_post_mqe_tasks( - mca_bcol_iboffload_module_t *iboffload, - struct mqe_task *head_mqe) -{ - int rc; - struct mqe_task *bad_mqe = NULL; - -#if OPAL_ENABLE_DEBUG /* debug code */ - - struct mqe_task *curr_mqe_task = NULL; - int send_count = 0, recv_count = 0, wait_count = 0; - - curr_mqe_task = head_mqe; - IBOFFLOAD_VERBOSE(10, ("Processing MQE Head with addr %p \n", - (uintptr_t) (void*) curr_mqe_task)); - - while (NULL != curr_mqe_task) { - switch(curr_mqe_task->opcode) { - case MQE_WR_SEND: - IBOFFLOAD_VERBOSE(10, 
("Posting task %p id 0x%x: send on QP 0x%x\n" - "rank %d, sg_entry: addr %p LEN %d lkey %u, flag[%d-%d-%d]\n", - (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id, - curr_mqe_task->post.qp->qp_num, - task_to_rank(iboffload, curr_mqe_task), - curr_mqe_task->post.send_wr->sg_list->addr, - curr_mqe_task->post.send_wr->sg_list->length, - curr_mqe_task->post.send_wr->sg_list->lkey, - IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags))); - - ++send_count; - break; - case MQE_WR_RECV: - IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: recv on QP 0x%x rank %d flag[%d-%d-%d]\n", - (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id, - curr_mqe_task->post.qp->qp_num, task_to_rank(iboffload, curr_mqe_task), - IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags))); - - ++recv_count; - break; - case MQE_WR_CQE_WAIT: - - IBOFFLOAD_VERBOSE(10, ("Posting task %p id %x: wait on CQ %p for rank %d num of waits %d flag[%d-%d-%d]\n", - (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id, - (void*) curr_mqe_task->wait.cq, wait_to_rank(iboffload, curr_mqe_task), - curr_mqe_task->wait.count, - IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags))); - - wait_count += curr_mqe_task->wait.count; - break; - default: - IBOFFLOAD_ERROR(("Fatal error, unknow packet type %d\n", - curr_mqe_task->opcode)); - return OMPI_ERROR; - } - - /* pointer to next task */ - curr_mqe_task = curr_mqe_task->next; - } - - IBOFFLOAD_VERBOSE(10, ("wait[%d] send[%d] recv[%d]\n", - wait_count, send_count, recv_count)); -#endif - - IBOFFLOAD_VERBOSE(10, ("Posting MQ %p \n", (uintptr_t) head_mqe->wr_id)); - - rc = mqe_post_task(iboffload->mq[0], head_mqe, &bad_mqe); - if (OPAL_UNLIKELY(0 != rc)) { - IBOFFLOAD_ERROR(("ibv_post_mqe failed, errno says: %s," - " the return code is [%d]\n", - strerror(errno), rc)); - - return OMPI_ERROR; - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ - int lognum(int n) { - int count = 1, lognum = 0; - - while (count < n) { - count = count << 1; - lognum++; - } - - return lognum; -} - -END_C_DECLS - -#endif /* MCA_BCOL_IBOFFLOAD_H */ - diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c b/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c deleted file mode 100644 index 28140e5bb7..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c +++ /dev/null @@ -1,1388 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include -#include "opal_stdint.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_alltoall.h" -#include "bcol_iboffload_bcast.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -#include "opal/include/opal/types.h" - -static int mca_bcol_iboffload_allgather_init( - bcol_function_args_t *fn_arguments, - mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t **coll_request, - bool if_bcol_last, int mq_credits, - collective_message_progress_function progress_fn) -{ - int rc; - - ompi_free_list_item_t *item; - mca_bcol_iboffload_collfrag_t *coll_fragment; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Wait for free list failed.\n")); - return rc; - } - /* setup call request */ - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - - (*coll_request)->n_fragments = 0; - (*coll_request)->n_frags_sent = 0; - (*coll_request)->n_frag_mpi_complete = 0; - (*coll_request)->n_frag_net_complete = 0; - (*coll_request)->if_bcol_last = if_bcol_last; - (*coll_request)->ml_buffer_index = fn_arguments->buffer_index; - (*coll_request)->completion_cb_fn = NULL; - (*coll_request)->buffer_info[SBUF].buf = (void *) ( - (unsigned char *)fn_arguments->sbuf + - fn_arguments->sbuf_offset); - (*coll_request)->buffer_info[RBUF].buf = (void *) ( - (unsigned char *)fn_arguments->rbuf + - fn_arguments->rbuf_offset); - (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset; - (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset; - /* seems like we should initialize the memory registration pointer to NULL here */ - (*coll_request)->buffer_info[SBUF].iboffload_reg = NULL; - (*coll_request)->buffer_info[RBUF].iboffload_reg = NULL; - (*coll_request)->dtype = fn_arguments->dtype; - (*coll_request)->count = fn_arguments->count; - (*coll_request)->module = iboffload_module; - /* TODO Pasha: we need it for pending quque. Set it later. */ - (*coll_request)->progress_fn = progress_fn; - /* TODO Pasha: fix it later */ - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; - - (*coll_request)->order_info = &fn_arguments->order_info; - - coll_fragment = &((*coll_request)->first_collfrag); - mca_bcol_iboffload_collfrag_init(coll_fragment); - - /** Vasily ????? */ - /* mq_credits = (*coll_request)->total_tasks_num; */ - coll_fragment->mq_credits = mq_credits; - coll_fragment->mq_index = COLL_MQ; - /* pasha: just set it to zero */ - coll_fragment->last_wait_num = 0; - coll_fragment->alg = -2; /* used only for debug */ - /* - if (my_rank == algthm_ptr->root) { - coll_fragment->last_wait_num = 0; - } else { - coll_fragment->last_wait_num = algth_lst->last_wait_num; - } - */ - /* Pasha: we have nothing to unpack */ - coll_fragment->unpack_size = 0; - /* coll_fragment->unpack_size = pack_len; */ - /* coll_fragment->alg = RECURSIVE_DOUBLING_TREE_BCAST; */ - - /* set pointers for (coll frag) <-> (coll full request) */ - (*coll_request)->user_handle_freed = false; - - fn_arguments->bcol_opaque_data = (void *) (*coll_request); - /* We don't have root.. 
- if (true == fn_arguments->root_flag) { - (*coll_request)->root = my_group_index; - } else { - (*coll_request)->root = fn_arguments->root_route->rank; - } - */ - - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS((*coll_request), coll_fragment); - return OMPI_SUCCESS; -} - -#if 1 -static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) -{ - int i, j; - /*Seems that we don't require this*/ - netpatterns_k_exchange_node_t *exchange_node = &iboffload->knomial_allgather_tree; - - mca_bcol_iboffload_endpoint_t *ep; - - IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); -#if 0 - fprintf(stderr,"Entering Open Connections\n"); -#endif - - /* start with extras and proxy connections */ - if(exchange_node->n_extra_sources > 0) { - /* connect to endpoint */ - /*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_extra_sources_array[0]]];*/ - ep = iboffload->endpoints[exchange_node->rank_extra_sources_array[0]]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - /* now move through the recursive k-ing exchanges */ - if(NULL != exchange_node->rank_exchanges) { - for( i = 0; i < exchange_node->log_tree_order; i++) { - for( j = 0; j < ( exchange_node->tree_order - 1 ); j++) { - if( exchange_node->rank_exchanges[i][j] < 0 ){ - continue; - } - /* connect to endpoint */ - /*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_exchanges[i][j]]];*/ - ep = iboffload->endpoints[exchange_node->rank_exchanges[i][j]]; - if (iboffload->ibnet->super.my_index < ep->index) { - while(0 == (ep)->remote_zero_rdma_addr.addr) { - opal_progress(); - } - } else { - IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - } - } - } - - /* set the connection status to connected */ - iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true; -} -#endif - - -static inline void bcol_iboffload_setup_allgather_ring_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) -{ - int i; - const int group_size = iboffload->ibnet->super.group_size; - mca_bcol_iboffload_endpoint_t *ep; - - IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); - - /* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be - * - */ - - /* I'm going to leave this alone for now, because I'm - * not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids? 
- */ - for (i = 0; i < group_size; i++) { - ep = iboffload->endpoints[i]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - /* set the connection status to connected */ - - /*JSL - change this macro */ - iboffload->connection_status[ALLGATHER_NEIGHBOR_ALG] = true; -} - -#if 0 -/* allgather neighbor exchange algorithm N/2 communication steps, 2 connections */ -static int mca_bcol_iboffload_neighbor_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc, - src, dst; - - uint32_t pack_len; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - int group_size = iboffload_module->group_size; - int step, roffset, soffset; - int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from; - int even_rank; - int parity; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - -#if 0 - fprintf(stderr,"entering large msg neighbor exchange allgather\n"); -#endif - IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather")); - if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_NEIGHBOR_ALG])) { - IBOFFLOAD_VERBOSE(10,("Allgather open new connection ")); - bcol_iboffload_setup_allgather_ring_endpoints_connection(iboffload_module); - } - - pack_len = coll_request->count * coll_request->dtype->super.size; - IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", - pack_len, - coll_request->count, - coll_request->dtype->super.size)); - - /* register send and receive sides */ - /* send side, only sending pack_len data */ - - /* I think that probably I will only register the rbuf */ - /* on receive side I need to register pack_len*group_size data */ - rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size, - &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Cannot register memory: " - "addr - %p, %d bytes.\n", - coll_request->buffer_info[RBUF].buf, pack_len)); - return OMPI_ERROR; - } - coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey; - - /* it is estimated mq consumption... 
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - - - /* start the neighbor exchange */ - - even_rank = !(my_group_index % 2); - if (even_rank) { - neighbor[0] = (my_group_index + 1) % group_size; - neighbor[1] = (my_group_index - 1 + group_size) % group_size; - recv_data_from[0] = my_group_index; - recv_data_from[1] = my_group_index; - offset_at_step[0] = (+2); - offset_at_step[1] = (-2); - } else { - neighbor[0] = (my_group_index - 1 + group_size) % group_size; - neighbor[1] = (my_group_index + 1) % group_size; - recv_data_from[0] = neighbor[0]; - recv_data_from[1] = neighbor[0]; - offset_at_step[0] = (-2); - offset_at_step[1] = (+2); - } - - /* first step is special step, only send one block */ - roffset = neighbor[0]*pack_len; - soffset = my_group_index*pack_len; - /* send receive this */ - - dst = neighbor[0]; - src = neighbor[0]; - - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - /* send the data */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_recv_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, RBUF, - coll_request->buffer_info[RBUF].offset + - soffset/* offset calc */ , - pack_len, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_send_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, - coll_request->buffer_info[RBUF].offset + - roffset, - pack_len, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - /* now for the actual neighbor exchange algorithm */ - - - /* determine initial send location */ - if(even_rank) { - send_data_from = my_group_index; - }else { - send_data_from = recv_data_from[0]; - } - for( step = 1; step < (group_size/2); step++) { - - parity = step % 2; - recv_data_from[parity] = - (recv_data_from[parity] + offset_at_step[parity] + group_size) % group_size; - src = neighbor[parity]; - dst = src; - - roffset = recv_data_from[parity] * pack_len; - soffset = send_data_from * pack_len; - - /* post send rtr and recev rtr together */ - if( 1 == step ){ - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, 
iboffload_module, coll_fragment); - /* send the data */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_recv_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - } - - - /* I'm using the hierarchy offset used in the k-nomial allgather */ - /* this won't work...*/ - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, RBUF, - coll_request->buffer_info[RBUF].offset + - soffset/* offset calc */ , - 2 * pack_len, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_send_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, - coll_request->buffer_info[RBUF].offset + - roffset, - 2 * pack_len, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - send_data_from = recv_data_from[parity]; - - } - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - (coll_request)->n_fragments = 1; - (coll_request)->n_frags_sent = 1; - - assert(NULL != last_wait); - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index); - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} -#endif - -#if 0 -/* debug connection routine */ -static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) -{ - int i; - const int group_size = iboffload->ibnet->super.group_size; - mca_bcol_iboffload_endpoint_t *ep; - - IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); - - /* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be - * - */ - - /* I'm going to leave this alone for now, because I'm - * not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids? 
- */ - for (i = 0; i < group_size; i++) { - ep = iboffload->endpoints[i]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - /* set the connection status to connected */ - - /*JSL - change this macro */ - iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true; -} -#endif - -static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc, - src, dst, comm_dst, comm_src; - int tree_order, pow_k, i, j; - - uint32_t pack_len; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - int group_size = iboffload_module->group_size; - int *group_list = iboffload_module->super.sbgp_partner_module->group_list; - int my_comm_index = group_list[my_group_index]; - - netpatterns_k_exchange_node_t *exchange_node = &iboffload_module->knomial_allgather_tree; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - -#if 0 - fprintf(stderr,"entering large msg allgather\n"); -#endif - IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather")); - if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) { - IBOFFLOAD_VERBOSE(10,("Allgather open new connection ")); - bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module); - } - - pack_len = coll_request->count * coll_request->dtype->super.size; - IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", - pack_len, - coll_request->count, - coll_request->dtype->super.size)); - - /* register send and receive sides */ - /* send side, only sending pack_len data */ - - /* I think that probably I will only register the rbuf */ - /* on receive side I need to register pack_len*group_size data */ - - rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size, - &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Cannot register memory: " - "addr - %p, %d bytes.\n", - coll_request->buffer_info[RBUF].buf, pack_len)); - return OMPI_ERROR; - } - coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey; - - /* it is estimated mq consumption... 
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - - /* start with the extra / proxy phase */ - if( EXTRA_NODE == exchange_node->node_type ) { - - - /* send pack_len data to proxy */ - comm_dst = exchange_node->rank_extra_sources_array[0]; - /* get ib subnet id */ - dst = comm_dst; /* comm_to_ibnet[comm_dst];*/ - /* post ready-to-receive receive on sender's side */ - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - - /* send the data */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_recv_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, RBUF, coll_request->buffer_info[RBUF].offset + my_comm_index*pack_len, - pack_len, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_send_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - /* post the receive */ - comm_src = comm_dst; - src = dst; - /* Sending this results in a race condition where if the rtr send bypasses - the large msg receive on proxy's side, then it triggers the start of the - recurssive k-ing phase prematurely causing random data corruption. - */ - /* - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - */ - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, - RBUF, coll_request->buffer_info[RBUF].offset, - pack_len*group_size, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - goto FINISHED; - - - } else if( 0 < exchange_node->n_extra_sources ) { - - /* am a proxy, receive pack_len data from extra */ - comm_src = exchange_node->rank_extra_sources_array[0]; - /* get ib subnet */ - src = comm_src; /*comm_to_ibnet[comm_src];*/ - - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, - RBUF, coll_request->buffer_info[RBUF].offset + pack_len*comm_src, - pack_len, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - } - - /* start recursive k - ing */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - for( i = 0; i < pow_k; i++) { - - - /* Post ready-to-recv 
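The commented-out send_rtr_setup above records a genuine ordering hazard: if the extra rank's RTR overtakes the proxy's posted large-message receive, the proxy's recursive k-ing phase starts prematurely and reads stale data. The surviving protocol therefore always arms the receive before releasing the sender. A schematic of that ordering, with hypothetical helpers in place of the iboffload task builders (the real code expresses the same constraint as MQE task dependencies):

#include <stddef.h>

/* Hypothetical helpers standing in for the iboffload task builders. */
extern void post_large_recv(void *buf, size_t len, int peer);
extern void send_rtr(int peer);
extern void wait_rtr(int peer);
extern void send_large(const void *buf, size_t len, int peer);

static void receiver_side(void *rbuf, size_t len, int peer)
{
    post_large_recv(rbuf, len, peer);  /* arm the receive first       */
    send_rtr(peer);                    /* ...only then release sender */
}

static void sender_side(const void *sbuf, size_t len, int peer)
{
    wait_rtr(peer);                    /* never send before the RTR   */
    send_large(sbuf, len, peer);
}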
messages - I am here */ - for( j = 0; j <( tree_order - 1); j++) { - comm_src = exchange_node->rank_exchanges[i][j]; - if( comm_src < 0 ){ - continue; - } - /* get ib subnet */ - src = comm_src; /*comm_to_ibnet[comm_src];*/ - - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - } - - /* Post receive ready-to-recev message - I can send to you */ - for( j = 0; j < (tree_order - 1); j++) { - /* recev ready-to-receive message */ - comm_dst = exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. Make a check: - */ - if( comm_dst < 0 ){ - continue; - } - - /* get ib subnet id */ - dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ - /* post ready-to-receive receive on sender's side */ - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - /* send the data */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_recv_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - } - - - /* (k-1) sends */ - for( j = 0; j < (tree_order - 1); j++ ) { - - /* send phase - */ - comm_dst = exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. Make a check - */ - if( comm_dst < 0 ){ - continue; - } - - /* get ib subnet id */ - dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, RBUF, - coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].s_offset/* offset calc */ , - exchange_node->payload_info[i][j].s_len*pack_len, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_send_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - } - - /* we post receives after all sends in order to achieve concurrent - * sends as well as assuring blocking until completely receiving - * all data at level k before starting level k+1 sends - */ - /* (k-1) receives - these are blocking */ - for( j = 0; j < (tree_order - 1); j++) { - /*recv phase */ - comm_src = exchange_node->rank_exchanges[i][j]; - if( comm_src < 0 ){ - continue; - } - /* get ib subnet */ - src = comm_src; /*comm_to_ibnet[comm_src];*/ - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, - coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].r_offset, - exchange_node->payload_info[i][j].r_len*pack_len, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - - - } - - - } - - /* last step, just send it back to the extra if I have one */ - if( 0 < exchange_node->n_extra_sources ) { - - comm_dst = exchange_node->rank_extra_sources_array[0]; - - /* get ib subnet id */ - dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ - /* - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - 
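Each pass of the recursive k-ing loop above talks to the k-1 peers whose base-k digit at the current level differs from this rank's, over pow_k = log_k(group size) levels. The ranks come precomputed in exchange_node->rank_exchanges; the sketch below derives the same schedule from first principles for a power-of-k group (non-powers of k are handled by the extra/proxy phase):

#include <stdio.h>

/* Partners of `rank` at a given level of a radix-k exchange over k^m
 * ranks: the k-1 ranks whose base-k digit at that level differs.    */
static void level_partners(int rank, int k, int level, int *partners)
{
    int stride = 1;
    for (int i = 0; i < level; i++) {
        stride *= k;                       /* k^level                */
    }
    int digit = (rank / stride) % k;       /* my digit at this level */
    int base  = rank - digit * stride;     /* rank with digit zeroed */
    int idx   = 0;
    for (int d = 0; d < k; d++) {
        if (d != digit) {
            partners[idx++] = base + d * stride;
        }
    }
}

int main(void)
{
    int partners[2];                       /* k - 1 entries, k == 3  */
    level_partners(4, 3, 1, partners);     /* rank 4 of 9, level 1   */
    printf("%d %d\n", partners[0], partners[1]);  /* prints: 1 7     */
    return 0;
}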
- // send the data - we are already guaranteed that extra rank is waiting - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_recv_rtr_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - */ - - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, RBUF, coll_request->buffer_info[RBUF].offset, - pack_len*group_size, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - "mca_bcol_iboffload_send_large_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - } - -FINISHED: - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - (coll_request)->n_fragments = 1; - (coll_request)->n_frags_sent = 1; - - assert(NULL != last_wait); - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index); - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc, - src, dst, comm_dst, comm_src, i, j; - int tree_order, pow_k, knt; - uint32_t pack_len; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - int group_size = iboffload_module->group_size; - netpatterns_k_exchange_node_t *exchange_node = - &iboffload_module->knomial_allgather_tree; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - int *list_connected = iboffload_module->super.list_n_connected; - - /* test test */ - int buff_offset = iboffload_module->super.hier_scather_offset; - - IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast")); - - - if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) { - IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); - bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module); - } - - pack_len = coll_request->count * coll_request->dtype->super.size; - IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", - pack_len, - coll_request->count, - coll_request->dtype->super.size)); - - /* now we calculate the actual buff_offset */ - buff_offset = buff_offset*pack_len; - - /* it is estimated mq consumption... 
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - /* we put this in to propagate the lkey into this local data structure */ - coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; - /* end hack */ - if( EXTRA_NODE == exchange_node->node_type ) { - /* setup the rdma "send" pack_len data to proxy rank */ - comm_dst = exchange_node->rank_extra_sources_array[0]; - /* get ib subnet id */ - dst = comm_dst; - /* now I need to calculate my own offset info */ - knt = 0; - for( i = 0; i < my_group_index; i++){ - knt += list_connected[i]; - } - - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len*list_connected[my_group_index], pack_len*knt /* source offset */, - pack_len*knt /* destination offset */, dst, - iboffload_module, coll_fragment); -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */, - pack_len*group_list[my_group_index] /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif - /* old flow with ml offset */ -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */, - coll_request->buffer_info[RBUF].offset + pack_len*group_list[my_group_index] /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_small_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - /* send is done */ - - /* setup the rdma "receive" from proxy */ - comm_src = comm_dst; - src = dst; - /* more general is the number connected */ - knt = 0; - for( i = 0; i < group_size; i++) { - knt += list_connected[i]; - } - - - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len*knt, src, - iboffload_module, coll_fragment); - - /* - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len*group_size, src, - iboffload_module, coll_fragment); - */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - goto FINISHED; - } else if( 0 < exchange_node->n_extra_sources ) { - - /* am a proxy, receive pack_len data from extra */ - comm_src = exchange_node->rank_extra_sources_array[0]; - /* get ib subnet */ - src = comm_src; - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len*list_connected[src], src, - iboffload_module, coll_fragment); - /* - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len, src, - iboffload_module, coll_fragment); - */ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - - } - - /* start recursive k - ing */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - /*fprintf(stderr,"tree order %d pow_k %d\n",tree_order,pow_k);*/ - for( i = 0; i < pow_k; i++) { - for( j = 0; j < (tree_order - 1); j++ ) { - /* send phase - */ - comm_dst = 
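Because subgroup members can front different numbers of connected processes, a rank's slot in the ML buffer is the prefix sum of list_n_connected rather than rank * pack_len; that is what the knt accumulations above compute. In sketch form:

#include <stddef.h>

/* Byte offset of my_index when rank i contributes list_connected[i]
 * blocks of pack_len bytes; mirrors the knt accumulations above.    */
static size_t my_byte_offset(const int *list_connected, int my_index,
                             size_t pack_len)
{
    size_t blocks = 0;
    for (int i = 0; i < my_index; i++) {
        blocks += (size_t) list_connected[i];
    }
    return blocks * pack_len;
}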
exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. Make a check - */ - /*fprintf(stderr,"AAA my index %d comm_dst %d\n",my_group_index,comm_dst);*/ - if( comm_dst < 0 ){ - continue; - } - - /* get ib subnet id */ - /* again, don't think we need this */ - /*dst = ibnet_map[comm_dst];*/ - dst = comm_dst; - /* - fprintf(stderr,"BBB my index %d dst %d pack len %d s_len %d src offset %d r_len %d \n",my_group_index,dst, - pack_len,exchange_node->payload_info[i][j].s_len,exchange_node->payload_info[i][j].s_offset, - exchange_node->payload_info[i][j].r_len); - */ - /* rdma "send" setup */ - - - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, exchange_node->payload_info[i][j].s_len * pack_len, - exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, - exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, - iboffload_module, coll_fragment); - -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, exchange_node->payload_info[i][j].s_len * pack_len, - exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* source offset */, - exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif - -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, exchange_node->payload_info[i][j].s_len * pack_len, - exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, - exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, exchange_node->payload_info[i][j].s_len * pack_len, - coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, - coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_small_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - /* send is done */ - } - - for( j = 0; j < (tree_order - 1); j++) { - - /* rdma "recv" phase */ - comm_src = exchange_node->rank_exchanges[i][j]; - /* remember, if we have extra ranks, then we won't participate - * with a least one peer. 
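The small-message path pushes data straight into the mirrored offset of the peer's ML buffer, so no RTR handshake is needed; the immediate field generates a completion on the target that doubles as the doorbell. Roughly, the verb underneath mca_bcol_iboffload_rdma_write_imm_small_buff_setup() looks like this (queue pair, keys and the remote address are assumed to come from the module's registered rdma_block):

#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

static int post_rdma_write_imm(struct ibv_qp *qp,
                               void *src, uint32_t len, uint32_t lkey,
                               uint64_t remote_addr, uint32_t rkey)
{
    struct ibv_sge sge = {
        .addr   = (uint64_t) (uintptr_t) src,
        .length = len,
        .lkey   = lkey,
    };
    struct ibv_send_wr wr, *bad_wr = NULL;

    memset(&wr, 0, sizeof(wr));
    wr.opcode              = IBV_WR_RDMA_WRITE_WITH_IMM; /* data + doorbell */
    wr.sg_list             = &sge;
    wr.num_sge             = 1;
    wr.send_flags          = IBV_SEND_SIGNALED;
    wr.imm_data            = 0;            /* surfaces in the peer's CQE */
    wr.wr.rdma.remote_addr = remote_addr;  /* mirrored ML buffer offset  */
    wr.wr.rdma.rkey        = rkey;

    return ibv_post_send(qp, &wr, &bad_wr);
}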
Make a check - */ - if( comm_src < 0 ){ - continue; - } - - /* get ib subnet id */ - /* shouldn't need this */ - src = comm_src; - - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - exchange_node->payload_info[i][j].r_len * pack_len, src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - } - return OMPI_ERROR; - } - - } - } - - /* last step, proxies send full data back to the extra ranks */ - if( 0 < exchange_node->n_extra_sources ) { - /* send pack_len data to proxy */ - comm_dst = exchange_node->rank_extra_sources_array[0]; - /* get ibnet id */ - dst = comm_dst; - - knt = 0; - for( i = 0; i < group_size; i++){ - knt += list_connected[i]; - } - - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len*knt, 0 /* source offset */, - 0 /* destination offset */, dst, - iboffload_module, coll_fragment); -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len*group_size, 0 /* source offset */, - 0 /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif -#if 0 - rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( - &last_send, pack_len*group_size, coll_request->buffer_info[RBUF].offset /* source offset */, - coll_request->buffer_info[SBUF].offset /* destination offset */, dst, - iboffload_module, coll_fragment); -#endif - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_small_buff_setup")); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ - goto out_of_resources; - fprintf(stderr,"I'm out of resources \n"); - } - return OMPI_ERROR; - } - /* send is done */ - - } - -FINISHED: - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - (coll_request)->n_fragments = 1; - (coll_request)->n_frags_sent = 1; - - assert(NULL != last_wait); - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - assert(MCA_COLL_ML_NO_BUFFER != coll_request->ml_buffer_index); - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? 
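All of the builders in this file funnel transient resource exhaustion, whether MQ credits, fragments or tasks, to out_of_resources, which tears down the partial fragment and parks it on a pending list. Note the return convention: a successful move to pending is reported upward as BCOL_FN_STARTED, since the fragment will be resumed from progress, and only a failed move yields BCOL_FN_NOT_STARTED. A compact sketch, with stand-ins for the real helpers:

/* Retry convention sketch; the callbacks stand in for the fragment
 * builder and mca_bcol_iboffload_free_resources_and_move_to_pending(). */
enum { FN_STARTED, FN_NOT_STARTED };

static int start_or_defer(int have_credits,
                          int (*build_and_post)(void),
                          int (*move_to_pending)(void))
{
    if (!have_credits || 0 != build_and_post()) {
        /* Queued for retry counts as started; only a failed move to
         * the pending list is a real refusal.                        */
        return (0 != move_to_pending()) ? FN_NOT_STARTED : FN_STARTED;
    }
    return FN_STARTED;
}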
BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -#if 0 -static int mca_bcol_iboffload_neighbor_allgather_userbuffer_intra( - bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *)const_args->bcol_module; - - int rc; - int mq_credits = iboffload_module->group_size * 2 * 2; /* large message protocol consumes - * twice as many mq credits - */ - - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_neighbor_allgather_userbuffer_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc)); - return rc; -} -#endif - -#if 1 -static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *)const_args->bcol_module; - - int rc; - int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)* - iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 * 2; /* large message protocol - * consumes twice as much - */ - - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc)); - return rc; -} -#endif - -static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *)const_args->bcol_module; - - int rc; - - /* I'll add one for everyone, since nobody wants to feel left out */ - int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)* - iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 ; - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_intra was started [%d]\n", rc)); - return rc; -} - - -/* these progress engines are shared between alltoall and allgather and exist in both files, - * should be moved to a common .h file - */ -static int mca_bcol_iboffload_collreq_mlbuffer_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int i; - 
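The mq_credits expressions above size the credit check from the schedule itself: up to (tree_order - 1) exchanges per level across log_tree_order levels, plus one slot for the proxy step, doubled for the send/wait pair of each exchange, and doubled again when the large-message RTR handshake is in play. As arithmetic:

/* Credit sizing that mirrors the expressions above.                 */
static int knomial_mq_credits(int tree_order, int log_tree_order,
                              int large_message)
{
    int tasks   = (tree_order - 1) * log_tree_order + 1; /* + proxy  */
    int credits = tasks * 2;           /* send + wait per exchange   */

    if (large_message) {
        credits *= 2;                  /* RTR handshake doubles it   */
    }
    return credits;
}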
mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - IBOFFLOAD_VERBOSE(10, ("Run progress (ml buffer).\n")); - for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { - if (BCOL_IS_COMPLETED(coll_request)) { - - coll_request->user_handle_freed = true; - - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - IBOFFLOAD_VERBOSE(10, ("Collective finished (ml buffer).\n")); - - return BCOL_FN_COMPLETE; - } - } - IBOFFLOAD_VERBOSE(10, ("Collective not finished (ml buffer).\n")); - return BCOL_FN_STARTED; -} - - -static int mca_bcol_iboffload_collreq_userbuffer_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int i; - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - - IBOFFLOAD_VERBOSE(10, ("Run progress (user buffer)\n")); - - /* Complete the allgather - progress releases full request descriptors */ - - for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { - if (coll_request->n_frag_mpi_complete == coll_request->n_fragments && - coll_request->n_frag_net_complete == coll_request->n_fragments) { - - IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n")); - - if (NULL != coll_request->buffer_info[SBUF].iboffload_reg) { - coll_request->module->device->mpool->mpool_deregister( - coll_request->module->device->mpool, - (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg); - coll_request->buffer_info[SBUF].iboffload_reg = NULL; - } - - - if (NULL != coll_request->buffer_info[RBUF].iboffload_reg) { - coll_request->module->device->mpool->mpool_deregister( - coll_request->module->device->mpool, - (mca_mpool_base_registration_t *) coll_request->buffer_info[RBUF].iboffload_reg); - coll_request->buffer_info[RBUF].iboffload_reg = NULL; - } - - RELEASE_COLLREQ(coll_request); - IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); - return BCOL_FN_COMPLETE; - } - } - - IBOFFLOAD_VERBOSE(10, ("Collective finished (user buffer).\n")); - - /* We are not done */ - return BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_allgather_register(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register iboffload Allgather.\n")); - comm_attribs.bcoll_type = BCOL_ALLGATHER; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra, - mca_bcol_iboffload_collreq_mlbuffer_progress); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - - /* zero-copy k-nomial algorithm */ -#if 1 - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra, - mca_bcol_iboffload_collreq_userbuffer_progress); -#endif - /* zero-copy neighbor exchange algorithm */ -#if 0 - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_neighbor_allgather_userbuffer_intra, - 
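Both progress routines above share one shape: poll at most max_progress_pull times, return BCOL_FN_COMPLETE once the request's completion predicate holds (the user-buffer variant first deregistering both buffer registrations), otherwise report BCOL_FN_STARTED so the caller polls again later. Distilled, with done() and release() standing in for BCOL_IS_COMPLETED and the deregister-and-release epilogue:

/* Bounded-poll progress, as in both routines above.                 */
enum { FN_STARTED, FN_COMPLETE };

static int progress_poll(int max_progress_pull, void *req,
                         int (*done)(void *), void (*release)(void *))
{
    for (int i = 0; i < max_progress_pull; i++) {
        if (done(req)) {
            release(req);    /* deregister buffers, free the request */
            return FN_COMPLETE;
        }
    }
    return FN_STARTED;       /* still in flight; poll again later    */
}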
mca_bcol_iboffload_collreq_userbuffer_progress); -#endif - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c b/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c deleted file mode 100644 index 406442ff7c..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c +++ /dev/null @@ -1,1418 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include -#include "opal_stdint.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -#include "opal/include/opal/types.h" - -static int mca_bcol_iboffload_calc_res_to_user(void *callback_data) -{ - int rc; - uint64_t result = 0; - - uint64_t l_operand = 0; - uint64_t r_operand = 0; - - mca_bcol_iboffload_collfrag_t *coll_frag = - (mca_bcol_iboffload_collfrag_t *) callback_data; - - mca_bcol_iboffload_collreq_t *coll_request = coll_frag->coll_full_req; - - ompi_op_t *op = coll_request->op; - ompi_datatype_t *dtype = coll_request->dtype; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - struct ibv_context *ib_dev_context = coll_request->module->device->dev.ib_dev_context; - - IBOFFLOAD_VERBOSE(10, ("Start calculating.\n")); - - rc = unpack_data_from_calc(ib_dev_context, - cm->map_ompi_to_ib_calcs[op->op_type], - cm->map_ompi_to_ib_dt[dtype->id], false, - (void *) (uintptr_t) coll_request->l_operand, - NULL, (void *) &l_operand); - if (0 != rc) { - IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc for l_operand failed: op %s, type %s\n", - op->o_name, dtype->name)); - return OMPI_ERROR; - } - - rc = unpack_data_from_calc(ib_dev_context, - cm->map_ompi_to_ib_calcs[op->op_type], - cm->map_ompi_to_ib_dt[dtype->id], false, - (void *) (uintptr_t) coll_request->r_operand, - NULL, (void *) &r_operand); - if (0 != rc) { - IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc for r_operand failed: op %s, type %s\n", - op->o_name, dtype->name)); - return OMPI_ERROR; - } - - switch (op->op_type) { - case OMPI_OP_PROD: - break; /* ronni todo - ????? 
*/ - case OMPI_OP_LAND: - result = l_operand && r_operand; - break; - case OMPI_OP_BAND: - result = l_operand & r_operand; - break; - case OMPI_OP_LOR: - result = l_operand || r_operand; - break; - case OMPI_OP_BOR: - result = l_operand | r_operand; - break; - case OMPI_OP_LXOR: - result = ((l_operand && !r_operand) || (!l_operand && r_operand)); - break; - case OMPI_OP_BXOR: - result = l_operand ^ r_operand; - break; - case OMPI_OP_MAXLOC: - case OMPI_OP_MINLOC: - break; - case OMPI_OP_MAX: - case OMPI_OP_MIN: - case OMPI_OP_SUM: - switch (cm->map_ompi_to_ib_dt[dtype->id]) { - case IBV_M_DATA_TYPE_INT8: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, char, l_operand, r_operand, result); - break; - case IBV_M_DATA_TYPE_INT16: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int16_t, l_operand, r_operand, result); - break; - case IBV_M_DATA_TYPE_INT32: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int32_t, l_operand, r_operand, result); - break; - case IBV_M_DATA_TYPE_INT64: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int64_t, l_operand, r_operand, result); - break; - case IBV_M_DATA_TYPE_FLOAT32: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, float, l_operand, r_operand, result); - break; - case IBV_M_DATA_TYPE_FLOAT64: - MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, double, l_operand, r_operand, result); - break; - default: - IBOFFLOAD_VERBOSE(10, ("Unsupported data type: %s.\n", dtype->name)); - return OMPI_ERROR; - } - - break; - - default: - IBOFFLOAD_VERBOSE(10, ("Unsupported op: %s.\n", coll_request->op->o_name)); - return OMPI_ERROR; - } - - memcpy(coll_request->buffer_info[RBUF].buf, &result, coll_frag->unpack_size); - IBOFFLOAD_VERBOSE(10, ("The output data after calc is %lf, result %lf, l_operand %lf, r_operand %lf: " - "sbuf addr %p, rbuf addr %p.\n", - *(double *) coll_request->buffer_info[RBUF].buf, *(double *) &result, - *(double *) &l_operand, *(double *) &r_operand, - coll_request->buffer_info[SBUF].buf, - coll_request->buffer_info[RBUF].buf)); - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_unpack_res_to_user(void *callback_data) -{ - int rc; - - mca_bcol_iboffload_collfrag_t *coll_frag = - (mca_bcol_iboffload_collfrag_t *) callback_data; - - mca_bcol_iboffload_collreq_t *coll_request = coll_frag->coll_full_req; - mca_bcol_iboffload_task_t *task = (mca_bcol_iboffload_task_t *) coll_frag->signal_task_wr_id; - - mca_bcol_iboffload_frag_t *recv_frag = task->frag; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - struct ibv_context *ib_dev_context = coll_request->module->device->dev.ib_dev_context; - - rc = unpack_data_from_calc(ib_dev_context, - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - cm->map_ompi_to_ib_dt[coll_request->dtype->id], - false, (void*) (uintptr_t) recv_frag->sg_entry.addr, - NULL, coll_request->buffer_info[RBUF].buf); - if (0 != rc) { - IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc is failed: op %s, type %s\n", - coll_request->op->o_name, coll_request->dtype->name)); - return OMPI_ERROR; - } - - IBOFFLOAD_VERBOSE(10, ("The naitive output data is %" PRId64 ".\n" - "The output data is %" PRId64 ".\n", - *(uint64_t *) recv_frag->sg_entry.addr, - *(uint64_t *) coll_request->buffer_info[RBUF].buf)); - - return OMPI_SUCCESS; -} - -static int -allreduce_extra_node(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request) -/* (EXTRA_NODE == my_exchange_node->node_type) */ -{ - /* 
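The switch above has to reinterpret raw 64-bit operand images at the correct width before applying MAX, MIN or SUM, which is what the per-type MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC expansions do. A self-contained sketch of the same width dispatch, shown for SUM only since the macro body itself is not part of this hunk:

#include <stdint.h>
#include <string.h>

/* Reinterpret the 64-bit operand images at the right type, combine,
 * and store the result back into a 64-bit image.                    */
#define DO_SUM(type, l, r, out)                 \
    do {                                        \
        type lv_, rv_, res_;                    \
        memcpy(&lv_, &(l), sizeof(type));       \
        memcpy(&rv_, &(r), sizeof(type));       \
        res_ = lv_ + rv_;                       \
        memcpy(&(out), &res_, sizeof(type));    \
    } while (0)

static uint64_t sum64(uint64_t l, uint64_t r, int is_float64)
{
    uint64_t result = 0;
    if (is_float64) {
        DO_SUM(double, l, r, result);
    } else {
        DO_SUM(int64_t, l, r, result);
    }
    return result;
}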
local variables */ - int rc, extra_rank; - - mca_bcol_iboffload_frag_t *send_fragment, - *preposted_recv_frag; - - mca_bcol_iboffload_task_t *send_task, - *wait_task; - - struct mqe_task *last_wait, /* we need ask from completion on last wait */ - *last_send; - - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* I will NOT participate in the exchange - so just "register" as here */ - extra_rank = my_exchange_node->rank_extra_source; - - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - extra_rank, coll_request->qp_index, - MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE, 0, - SBUF, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); - - if (OPAL_UNLIKELY(NULL == send_fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* send my operand to EXCHANGE NODE */ - send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank, - coll_request->qp_index, send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, extra_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for final result from EXCHANGE NODE */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - *mqe_ptr_to_set = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - /* Pasha: need to set to true in upper layer */ - coll_request->user_handle_freed = false; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Allreduce: adding collfrag to collfrag_pending.\n")); - return 
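Stripped of task plumbing, an EXTRA node's whole allreduce is the two tasks built above: ship the local operand to the proxy, then wait for the fully reduced result to come back. In outline, with hypothetical helpers in place of the send/wait task builders:

#include <stddef.h>

/* Hypothetical helpers in place of the send/wait task builders.     */
extern int send_to(int peer, const void *buf, size_t len);
extern int wait_from(int peer, void *buf, size_t len);

static int allreduce_extra(int proxy, const void *my_operand,
                           void *result, size_t len)
{
    if (0 != send_to(proxy, my_operand, len)) {  /* operand out       */
        return -1;
    }
    return wait_from(proxy, result, len);        /* reduced result in */
}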
mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -/** - * Start allreduce - */ -static int do_exchange(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request, - struct mqe_task ***mqe_ptr_to_set, - struct mqe_task **last_wait, - struct ibv_sge **l_operand, - struct ibv_sge **r_operand) -{ - int rc = OMPI_SUCCESS, exchange, pair_rank, - my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; - - mca_bcol_iboffload_frag_t *preposted_recv_frag; - - mca_bcol_iboffload_task_t *wait_task, - *calc_task; - - struct mqe_task *last_send; - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - size_t calc_size = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT; - - pair_rank = my_exchange_node->rank_exchanges[0]; - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for send from first algorithm partner */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - (*l_operand)->length = calc_size; - for (exchange = 1; exchange < my_exchange_node->n_exchanges; ++exchange) { - pair_rank = my_exchange_node->rank_exchanges[exchange]; - - (*r_operand) = &preposted_recv_frag->sg_entry; - (*r_operand)->length = calc_size; - - /* Calc and send the result to the partner */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - pair_rank, coll_request->qp_index, NULL, - *l_operand, *r_operand, - coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - /* Calc and send the result to myself */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - *l_operand, *r_operand, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if 
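do_exchange() walks the classic recursive-doubling schedule: on exchange e the partner is the rank whose bit e differs, so log2(group size) rounds cover a power-of-two group. The ranks arrive precomputed in the recursive_doubling_tree's rank_exchanges array, but the underlying rule is a single XOR:

#include <stdio.h>

int main(void)
{
    int my_rank = 5, n_exchanges = 3;            /* group of 8       */

    for (int e = 0; e < n_exchanges; e++) {
        printf("exchange %d: partner %d\n", e, my_rank ^ (1 << e));
    }                                            /* partners 4, 7, 1 */
    return 0;
}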
(NULL == wait_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - (*l_operand) = &preposted_recv_frag->sg_entry; - (*l_operand)->length = calc_size; - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from the current algorithm partner */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - } - - (*r_operand) = &preposted_recv_frag->sg_entry; - (*r_operand)->length = calc_size; - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -/* Power of 2 case */ -static int -pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request) -{ - /* local variables */ - int rc = OMPI_SUCCESS, pair_rank, - my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; - - struct mqe_task *last_send, - *last_wait; - - mca_bcol_iboffload_task_t *send_task, - *wait_task, - *calc_task; - - mca_bcol_iboffload_frag_t *send_fragment, - *preposted_recv_frag; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - struct ibv_sge *r_operand = NULL, - *l_operand = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " - "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - my_exchange_node->n_extra_sources)); - - pair_rank = my_exchange_node->rank_exchanges[0]; - - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - pair_rank, coll_request->qp_index, - (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT), 0, - SBUF, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); - if (OPAL_UNLIKELY(NULL == send_fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } -/* Vasily: NO_INLINE ????? 
*/ - /* send my operand to the first algorithm partner */ - send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, - coll_request->qp_index, send_fragment, coll_fragment, NO_INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - - l_operand = &send_fragment->sg_entry; - /* Recursive-doubling exchange */ - rc = do_exchange(iboffload, coll_request, &mqe_ptr_to_set, - &last_wait, &l_operand, &r_operand); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - if (false == coll_request->do_calc_in_cpu) { - /* Calc and send the result to myself */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - l_operand, - r_operand, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } else { - coll_request->l_operand = l_operand->addr; - coll_request->r_operand = r_operand->addr; - } - - *mqe_ptr_to_set = NULL; -/* Vasily: TODO with MACRO */ - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - /* Pasha: need to set to true in upper layer */ - coll_request->user_handle_freed = false; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int rdma_do_exchange(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request, - struct mqe_task ***mqe_ptr_to_set, - struct mqe_task **last_wait, - struct ibv_sge **l_operand, - struct ibv_sge **r_operand) -{ - int rc = OMPI_SUCCESS, exchange, pair_rank, - my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; - - mca_bcol_iboffload_frag_t 
*preposted_recv_frag; - - mca_bcol_iboffload_task_t *wait_task, - *calc_task; - - struct mqe_task *last_send; - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - const size_t calc_size = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT; - size_t remote_offset = calc_size; - size_t self_offset = 0; - - pair_rank = my_exchange_node->rank_exchanges[0]; - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for send from first algorithm partner */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - (*l_operand)->length = 2 * calc_size ; - for (exchange = 1; exchange < my_exchange_node->n_exchanges; ++exchange) { - pair_rank = my_exchange_node->rank_exchanges[exchange]; - /* Pasha: Not used - (*r_operand) = &preposted_recv_frag->sg_entry; - (*r_operand)->length = calc_size; - */ - - remote_offset += 2 * calc_size; - self_offset += 2 * calc_size; - - /* Calc and send the result to the partner */ - /* - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - pair_rank, coll_request->qp_index, NULL, - *l_operand, *r_operand, - coll_request, NO_INLINE); - */ - calc_task = mca_bcol_iboffload_get_rdma_calc_task(iboffload, - pair_rank, coll_request->qp_index, NULL, - *l_operand, NULL, - coll_request, remote_offset); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - /* Calc and send the result to myself */ - /* - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - *l_operand, NULL, - coll_request, NO_INLINE); - */ - calc_task = mca_bcol_iboffload_get_rdma_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - *l_operand, NULL, - coll_request, self_offset); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (NULL == 
wait_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - /* - (*l_operand) = &preposted_recv_frag->sg_entry; - */ - - /* (*l_operand)->length = 2 * calc_size; */ - (*l_operand)->addr = (uint64_t) (uintptr_t) ((unsigned char *) (*l_operand)->addr + 2 * calc_size); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from the current algorithm partner */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - } - /* Pasha: not used - (*r_operand) = &preposted_recv_frag->sg_entry; - (*r_operand)->length = calc_size; - */ - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -#define ALLREDUCE_BASE_OFFSET (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT) - -/* RDMA Recursive doubling + cache friendly version */ -static int -rdma_pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request) -{ - /* local variables */ - int rc = OMPI_SUCCESS, pair_rank, - my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; - - struct mqe_task *last_send, - *last_wait; - - mca_bcol_iboffload_task_t *send_task, - *wait_task, - *calc_task; - - mca_bcol_iboffload_frag_t *send_fragment, - *preposted_recv_frag; - struct ibv_sge operand; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - struct ibv_sge *r_operand = NULL, - *l_operand = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " - "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - my_exchange_node->n_extra_sources)); - - pair_rank = my_exchange_node->rank_exchanges[0]; - - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - pair_rank, coll_request->qp_index, - (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT), - 0, - SBUF, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); - if (OPAL_UNLIKELY(NULL == 
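The cache-friendly RDMA variant above never reuses a landing slot: every exchange advances both the remote target offset and the local operand window by two calc-size slots (one for the partner's result, one for the local copy), so each round reads freshly written memory instead of depending on overwrite ordering. The walk, with an illustrative slot size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t calc_size = 16;     /* illustrative slot size     */
    uint64_t remote = calc_size;       /* peers start one slot apart */
    uint64_t self   = 0;

    for (int exchange = 1; exchange < 4; exchange++) {
        remote += 2 * calc_size;       /* partner's landing slot     */
        self   += 2 * calc_size;       /* my own copy's slot         */
        printf("exchange %d: remote %llu self %llu\n", exchange,
               (unsigned long long) remote, (unsigned long long) self);
    }
    return 0;
}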
send_fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - /* Vasily: NO_INLINE ????? */ - /* send my operand to the first algorithm partner */ - /* send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, - coll_request->qp_index, send_fragment, coll_fragment, NO_INLINE); */ - - send_task = mca_bcol_iboffload_get_rdma_task( - pair_rank, ALLREDUCE_BASE_OFFSET, - send_fragment, iboffload, coll_fragment); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Pasha: ugly but faster, set inline on first send */ - SENDWR(send_task)->send_flags |= IBV_SEND_INLINE; - - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - - /* l_operand = &send_fragment->sg_entry; */ - operand = send_fragment->sg_entry; - l_operand = &operand; - - /* Recursive-doubling exchange */ - rc = rdma_do_exchange(iboffload, coll_request, &mqe_ptr_to_set, - &last_wait, &l_operand, &r_operand); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - /* Pasha: This flow is broken, print error */ - if (false == coll_request->do_calc_in_cpu) { - ML_ERROR(("Calc in CPU must be enabled !!!")); - /* Calc and send the result to myself */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - l_operand, - r_operand, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } else { - coll_request->l_operand = (uint64_t) (uintptr_t) - ((unsigned char *)l_operand->addr); - coll_request->r_operand = (uint64_t) (uintptr_t) - ((unsigned char *) (coll_request->l_operand) + ALLREDUCE_BASE_OFFSET); - } - - *mqe_ptr_to_set = NULL; -/* Vasily: TODO with MACRO */ - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - /* Pasha: need to set to true in upper layer */ - coll_request->user_handle_freed = false; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("MQE task posting failing.\n")); - /* Note: need to clean up */ - return 
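Setting IBV_SEND_INLINE on that first send lets the HCA copy the small operand out of the work request itself: the source buffer is reusable as soon as ibv_post_send() returns and no lkey lookup is performed. It is only legal up to the QP's negotiated max_inline_data. At the verbs level:

#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

static int post_small_inline(struct ibv_qp *qp, void *buf, uint32_t len)
{
    struct ibv_sge sge = {
        .addr   = (uint64_t) (uintptr_t) buf,
        .length = len,
        .lkey   = 0,               /* lkey is ignored for inline data */
    };
    struct ibv_send_wr wr, *bad_wr = NULL;

    memset(&wr, 0, sizeof(wr));
    wr.opcode     = IBV_WR_SEND;
    wr.sg_list    = &sge;
    wr.num_sge    = 1;
    wr.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;

    return ibv_post_send(qp, &wr, &bad_wr); /* buf reusable on return */
}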
rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} -/* - * non power of 2 & EXCHANGE_NODE case, - * need to wait for message from "extra" proc. - */ -static int -non_pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request) -{ - /* local variables */ - int rc = OMPI_SUCCESS, extra_rank, pair_rank, - my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; - - mca_bcol_iboffload_frag_t *calc_fragment, - *preposted_recv_frag; - - mca_bcol_iboffload_task_t *wait_task, - *calc_task; - - struct ibv_sge *r_operand = NULL, - *l_operand = NULL; - - struct mqe_task *last_wait, /* we need ask from completion on last wait */ - *last_send; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " - "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - my_exchange_node->n_extra_sources)); - - extra_rank = my_exchange_node->rank_extra_source; - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, extra_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for data from extra node */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - pair_rank = my_exchange_node->rank_exchanges[0]; - - calc_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - pair_rank, coll_request->qp_index, - MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + - MCA_IBOFFLOAD_CALC_SIZE_EXT, 0, - SBUF, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); - if (OPAL_UNLIKELY(NULL == calc_fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Calc extra node operand with mine and send the result - to the first algorithm partner */ - preposted_recv_frag->sg_entry.length = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + - MCA_IBOFFLOAD_CALC_SIZE_EXT; - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - pair_rank, coll_request->qp_index, calc_fragment, - &preposted_recv_frag->sg_entry, - &calc_fragment->sg_entry, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - 
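/*
 * [Editorial sketch] The exchange schedule consumed here (rank_exchanges[],
 * n_extra_sources, rank_extra_source) comes from the netpatterns
 * recursive-doubling tree. A minimal, self-contained sketch of the same
 * pairing rule, assuming only standard C -- the sketch_* name is
 * hypothetical and is not the netpatterns API:
 */
static int sketch_is_extra_node(int my_rank, int group_size, int *partner)
{
    int pow2 = 1;

    while ((pow2 << 1) <= group_size) {
        pow2 <<= 1;                  /* largest power of two <= group_size */
    }
    if (my_rank >= pow2) {
        *partner = my_rank - pow2;   /* EXTRA node: fold operand into this
                                      * EXCHANGE rank, then wait for the
                                      * final result from it */
        return 1;
    }
    /* EXCHANGE node: pairs with (my_rank + pow2) if that rank exists,
     * then runs log2(pow2) rounds with pair_rank = my_rank ^ (1 << round) */
    *partner = (my_rank + pow2 < group_size) ? my_rank + pow2 : -1;
    return 0;
}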
IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - /* Calc extra node operand with mine and store the result on my buff */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - &preposted_recv_frag->sg_entry, - &calc_fragment->sg_entry, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - l_operand = &preposted_recv_frag->sg_entry; - l_operand->length = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + - MCA_IBOFFLOAD_CALC_SIZE_EXT; - /* Recursive-doubling exchange */ - rc = do_exchange(iboffload, coll_request, &mqe_ptr_to_set, - &last_wait, &l_operand, &r_operand); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - /* Need to send message to "extra" proc => - one more final result calc for extra node */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - extra_rank, coll_request->qp_index, NULL, - l_operand, - r_operand, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - if (false == coll_request->do_calc_in_cpu) { - /* Calc and send the result to myself */ - calc_task = mca_bcol_iboffload_get_calc_task(iboffload, - my_rank, coll_request->qp_index, NULL, - l_operand, - r_operand, coll_request, NO_INLINE); - if (OPAL_UNLIKELY(NULL == calc_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); - - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, my_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - /* RLG need cleanup */ - rc = OMPI_ERR_RESOURCE_BUSY; - goto out_of_resources; - } - - /* Wait for calc from myself */ - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - rc = OMPI_ERR_RESOURCE_BUSY; - goto 
out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } else { - coll_request->l_operand = l_operand->addr; - coll_request->r_operand = r_operand->addr; - } - - *mqe_ptr_to_set = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - assert(NULL != last_wait); - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if(OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int mca_bcol_iboffload_allreduce_init( - bcol_function_args_t *fn_arguments, - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t **coll_request, - bool if_bcol_last) -{ - int rc; - - bool exclude_case; - ompi_free_list_item_t *item; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_allreduce_init.\n")); - - OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n")); - return rc; - } - - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - (*coll_request)->progress_fn = iboffload->allreduce_algth; - - (*coll_request)->if_bcol_last = if_bcol_last; - - exclude_case = (non_pure_recursive_doubling == iboffload->allreduce_algth && - (OMPI_OP_SUM == fn_arguments->op->op_type && - OMPI_DATATYPE_MPI_DOUBLE == fn_arguments->dtype->id)); - - (*coll_request)->do_calc_in_cpu = cm->last_calc_in_cpu && !exclude_case; - - if (false == (*coll_request)->do_calc_in_cpu || - allreduce_extra_node == iboffload->allreduce_algth) { - (*coll_request)->do_calc_in_cpu = false; /* Relevant for extra node only */ - (*coll_request)->completion_cb_fn = - mca_bcol_iboffload_unpack_res_to_user; - } else { - (*coll_request)->completion_cb_fn = - mca_bcol_iboffload_calc_res_to_user; - } - - (*coll_request)->module = iboffload; - (*coll_request)->op = fn_arguments->op; - - (*coll_request)->dtype = fn_arguments->dtype; - (*coll_request)->count = fn_arguments->count; - - (*coll_request)->ml_buffer_index = fn_arguments->buffer_index; - (*coll_request)->buffer_info[SBUF].lkey = iboffload->rdma_block.ib_info.lkey; - - (*coll_request)->order_info = &fn_arguments->order_info; - - /* ML buffer was provided, no need to pack the data. 
- * It is few assumption here: - * we CAN touch and change ML buffer - */ - (*coll_request)->buffer_info[SBUF].buf = (void *) ( - (unsigned char *) fn_arguments->sbuf + - (size_t) fn_arguments->sbuf_offset); - - (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset; - - (*coll_request)->buffer_info[RBUF].buf = (void *) ( - (unsigned char *) fn_arguments->rbuf + - (size_t) fn_arguments->rbuf_offset); - - (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset; - - if(mca_bcol_iboffload_component.enable_rdma_calc) { - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; - } else { - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_REGULAR; - } - - (*coll_request)->n_frag_mpi_complete = 0; - (*coll_request)->n_frag_net_complete = 0; - - fn_arguments->bcol_opaque_data = (void *) (*coll_request); - - /* - * setup collective work request - */ - - /* get collective frag */ - coll_fragment = &((*coll_request)->first_collfrag); - mca_bcol_iboffload_collfrag_init(coll_fragment); - - coll_fragment->mq_index = COLL_MQ; - coll_fragment->alg = RECURSIVE_DOUBLING_ALLREDUCE_ALG; - - coll_fragment->mq_credits = - iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG]; - - /* set pointers for (coll frag) <-> (coll full request) */ - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment); - - coll_fragment->unpack_size = - mca_bcol_base_get_buff_length(fn_arguments->dtype, fn_arguments->count); - - IBOFFLOAD_VERBOSE(10, ("The input data is %lf", *(double *) (*coll_request)->buffer_info[SBUF].buf)); - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_allreduce_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - /* local variables */ - int rc; - - mca_bcol_iboffload_collreq_t *coll_request = NULL; - mca_bcol_iboffload_module_t *iboffload = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - /* Pasha: please do not touch this line, it used for ML buffer recycling barrier call */ - bool if_bcol_last = ((const_args->index_of_this_type_in_collective + 1) == - const_args->n_of_this_type_in_collective); - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - IBOFFLOAD_VERBOSE(10, ("n_of_this_type_in_a_row %d, index_in_consecutive_same_bcol_calls %d", - const_args->n_of_this_type_in_a_row, - const_args->index_in_consecutive_same_bcol_calls + 1)); - - IBOFFLOAD_VERBOSE(10, ("Allreduce started.\n")); - fn_arguments->result_in_rbuf = true; - - rc = mca_bcol_iboffload_allreduce_init(fn_arguments, iboffload, - &coll_request, if_bcol_last); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Get error from mca_bcol_iboffload_allreduce_init.\n")); - return rc; - } - - /* Allreduce starting */ - rc = iboffload->allreduce_algth(iboffload, coll_request); - if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { - return BCOL_FN_NOT_STARTED; - } - - IBOFFLOAD_VERBOSE(10, ("Wait for completions.\n")); - - /* done */ - return BCOL_FN_STARTED; -} - -static int mca_bcol_iboffload_allreduce_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - - if (BCOL_IS_COMPLETED(coll_request)) { - coll_request->user_handle_freed = true; - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - - IBOFFLOAD_VERBOSE(10, ("Allreduce already done.\n")); - return BCOL_FN_COMPLETE; - } - - 
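/*
 * [Editorial sketch] How a caller is expected to drive the init/progress
 * split used throughout this file: the _intra() entry point posts the MQE
 * task list and returns BCOL_FN_STARTED, and the upper layer then polls the
 * matching _progress() function until the signalled WAIT task completes.
 * A minimal driver loop under those assumptions (sketch_* is a hypothetical
 * helper, not part of the component):
 */
static int sketch_drive_allreduce(bcol_function_args_t *args,
                                  struct mca_bcol_base_function_t *fn)
{
    int ret = mca_bcol_iboffload_allreduce_intra(args, fn);

    if (BCOL_FN_STARTED != ret) {
        return ret;                  /* not started: caller may retry later */
    }
    while (BCOL_FN_COMPLETE !=
           (ret = mca_bcol_iboffload_allreduce_progress(args, fn))) {
        opal_progress();             /* drain network completions */
    }
    return ret;
}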
return BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_allreduce_first_call(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request) -{ - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - int i = 0, my_rank = iboffload->ibnet->super.my_index, - n_exchanges = my_exchange_node->n_exchanges, - *exchanges = my_exchange_node->rank_exchanges, - n_extra_src = my_exchange_node->n_extra_sources, - rank_extra_src = my_exchange_node->rank_extra_source; - - mca_bcol_iboffload_endpoint_t *ep = iboffload->endpoints[my_rank]; - - /* Connecting to myself */ - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - - iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] = 0; - - if (0 < n_extra_src) { - iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ - ep = iboffload->endpoints[rank_extra_src]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - for (i = 0; i < n_exchanges; ++i) { - iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ - ep = iboffload->endpoints[exchanges[i]]; - - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ - - if (0 < my_exchange_node->n_extra_sources) { - iboffload->allreduce_algth = - (EXTRA_NODE == my_exchange_node->node_type)? - allreduce_extra_node: - non_pure_recursive_doubling; - } else { - if(mca_bcol_iboffload_component.enable_rdma_calc) { - iboffload->allreduce_algth = - rdma_pure_recursive_doubling; - } else { - iboffload->allreduce_algth = - pure_recursive_doubling; - } - } - - return iboffload->allreduce_algth(iboffload, coll_request); -} - -int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register iboffload Allreduce.\n")); - - comm_attribs.bcoll_type = BCOL_ALLREDUCE; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_allreduce_intra, - mca_bcol_iboffload_allreduce_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_barrier.c b/ompi/mca/bcol/iboffload/bcol_iboffload_barrier.c deleted file mode 100644 index 1eb47f5921..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_barrier.c +++ /dev/null @@ -1,934 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" - -static int mca_bcol_iboffload_barrier_init( - bcol_function_args_t *input_args, - mca_bcol_iboffload_module_t *iboffload, - collective_message_completion_callback_function cb_fn, - struct mca_bcol_iboffload_collreq_t **coll_request); - -/** - * Start barrier - */ - -int mca_bcol_iboffload_barrier_intra_recursive_doubling( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - /* local variables */ - mca_bcol_iboffload_task_t *send_task = NULL, - *wait_task = NULL; - - struct mqe_task **mqe_ptr_to_set = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = NULL; - - struct mqe_task *last_wait = NULL, /* we need ask from completion on last wait */ - *last_send = NULL; /* If it no wait, we need ask for completion on last send */ - - int rc, exchange, extra_rank, pair_rank; - - - mca_bcol_iboffload_frag_t *send_fragment = NULL, - *preposted_recv_frag = NULL; - - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_intra_recursive_doubling.\n")); - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - /* Set mq credits */ - coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG]; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - goto out_of_resources; - } - - coll_fragment->alg = RECURSIVE_DOUBLING_BARRIER_ALG; - - /* - * NOTE: need to generate template, if this will be a multiple fragment - * message. This way we can progress the collective w/o knowing it's - * type - actually, this is not the case for barrier, but just a note - * to remind us that we need to generalize this. - */ - - mqe_ptr_to_set = &coll_fragment->to_post; - - /* - * Fill in the communication pattern - */ - - /* - * If non power of 2, may need to wait for message from "extra" proc. 
- */ - - if (0 < my_exchange_node->n_extra_sources) { - if (EXCHANGE_NODE == my_exchange_node->node_type) { - /* I will participate in the exchange (of the algorithm) - - * wait for signal from extra process */ - extra_rank = my_exchange_node->rank_extra_source; - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, extra_rank, coll_request->qp_index); - - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, - extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } else { - /* I will not participate in the exchange - so just "register" as here */ - extra_rank = my_exchange_node->rank_extra_source; - /* send - no need to send any data, in-order delivery */ - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - extra_rank, coll_request->qp_index, 0, - 0, SBUF,MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank, - coll_request->qp_index, send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - } - - /* loop over exchange send/recv pairs */ - for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) { - /* rank of exchange partner */ - pair_rank = my_exchange_node->rank_exchanges[exchange]; - /* post send */ - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - pair_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - assert(NULL != send_fragment); - - send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, - coll_request->qp_index, - send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - - /* post wait */ - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - preposted_recv_frag, - coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } - - /* if non power of 2, may need to send message to "extra" proc */ - if (0 < my_exchange_node->n_extra_sources) { - if (EXTRA_NODE == my_exchange_node->node_type) { - /* I will not 
participate in the exchange - - * wait for signal from exchange process */ - extra_rank = my_exchange_node->rank_extra_source; - /* post wait */ - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank, - coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, - preposted_recv_frag, - coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - } else { - /* I will participate in the exchange - - * send signal to extra process */ - extra_rank = my_exchange_node->rank_extra_source; - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - extra_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - send_task = mca_bcol_iboffload_get_send_task( - iboffload, extra_rank, - coll_request->qp_index, - send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - } - - /* Fill in the the rest of the coll_fragment */ - IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n")); - /* end of list */ - *mqe_ptr_to_set = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - coll_request->n_frag_mpi_complete = 0; - coll_request->n_frag_net_complete = 0; - - coll_request->user_handle_freed = false; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) { - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - } else { - /* Special flow for ML service barrier , only this function supposed to - post service requests */ - struct mqe_task *bad_mqe = NULL; - assert (MCA_BCOL_IBOFFLOAD_QP_SYNC == coll_request->qp_index ); - /* Post to special service MQ - 1 */ - rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe); - if (OPAL_UNLIKELY(0 != rc)) { - IBOFFLOAD_ERROR(("ibv_post_mqe failed on device (%s), errno says: %s," - " the return code is [%d]\n", - ibv_get_device_name(iboffload->device->dev.ib_dev), - strerror(errno), rc)); - return OMPI_ERROR; - } - } - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -int mca_bcol_iboffload_barrier_intra_recursive_doubling_start( - struct 
mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc; - - rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - iboffload->barrier_algth = - mca_bcol_iboffload_barrier_intra_recursive_doubling; - return - mca_bcol_iboffload_barrier_intra_recursive_doubling(iboffload, coll_request); -} - -int mca_bcol_iboffload_nb_memory_service_barrier_start( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc; - - rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - iboffload->memsync_algth = - mca_bcol_iboffload_barrier_intra_recursive_doubling; - - return - mca_bcol_iboffload_barrier_intra_recursive_doubling - (iboffload, coll_request); -} - -int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - - /* local variables */ - int rc; - mca_bcol_iboffload_collreq_t *coll_request; - mca_bcol_iboffload_module_t *iboffload = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - /* - * recursive doubling - */ - - - IBOFFLOAD_VERBOSE(10, ("Memory syncranization barrier was started\n")); - - /* init barrier collective request */ - rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Get error from mca_bcol_iboffload_barrier_init")); - return rc; - } - - /* set the qp index to special qp that is used only for synchronization */ - coll_request->qp_index = MCA_BCOL_IBOFFLOAD_QP_SYNC; - /* overwrite mq index to run over service setup */ - coll_request->first_collfrag.mq_index = SERVICE_MQ; - - /* start the barrier */ - rc = iboffload->memsync_algth(iboffload, coll_request); - if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { - return rc; - } - - /* complete the barrier - progress releases full request descriptors */ - IBOFFLOAD_VERBOSE(10, ("Memory syncranization barrier was started\n")); - - /* done */ - return BCOL_FN_STARTED; -} - -/* Recursive K - ing*/ -static int recursive_knomial_start_connections(struct mca_bcol_iboffload_module_t *iboffload) -{ - netpatterns_k_exchange_node_t *my_exchange_node = - &iboffload->knomial_exchange_tree; - int k, i, n_exchanges = my_exchange_node->n_exchanges, - **exchanges = my_exchange_node->rank_exchanges, - n_extra_src = my_exchange_node->n_extra_sources, - tree_order = my_exchange_node->tree_order - 1, - rank_extra_src; - - mca_bcol_iboffload_endpoint_t *ep; - - iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 0; - - IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, " - "num of endpoints = %d, iboffload module - %p" - " extra n %d, n_exchanges %d", - iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload, - n_extra_src, n_exchanges)); - if (0 < n_extra_src) { - for (k = 0; k < n_extra_src; k++) { - iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* One send task one wait */ - rank_extra_src = my_exchange_node->rank_extra_sources_array[k]; - ep = iboffload->endpoints[rank_extra_src]; - if (iboffload->ibnet->super.my_index < ep->index) { - while(0 == (ep)->remote_zero_rdma_addr.addr) { - opal_progress(); - } - } else { - IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - } - } - - for (i = 0; 
i < n_exchanges; ++i) { - for (k = 0; k < tree_order; k++) { - iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* One send task one wait */ - ep = iboffload->endpoints[exchanges[i][k]]; - - IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); - if (iboffload->ibnet->super.my_index < ep->index) { - while(0 == (ep)->remote_zero_rdma_addr.addr) { - opal_progress(); - } - } else { - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - } - } - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_barrier_intra_recursive_knomial( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - /* local variables */ - mca_bcol_iboffload_task_t *send_task = NULL, - *wait_task = NULL; - - struct mqe_task **mqe_ptr_to_set = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = NULL; - - struct mqe_task *last_wait = NULL, /* we need ask from completion on last wait */ - *last_send = NULL; /* If it no wait, we need ask for completion on last send */ - - int rc, exchange, extra_rank, pair_rank, k; - - - mca_bcol_iboffload_frag_t *send_fragment = NULL, - *preposted_recv_frag = NULL; - - netpatterns_k_exchange_node_t *my_exchange_node = - &iboffload->knomial_exchange_tree; - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_intra_recursive_knomial. Node type %d\n", my_exchange_node->node_type)); - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - /* Set mq credits */ - coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG]; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - goto out_of_resources; - } - - coll_fragment->alg = RECURSIVE_KNOMIAL_BARRIER_ALG; - - /* - * NOTE: need to generate template, if this will be a multiple fragment - * message. This way we can progress the collective w/o knowing it's - * type - actually, this is not the case for barrier, but just a note - * to remind us that we need to generalize this. - */ - - mqe_ptr_to_set = &coll_fragment->to_post; - - /* - * Fill in the communication pattern - */ - - /* - * If non power of 2, may need to wait for message from "extra" proc. 
- */ - - if (0 < my_exchange_node->n_extra_sources) { - if (EXCHANGE_NODE == my_exchange_node->node_type) { - /* I will participate in the exchange (of the algorithm) - - * wait for signal from extra process */ - for (k = 0; k < my_exchange_node->n_extra_sources; k++) { - extra_rank = my_exchange_node->rank_extra_sources_array[k]; - IBOFFLOAD_VERBOSE(10,("Exchange [ %d ] extra get %d", k, extra_rank)); - - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, extra_rank, coll_request->qp_index); - - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, - extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } - } else { - /* I will not participate in the exchange - so just "register" as here */ - extra_rank = my_exchange_node->rank_extra_sources_array[0]; - IBOFFLOAD_VERBOSE(10,("Send to proxy %d", extra_rank)); - /* send - no need to send any data, in-order delivery */ - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - extra_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank, - coll_request->qp_index, send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - } - - /* loop over exchange send/recv pairs */ - for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) { - for (k = 0; k < my_exchange_node->tree_order - 1; k++) { - /* rank of exchange partner */ - pair_rank = my_exchange_node->rank_exchanges[exchange][k]; - IBOFFLOAD_VERBOSE(10,("Exchange [ %d ,%d ] send to %d", exchange, k, pair_rank)); - /* post send */ - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - pair_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, - coll_request->qp_index, - send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - - for (k = 0; k < my_exchange_node->tree_order - 1; k++) { - - pair_rank = my_exchange_node->rank_exchanges[exchange][k]; - IBOFFLOAD_VERBOSE(10,("Exchange [ %d ,%d ] recv %d", exchange, k, pair_rank)); - /* post wait */ - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, pair_rank, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, - 
preposted_recv_frag, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Exchaging: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } - } - - /* if non power of 2, may need to send message to "extra" proc */ - if (0 < my_exchange_node->n_extra_sources) { - if (EXTRA_NODE == my_exchange_node->node_type) { - /* I will not participate in the exchange - - * wait for signal from exchange process */ - extra_rank = my_exchange_node->rank_extra_sources_array[0]; - IBOFFLOAD_VERBOSE(10,("Wait from proxy %d", extra_rank)); - /* post wait */ - preposted_recv_frag = - mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank, - coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, - preposted_recv_frag, - coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - } else { - /* I will participate in the exchange - - * send signal to extra process */ - for (k = 0; k < my_exchange_node->n_extra_sources; k++) { - extra_rank = my_exchange_node->rank_extra_sources_array[k]; - IBOFFLOAD_VERBOSE(10,("Exchange [ %d ] extra release %d", k, extra_rank)); - - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - extra_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - - send_task = mca_bcol_iboffload_get_send_task( - iboffload, extra_rank, - coll_request->qp_index, - send_fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: " - "Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - } - } - - /* Fill in the the rest of the coll_fragment */ - IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n")); - /* end of list */ - *mqe_ptr_to_set = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - coll_request->n_frag_mpi_complete = 0; - coll_request->n_frag_net_complete = 0; - - coll_request->user_handle_freed = false; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) { - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - } else { - /* Special flow for ML service barrier , only this function supposed to - post service requests */ - struct mqe_task *bad_mqe = NULL; - assert (MCA_BCOL_IBOFFLOAD_QP_SYNC == 
coll_request->qp_index ); - /* Post to special service MQ - 1 */ - rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe); - if (OPAL_UNLIKELY(0 != rc)) { - IBOFFLOAD_ERROR(("ibv_post_mqe failed on device (%s), errno says: %s," - " the return code is [%d]\n", - ibv_get_device_name(iboffload->device->dev.ib_dev), - strerror(errno), rc)); - return OMPI_ERROR; - } - } - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -int mca_bcol_iboffload_barrier_intra_recursive_knomial_start( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc; - - rc = recursive_knomial_start_connections(iboffload); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - iboffload->barrier_algth = - mca_bcol_iboffload_barrier_intra_recursive_knomial; - return - mca_bcol_iboffload_barrier_intra_recursive_knomial(iboffload, coll_request); -} - -int mca_bcol_iboffload_rec_doubling_start_connections(mca_bcol_iboffload_module_t *iboffload) -{ - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - int i, n_exchanges = my_exchange_node->n_exchanges, - *exchanges = my_exchange_node->rank_exchanges, - n_extra_src = my_exchange_node->n_extra_sources, - rank_extra_src = my_exchange_node->rank_extra_source; - - mca_bcol_iboffload_endpoint_t *ep; - - IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, " - "num of endpoints = %d, iboffload module - %p\n", - iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload)); - if (0 < n_extra_src) { - iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task one wait */ - ep = iboffload->endpoints[rank_extra_src]; - - if (iboffload->ibnet->super.my_index < ep->index) { - while(0 == (ep)->remote_zero_rdma_addr.addr) { - opal_progress(); - } - } else { - IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - } - - for (i = 0; i < n_exchanges; ++i) { - iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task one wait */ - ep = iboffload->endpoints[exchanges[i]]; - - if (iboffload->ibnet->super.my_index < ep->index) { - while(0 == (ep)->remote_zero_rdma_addr.addr) { - opal_progress(); - } - } else { - IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - } - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_barrier_init( - bcol_function_args_t *input_args, - mca_bcol_iboffload_module_t *iboffload, - collective_message_completion_callback_function cb_fn, - struct mca_bcol_iboffload_collreq_t **coll_request) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init")); - - OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); - if (OPAL_UNLIKELY(NULL == item)) { - IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - (*coll_request)->progress_fn = iboffload->barrier_algth; - 
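/*
 * [Editorial sketch] The cb_fn slot filled in just below is NULL for the
 * plain barrier; the comment that follows notes that a memory-service
 * barrier needs extra work at completion time. A hypothetical callback of
 * that flavor -- the real typedef,
 * collective_message_completion_callback_function, is defined elsewhere in
 * this component, so the signature below is assumed for illustration only:
 */
static int sketch_memsync_completion_cb(struct mca_bcol_iboffload_collreq_t *req)
{
    /* runs once the request is network-complete: e.g. recycle the ML
     * buffer recorded on the request before it is released */
    IBOFFLOAD_VERBOSE(10, ("sync barrier done, ml buffer index %d",
                           req->ml_buffer_index));
    return OMPI_SUCCESS;
}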
- /* - * For usual barrier it is null. For memory - * service barrier we need some work to do - */ - (*coll_request)->completion_cb_fn = cb_fn; - (*coll_request)->order_info = &input_args->order_info; - - (*coll_request)->module = iboffload; - (*coll_request)->ml_buffer_index = input_args->buffer_index; - (*coll_request)->buffer_info[SBUF].offset = 0; - (*coll_request)->buffer_info[RBUF].offset = 0; - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; - - input_args->bcol_opaque_data = (void *) (*coll_request); - - /* - * setup collective work request - */ - - /* get collective frag */ - coll_fragment = &(*coll_request)->first_collfrag; - mca_bcol_iboffload_collfrag_init(coll_fragment); - - coll_fragment->mq_index = COLL_MQ; - - /* set pointers for (coll frag) <-> (coll full request) */ - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment); - - return OMPI_SUCCESS; -} - -/************************************************************************ - ************************ New style Barrier ***************************** - ***********************************************************************/ - -static int mca_bcol_iboffload_new_style_barrier_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - - if (BCOL_IS_COMPLETED(coll_request)) { - coll_request->user_handle_freed = true; - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - - IBOFFLOAD_VERBOSE(10, ("Barrier already done.\n")); - return BCOL_FN_COMPLETE; - } - - return BCOL_FN_STARTED; -} - -static int mca_bcol_iboffload_new_style_barrier_intra( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variables */ - int rc; - mca_bcol_iboffload_collreq_t *coll_request; - mca_bcol_iboffload_module_t *iboffload = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - /* check for ordering */ - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args); - - /* - * recursive doubling - */ - - - IBOFFLOAD_VERBOSE(10, ("Barrier starts.\n")); - - /* init barrier collective reqeust */ - rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Get error from mca_bcol_iboffload_barrier_init")); - return rc; - } - - /* start the barrier */ - rc = iboffload->barrier_algth(iboffload, coll_request); - if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { - return BCOL_FN_NOT_STARTED; - } - - /* done */ - return BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register iboffload Barrier.\n")); - - comm_attribs.bcoll_type = BCOL_BARRIER; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_new_style_barrier_intra, - mca_bcol_iboffload_new_style_barrier_progress); - - return OMPI_SUCCESS; -} - -int 
mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register sync function\n")); - - comm_attribs.bcoll_type = BCOL_SYNC; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_nb_memory_service_barrier_intra, - mca_bcol_iboffload_new_style_barrier_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.c b/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.c deleted file mode 100644 index dd392117ed..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.c +++ /dev/null @@ -1,1065 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include -#include "opal_stdint.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_bcast.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -#include "opal/include/opal/types.h" - -static int mca_bcol_iboffload_bcast_init( - bcol_function_args_t *fn_arguments, - mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t **coll_request, - bool if_bcol_last, int mq_credits, - collective_message_progress_function progress_fn) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_collfrag_t *coll_fragment; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - - OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); - if (OPAL_UNLIKELY(NULL == item)) { - IBOFFLOAD_ERROR(("Wait for free list failed.\n")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - /* setup call request */ - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - - (*coll_request)->n_fragments = 0; - (*coll_request)->n_frags_sent = 0; - (*coll_request)->n_frag_mpi_complete = 0; - (*coll_request)->n_frag_net_complete = 0; - (*coll_request)->if_bcol_last = if_bcol_last; - (*coll_request)->ml_buffer_index = fn_arguments->buffer_index; - (*coll_request)->completion_cb_fn = NULL; - (*coll_request)->buffer_info[SBUF].buf = (void *) ( - (unsigned char *)fn_arguments->sbuf + - fn_arguments->sbuf_offset); - (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset; - (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset; - - (*coll_request)->dtype = fn_arguments->dtype; - (*coll_request)->count = fn_arguments->count; - (*coll_request)->module = iboffload_module; - /* TODO Pasha: we need it for pending quque. Set it later. 
*/ - (*coll_request)->progress_fn = progress_fn; - /* TODO Pasha: fix it later */ - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_REGULAR; - - (*coll_request)->order_info = &fn_arguments->order_info; - - coll_fragment = &((*coll_request)->first_collfrag); - mca_bcol_iboffload_collfrag_init(coll_fragment); - - /** Vasily ????? */ - /* mq_credits = (*coll_request)->total_tasks_num; */ - coll_fragment->mq_credits = mq_credits; - coll_fragment->mq_index = COLL_MQ; - /* Pasha: just set it to zero */ - coll_fragment->last_wait_num = 0; - coll_fragment->alg = -2; /* used only for debug */ - /* - if (my_rank == algthm_ptr->root) { - coll_fragment->last_wait_num = 0; - } else { - coll_fragment->last_wait_num = algth_lst->last_wait_num; - } - */ - /* Pasha: we have nothing to unpack */ - coll_fragment->unpack_size = 0; - /* coll_fragment->unpack_size = pack_len; */ - /* coll_fragment->alg = RECURSIVE_DOUBLING_TREE_BCAST; */ - - /* set pointers for (coll frag) <-> (coll full request) */ - (*coll_request)->user_handle_freed = false; - - fn_arguments->bcol_opaque_data = (void *) (*coll_request); - - if (true == fn_arguments->root_flag) { - (*coll_request)->root = my_group_index; - } else { - (*coll_request)->root = fn_arguments->root_route->rank; - } - - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS((*coll_request), coll_fragment); - return OMPI_SUCCESS; -} -static inline __opal_attribute_always_inline__ int -binomial_scatter_smsg( - mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collfrag_t *coll_fragment, - struct mqe_task **last_send, - int radix_mask_pow, - uint32_t my_group_index, - size_t send_size - ) -{ - int rc, dst; - int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0; - - while(radix_mask > 0) { - /* For each level of tree, do sends */ - dst = my_group_index ^ radix_mask; - rc = mca_bcol_iboffload_send_small_buff_setup( - last_send, send_size, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); - return rc; - } - - radix_mask >>= 1; - } - - return OMPI_SUCCESS; -} - -#define BINOMIAL_SMALL_SCATTER( \ - iboffload_module, coll_fragment, \ - last_wait, last_send, \ - distance, \ - my_group_index, \ - segment_size \ - ) \ -do { \ - int rc = OMPI_SUCCESS; \ - int dst; \ - int send_size; \ - int dst_boundary_rank; \ - int radix_mask_pow = distance; \ - int radix_mask = (distance) >= 0 ? 
1 << (distance) : 0; \ - IBOFFLOAD_VERBOSE(10, ("BCAST SCATTER %d %d", radix_mask, distance)); \ - \ - while(radix_mask > 0) { \ - /* For each level of tree, do sends */ \ - dst = my_group_index ^ radix_mask; \ - dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow)); \ - \ - IBOFFLOAD_VERBOSE(10, ("Scatter data to %d , len %d offset %d", dst, send_size, send_offset)); \ - \ - rc = mca_bcol_iboffload_send_small_buff_setup( \ - &last_send, send_size, dst, \ - iboffload_module, coll_fragment); \ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { \ - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); \ - return rc; \ - } \ - radix_mask >>= 1; \ - /* radix_mask_pow--; */ \ - } \ -} while(0) - - -int mca_bcol_iboffload_small_msg_bcast_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - - IBOFFLOAD_VERBOSE(10, ("Run progress.\n")); - - /* We should send the data to our children in the tree before - the upper layer will start with buffers recycling */ - if (BCOL_AND_NET_ARE_COMPLETED(coll_request)) { - coll_request->user_handle_freed = true; - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - - IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); - return BCOL_FN_COMPLETE; - } - - return BCOL_FN_STARTED; -} - -static int mca_bcol_iboffload_small_msg_bcast_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - netpatterns_pair_exchange_node_t *recursive_doubling_tree = - &iboffload_module->recursive_doubling_tree; - - int rc, - distance_mask_pow , dst, - group_src, power_of_2_distance; - - uint32_t pack_len; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - - IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast")); - - if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { - IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); - bcol_iboffload_setup_binomial_connection(iboffload_module); - } - - pack_len = coll_request->count * coll_request->dtype->super.size; - IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", - pack_len, - coll_request->count, - coll_request->dtype->super.size)); - - /* it is estimated mq consumption... 
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; - - if (coll_request->root == my_group_index) { - IBOFFLOAD_VERBOSE(10, ("I'm root of the data")); - - /* Send data to the extra peer */ - if (recursive_doubling_tree->n_extra_sources > 0) { - /* send the all data to your extra peer */ - dst = recursive_doubling_tree->rank_extra_source; - IBOFFLOAD_VERBOSE(10,("Sending the dat to Dst %d",dst)); - rc = mca_bcol_iboffload_send_small_buff_setup( - &last_send, pack_len, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_large_buff_setup")); - goto out_of_resources; - } - } - - distance_mask_pow = - iboffload_module->power_of_2 - 1; - - rc = binomial_scatter_smsg(iboffload_module, coll_fragment, - &last_send, distance_mask_pow, - my_group_index, pack_len); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg")); - goto out_of_resources; - } - - goto finalize; - } - - /* prepare and post recv operation */ - group_src = bcol_iboffload_binomial_root_to_src(coll_request->root, - my_group_index, iboffload_module->power_of_2_ranks, - iboffload_module->group_size, &power_of_2_distance); - assert(group_src >= 0); - - if (0 > power_of_2_distance) { - /* the rank is virtual root for this group, receive the data - and scatter gather as root */ - IBOFFLOAD_VERBOSE(10,("Virtual root distance_mask_pow %d ",iboffload_module->power_of_2)); - distance_mask_pow = iboffload_module->power_of_2 - 1; - } else { - distance_mask_pow = power_of_2_distance - 1; - } - - IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, offset %d", - group_src)); - - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len, group_src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - goto out_of_resources; - } - - rc = binomial_scatter_smsg(iboffload_module, coll_fragment, - &last_send, distance_mask_pow, - my_group_index, pack_len); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg")); - goto out_of_resources; - } - - if (recursive_doubling_tree->n_extra_sources > 0 && - iboffload_module->power_of_2 - 1 != distance_mask_pow) { -/* - - if ((recursive_doubling_tree->n_extra_sources > 0) && - ((my_group_index + iboffload_module->power_of_2_ranks ) < - iboffload_module->group_size) ) { - */ - dst = recursive_doubling_tree->rank_extra_source; - /* - dst = my_group_index + iboffload_module->power_of_2_ranks; - */ - - rc = mca_bcol_iboffload_send_small_buff_setup( - &last_send, pack_len, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_small_buff_setup")); - goto out_of_resources; - } - } - -finalize: - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - (coll_request)->n_fragments += 1; - (coll_request)->n_frags_sent += 1; - - if (NULL != last_wait) { - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - 
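/*
 * [Editorial sketch] The two assignments around this point implement a
 * wr_id swap: the hardware-visible wr_id of the signalled task is first
 * saved in the collfrag, then overwritten with the collfrag pointer, so the
 * completion path can recover the fragment from the completed work request.
 * A minimal sketch of the reverse operation (sketch_* is a hypothetical
 * name; the real unpacking lives in the component's progress code):
 */
static mca_bcol_iboffload_collfrag_t *
sketch_collfrag_from_completion(uint64_t completed_wr_id)
{
    mca_bcol_iboffload_collfrag_t *frag =
        (mca_bcol_iboffload_collfrag_t *) (uintptr_t) completed_wr_id;

    /* the original wr_id stashed before posting is still available in
     * frag->signal_task_wr_id; the caller can then mark the fragment
     * complete and update the parent request's completion counters */
    (void) frag->signal_task_wr_id;
    return frag;
}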
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } else { - last_send->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - int rc; - int mq_credits = iboffload_module->power_of_2 + 2; - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_small_msg_bcast_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_intra was started [%d]\n", rc)); - return rc; -} - -static int mca_bcol_iboffload_small_msg_bcast_extra_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - netpatterns_pair_exchange_node_t *recursive_doubling_tree = - &iboffload_module->recursive_doubling_tree; - - int rc, - dst; - int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; - uint32_t pack_len; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - - IBOFFLOAD_VERBOSE(10,("Entering small msg extra iboffload bcast")); - - if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { - IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); - bcol_iboffload_setup_binomial_connection(iboffload_module); - } - - - pack_len = coll_request->count * coll_request->dtype->super.size; - coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; - - IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", - pack_len, - coll_request->count, - coll_request->dtype->super.size)); - - /* it is estimated mq consumption... 
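(the extra-rank path queues at most two tasks) */

/* Annotation, not part of the original source: ranks outside the power-of-two
 * set participate only through a single partner exchange, which is why the
 * extra entry point below asks for just mq_credits = 2.  A sketch of the
 * pairing this assumes -- the helper is illustrative, not component API: */
static inline int extra_rank_partner(int my_rank, int pow2_ranks)
{
    /* extra ranks (>= pow2_ranks) pair downward into the power-of-two set;
       a pow2 rank pairs upward to its extra peer, when one exists */
    return (my_rank >= pow2_ranks) ? my_rank - pow2_ranks
                                   : my_rank + pow2_ranks;
}
/* credit check for the two queued tasks: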
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, - coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - - - if (coll_request->root == my_group_index) { - IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2)); - /* send the all data to your extra peer */ - - dst = recursive_doubling_tree->rank_extra_source; - IBOFFLOAD_VERBOSE(10,("Im extra root sending data to %d \n",dst)); - rc = mca_bcol_iboffload_send_small_buff_setup( - &last_send, pack_len, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_small_buff_setup")); - goto out_of_resources; - } - } else { - /* Not root case */ - dst = recursive_doubling_tree->rank_extra_source; - rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, - pack_len, dst, - iboffload_module, coll_fragment); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - return OMPI_ERROR; - } - } - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - (coll_request)->n_fragments = 1; - (coll_request)->n_frags_sent = 1; - - if (NULL != last_wait) { - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } else { - last_send->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? 
BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *)const_args->bcol_module; - - int rc; - int mq_credits = 2; - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_small_msg_bcast_extra_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_extra_exec was started [%d]\n", rc)); - return rc; -} - -/* Large message scatter-allgather with zero copy */ -int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - int i; - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *)fn_arguments->bcol_opaque_data; - - /* IBOFFLOAD_VERBOSE(10, ("Run general progress. %d == %d * %d == %d", - coll_request->n_frag_mpi_complete, coll_request->n_fragments, - coll_request->n_frag_net_complete, coll_request->n_fragments)); */ - - /* Complete the bcast - progress releases full request descriptors */ - for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { - if (coll_request->n_frag_mpi_complete == coll_request->n_fragments && - coll_request->n_frag_net_complete == coll_request->n_fragments) { - - IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n")); - coll_request->module->device->mpool->mpool_deregister( - coll_request->module->device->mpool, - (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg); - coll_request->buffer_info[SBUF].iboffload_reg = NULL; - - RELEASE_COLLREQ(coll_request); - IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); - return BCOL_FN_COMPLETE; - } - } - - /* IBOFFLOAD_VERBOSE(10, ("Bcast general progress done")); */ - - /* done */ - return BCOL_FN_STARTED; -} -/* Pasha: I have to move it to static inline later, it looks too ugly for macro */ -#define BINOMIAL_SCATTER( \ - iboffload_module, coll_fragment, \ - last_wait, last_send, \ - distance, \ - my_group_index, \ - segment_size, count \ - ) \ -do { \ - int rc = OMPI_SUCCESS; \ - int dst; \ - int send_size; \ - int send_offset; \ - int delta; \ - int dst_boundary_rank; \ - int radix_mask_pow = distance; \ - int radix_mask = (distance) >= 0 ? 1 << (distance) : 0; \ - IBOFFLOAD_VERBOSE(10, ("BCAST SCATTER %d %d", radix_mask, distance)); \ - \ - while(radix_mask > 0) { \ - /* For each level of tree, do sends */ \ - dst = my_group_index ^ radix_mask; \ - dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow)); \ - send_offset = segment_size * dst_boundary_rank; \ - /* Pasha: make sure that we handle the corner cases */ \ - delta = count - send_offset; \ - if (OPAL_UNLIKELY(delta <= 0)) { \ - radix_mask >>= 1; \ - radix_mask_pow--; \ - continue; /* we have to send something, other way it will hang */ \ - } else { \ - /* the tail case */ \ - send_size = (int) \ - (delta - (int)segment_size * radix_mask) < 0 ? 
delta : \ - (int)segment_size * radix_mask; \ - } \ - IBOFFLOAD_VERBOSE(10, ("Scatter data to %d , len %d offset %d", dst, send_size, send_offset)); \ - rc = mca_bcol_iboffload_recv_rtr_setup( \ - &last_wait, dst, iboffload_module, coll_fragment); \ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { \ - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - rc = mca_bcol_iboffload_send_large_buff_setup( \ - &last_send, SBUF, send_offset, send_size, dst, \ - iboffload_module, coll_fragment); \ - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { \ - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - radix_mask >>= 1; \ - radix_mask_pow--; \ - } \ -} while(0) - -static int mca_bcol_iboffload_bcast_scatter_allgather_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - netpatterns_pair_exchange_node_t *recursive_doubling_tree = - &iboffload_module->recursive_doubling_tree; - - int rc, - dst, - group_src, power_of_2_distance, - recv_count; - size_t offset; - int count = coll_request->count * coll_request->dtype->super.size; - int my_group_index = iboffload_module->ibnet->super.my_index; - size_t base_block_size = - (count + iboffload_module->power_of_2_ranks - 1) / - iboffload_module->power_of_2_ranks; - - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - - if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { - bcol_iboffload_setup_binomial_connection(iboffload_module); - } - - /* register memory in mpool/rcache */ - rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count, - &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Cannot register memory: " - "addr - %p, %d bytes.\n", - coll_request->buffer_info[SBUF].buf, count)); - return OMPI_ERROR; - } - - coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey; - - /* it is estimated mq consumption... 
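(the scatter phase posts an RTR wait plus a data send per tree level) */

/* Annotation, not part of the original source: a sketch of the block
 * arithmetic the scatter uses.  base_block_size above is a ceiling division,
 * so the trailing block may be short, and every send window is clipped
 * against the total byte count -- mirroring the BINOMIAL_SCATTER macro.
 * Helper names are illustrative. */
static inline size_t sg_base_block(size_t count_bytes, int pow2_ranks)
{
    /* ceiling division: each rank owns at most this many bytes */
    return (count_bytes + (size_t) pow2_ranks - 1) / (size_t) pow2_ranks;
}

static inline size_t sg_send_offset(size_t base_block, int dst, int radix_mask_pow)
{
    /* a destination owns the sub-tree that starts at its left boundary rank */
    int dst_boundary_rank = dst & ((~(int) 0) << radix_mask_pow);
    return base_block * (size_t) dst_boundary_rank;
}
/* resource check before any task is queued: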
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) || - false == opal_list_is_empty(&iboffload_module->collfrag_pending))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - - if (coll_request->root == my_group_index) { - IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d %d", - iboffload_module->power_of_2, recursive_doubling_tree->n_extra_sources )); - /* for proxy we have little bit more work to do */ - if (recursive_doubling_tree->n_extra_sources > 0) { - /* send the all data to your extra peer */ - dst = recursive_doubling_tree->rank_extra_source; - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_recv_rtr_setup")); - return OMPI_ERROR; - } - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, SBUF, 0, count, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_large_buff_setup")); - return OMPI_ERROR; - } - } - power_of_2_distance = iboffload_module->power_of_2; - - BINOMIAL_SCATTER(iboffload_module, coll_fragment, - last_wait, last_send, power_of_2_distance - 1, - my_group_index, base_block_size, count - ); - /* EXIT OR GO TO Gather */ - goto GATHER; - } - - /* prepare and post recv operation */ - group_src = bcol_iboffload_binomial_root_to_src(coll_request->root, - my_group_index, iboffload_module->power_of_2_ranks, - iboffload_module->group_size, &power_of_2_distance); - - IBOFFLOAD_VERBOSE(10, ("SRC %d DIST %d ranks %d gsize %d root %d my rank %d", - group_src, power_of_2_distance, iboffload_module->power_of_2_ranks, - iboffload_module->group_size, - coll_request->root, my_group_index)); - assert(group_src >= 0); - - if (0 > power_of_2_distance) { - /* the rank is virtual root for this group, receive the data - and scatter gather as root */ - power_of_2_distance = - iboffload_module->power_of_2; - offset = 0; - recv_count = count; - IBOFFLOAD_VERBOSE(10, ("Virtual root %d , set mask to %d", - my_group_index, power_of_2_distance)); - } else { - int my_left_boundary_rank; - int delta; - recv_count = base_block_size * (1 << power_of_2_distance); /* we may receive larger data */ - my_left_boundary_rank = my_group_index & ((~(int)0) << power_of_2_distance ); - offset = (size_t) (base_block_size * my_left_boundary_rank); - delta = count - offset; - if (OPAL_UNLIKELY(delta <= 0)) { - /* no data to recv */ - goto GATHER; - } else { - recv_count = (delta < recv_count) ? 
delta : recv_count; - } - - IBOFFLOAD_VERBOSE(10, ("Recv data set mask to %d", - power_of_2_distance)); - } - - IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, offset %d", - group_src, recv_count, offset)); - - /* Receive data to user buffer */ - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - group_src, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, - SBUF, offset, recv_count, group_src, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - return OMPI_ERROR; - } - - BINOMIAL_SCATTER(iboffload_module, coll_fragment, - last_wait, last_send, power_of_2_distance - 1, - my_group_index, base_block_size, count); - -GATHER: - rc = bcol_iboffload_bcast_binomial_gather(iboffload_module, - &last_send, &last_wait, coll_fragment, - count, base_block_size, power_of_2_distance); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup gather. Return %d", rc)); - return rc; - } - - if (recursive_doubling_tree->n_extra_sources > 0 && - iboffload_module->power_of_2 != power_of_2_distance) { - dst = recursive_doubling_tree->rank_extra_source; - - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_recv_rtr_setup")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, SBUF, 0, count, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_large_buff_setup")); - return OMPI_ERROR; - } - } - - IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n")); - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments += 1; - coll_request->n_frags_sent += 1; - - if (NULL != last_wait) { - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } else { - last_send->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? 
BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - int rc; - int mq_credits = iboffload_module->power_of_2 * 3 + 4; - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_bcast_scatter_allgather_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_bcast_scatter_allgather_intra was started [%d]\n", rc)); - return rc; -} - -static int mca_bcol_iboffload_bcast_scatter_allgather_extra_exec(mca_bcol_iboffload_module_t *iboffload_module, - mca_bcol_iboffload_collreq_t *coll_request) -{ - netpatterns_pair_exchange_node_t *recursive_doubling_tree = - &iboffload_module->recursive_doubling_tree; - - int rc, dst; - int count = coll_request->count * coll_request->dtype->super.size; - int my_group_index = iboffload_module->ibnet->super.my_index; - struct mqe_task *last_send = NULL, - *last_wait = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; - - if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) { - bcol_iboffload_setup_binomial_connection(iboffload_module); - } - - /* register memory in mpool/rcache */ - rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count, - &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Cannot register memory: " - "addr - %p, %d bytes.\n", - coll_request->buffer_info[SBUF].buf, count)); - return OMPI_ERROR; - } - - coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey; - - /* it is estimated mq consumption... 
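(at most one RTR/data task pair is queued here) */

/* Annotation, not part of the original source: the large-buffer exchange
 * below uses a ready-to-receive (RTR) handshake so that the zero-copy RDMA
 * always lands in a buffer that is already posted and registered.  In
 * outline, using the setup calls defined in bcol_iboffload_bcast.h:
 *
 *   receiver:  mca_bcol_iboffload_send_rtr_setup(...);        announce buffer
 *              mca_bcol_iboffload_recv_large_buff_setup(...); wait for payload
 *   sender:    mca_bcol_iboffload_recv_rtr_setup(...);        wait for credit
 *              mca_bcol_iboffload_send_large_buff_setup(...); push payload
 */
/* credit check: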
*/ - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) || - false == opal_list_is_empty(&iboffload_module->collfrag_pending))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - coll_fragment->tail_next = &coll_fragment->to_post; - - /* send or recv the data */ - - if (coll_request->root == my_group_index) { - IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2)); - /* send the all data to your extra peer */ - dst = recursive_doubling_tree->rank_extra_source; - rc = mca_bcol_iboffload_recv_rtr_setup( - &last_wait, dst, iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_recv_rtr_setup")); - return OMPI_ERROR; - } - rc = mca_bcol_iboffload_send_large_buff_setup( - &last_send, SBUF, 0, count, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to" - " mca_bcol_iboffload_send_large_buff_setup")); - return OMPI_ERROR; - } - } else { - /* Not root case */ - dst = recursive_doubling_tree->rank_extra_source; - rc = mca_bcol_iboffload_send_rtr_setup(&last_send, - dst, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, - SBUF, 0, count, dst, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - return OMPI_ERROR; - } - } - - IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n")); - - /* end of list */ - *coll_fragment->tail_next = NULL; - - /* finish initializing full message descriptor */ - coll_request->n_fragments += 1; - coll_request->n_frags_sent += 1; - - if (NULL != last_wait) { - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } else { - last_send->flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - } - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); - - IBOFFLOAD_VERBOSE(10, ("Return success.\n")); - - return BCOL_FN_STARTED; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n")); - rc = - mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); - return (OMPI_SUCCESS != rc) ? 
BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - int rc; - int mq_credits = iboffload_module->power_of_2 * 3 + 4; - bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); - mca_bcol_iboffload_collreq_t *coll_request; - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); - - rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module, - &coll_request, if_bcol_last, mq_credits, - mca_bcol_iboffload_bcast_scatter_allgather_extra_exec); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - rc = coll_request->progress_fn(iboffload_module, coll_request); - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_bcast_scatter_allgather_extra_intra was started [%d]\n", rc)); - return rc; -} - -int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) super; - - int my_group_index = iboffload_module->ibnet->super.my_index; - - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register iboffload Bcast.\n")); - - comm_attribs.bcoll_type = BCOL_BCAST; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - if (my_group_index < iboffload_module->power_of_2_ranks) { - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_small_msg_bcast_intra, - mca_bcol_iboffload_small_msg_bcast_progress); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_bcast_scatter_allgather_intra, - mca_bcol_iboffload_zero_copy_progress); - - } else { - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_small_msg_bcast_extra_intra, - mca_bcol_iboffload_small_msg_bcast_progress); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_bcast_scatter_allgather_extra_intra, - mca_bcol_iboffload_zero_copy_progress); - - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.h b/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.h deleted file mode 100644 index f283ab65e9..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_bcast.h +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_BCAST_H -#define MCA_BCOL_IBOFFLOAD_BCAST_H - -#include "ompi_config.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -#include "opal/include/opal/types.h" - -BEGIN_C_DECLS - -int mca_bcol_iboffload_small_msg_bcast_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments, - struct mca_bcol_base_function_t *const_args); -int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super); - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_recv_rtr_setup( - struct mqe_task **last_wait, - uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - /* Wait for RTR message over credit QP */ - fragment = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, dest_rank, - MCA_BCOL_IBOFFLOAD_QP_CREDIT); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_wait_task( - iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_CREDIT, - iboffload->endpoints[dest_rank]->qps[MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF].qp->lcl_qp); - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_send_small_buff_setup( - struct mqe_task **last_send, - size_t len, uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - mca_bcol_iboffload_collreq_t *coll_request = - coll_fragment->coll_full_req; - - IBOFFLOAD_VERBOSE(10,("Get ml frag that I will send dest rank %d, len %d, lkey %d", - dest_rank, len, iboffload->rdma_block.ib_info.lkey)); - - fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank, - coll_request->qp_index, len, 0, - SBUF, /* this could be problematic */ - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - IBOFFLOAD_VERBOSE(10,("Get an rdma task for dest %d for packet size %d", - dest_rank,len)); - task = mca_bcol_iboffload_get_rdma_task( - dest_rank, 0, - fragment, iboffload, coll_fragment); - - if 
(OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - *last_send = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_send_large_buff_setup( - struct mqe_task **last_send, - int buf_index, int offset, - size_t len, uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - mca_bcol_iboffload_collreq_t *coll_request = - coll_fragment->coll_full_req; - - fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank, - MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, - len, - offset, buf_index, MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_send_task( - iboffload, dest_rank, - MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, - fragment, coll_fragment, NO_INLINE); - - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - *last_send = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_send_rtr_setup( - struct mqe_task **last_send, - uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - /* Recv is ready , Send RTR message */ - fragment = mca_bcol_iboffload_get_send_frag(coll_fragment->coll_full_req, - dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, 0, - 0, RBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_send_task(iboffload, dest_rank, - MCA_BCOL_IBOFFLOAD_QP_CREDIT, - fragment, coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - IBOFFLOAD_VERBOSE(10, ("dest_rank - %d. 
qp index - %d.\n", - dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT)); - - *last_send = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_recv_small_preposted_buff_setup( - struct mqe_task **last_wait, - size_t len, uint32_t dest_rank, - int qp_index, - int nwaits, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - IBOFFLOAD_VERBOSE(10,("Get preposted recv from rank %d", dest_rank)); - - fragment = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, dest_rank, - qp_index); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, nwaits, - fragment, qp_index, NULL); - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - *last_wait = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_recv_small_buff_setup( - struct mqe_task **last_wait, - size_t len, uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - mca_bcol_iboffload_collreq_t *coll_request = - coll_fragment->coll_full_req; - - IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank)); - - fragment = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, dest_rank, - coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1, - fragment, coll_request->qp_index, NULL); - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - *last_wait = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int -mca_bcol_iboffload_recv_large_buff_setup( - struct mqe_task **last_wait, - int buf_index, int offset, - size_t len, uint32_t dest_rank, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *coll_fragment) -{ - int num_preposted; - - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_frag_t *fragment; - - mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req; - - /* Post message to recv queue for large messages */ - fragment = mca_bcol_iboffload_get_ml_frag( - iboffload, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len, - coll_request->buffer_info[buf_index].iboffload_reg->mr->lkey, - (uint64_t)((unsigned char *)coll_request->buffer_info[buf_index].buf + offset)); - if (OPAL_UNLIKELY(NULL == fragment)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - num_preposted = 
mca_bcol_iboffload_prepost_ml_recv_frag( - MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, - dest_rank, fragment, iboffload); - if (0 >= num_preposted) { - IBOFFLOAD_ERROR(("Failed to prepost recv fragments " - "return code - %d; dest_rank - %d", - num_preposted, dest_rank)); - - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1, - fragment, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, NULL); - if (OPAL_UNLIKELY(NULL == task)) { - IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n")); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - *last_wait = &task->element; - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task); - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ -int bcol_iboffload_binomial_root_to_src(int group_root, int my_rank, - int pow2_size, int group_size, int *distance) -{ - int root, relative_rank, src, - pow2_distance = 0, i; - - if (group_root < pow2_size) { - root = group_root; - } else { - /* the source of the data is extra node, - the real root it represented by some rank from - pow2 group */ - root = group_root - pow2_size; - /* shortcut for the case when my rank is root for the group */ - if (my_rank == root) { - *distance = -1; - return group_root; - } - } - - relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size : - my_rank - root; - - for (i = 1; i < pow2_size; i<<=1, pow2_distance++) { - if (relative_rank & i) { - src = my_rank ^ i; - if (src >= pow2_size) - src -= pow2_size; - - *distance = pow2_distance; - IBOFFLOAD_VERBOSE(10, ("AAAAA d %d rel %d it %d root %d my %d", *distance, relative_rank, i, root, my_rank)); - return src; - } - } - - /* error case */ - *distance = -1; - return -1; -} - -static inline void bcol_iboffload_setup_binomial_connection(mca_bcol_iboffload_module_t *iboffload) -{ - netpatterns_pair_exchange_node_t *my_exchange_node = - &iboffload->recursive_doubling_tree; - - int i, n_exchanges = my_exchange_node->n_exchanges, - *exchanges = my_exchange_node->rank_exchanges, - n_extra_src = my_exchange_node->n_extra_sources, - my_rank = iboffload->ibnet->super.my_index, - rank_extra_src = my_exchange_node->rank_extra_source; - - mca_bcol_iboffload_endpoint_t *ep; - - IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); - - if (0 < n_extra_src) { - ep = iboffload->endpoints[rank_extra_src]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - -#if OPAL_ENABLE_DEBUG - { - int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - assert(NULL != ep->qps[qp_index].qp->lcl_qp); - IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", - ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num)); - } - } -#endif - - /* Connect to all extra nodes */ - if (EXTRA_NODE == my_exchange_node->node_type) { - for (i = iboffload->power_of_2_ranks; - i < iboffload->num_endpoints; ++i) { - if (i != my_rank) { - ep = iboffload->endpoints[i]; - - IBOFFLOAD_VERBOSE(10, ("subgroup rank %d: Connect to rank %d.\n", my_rank, i)); - - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - -#if OPAL_ENABLE_DEBUG - { - int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - assert(NULL != ep->qps[qp_index].qp->lcl_qp); - IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", - ep, qp_index, 
ep->qps[qp_index].qp->lcl_qp->qp_num)); - } - } -#endif - } - } - } - } - - for (i = 0; i < n_exchanges; ++i) { - ep = iboffload->endpoints[exchanges[i]]; - - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - -#if OPAL_ENABLE_DEBUG - { - int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - assert(NULL != ep->qps[qp_index].qp->lcl_qp); - IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.", - ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num)); - } - } -#endif - } - /* set the connection status to connected */ - iboffload->connection_status[RECURSIVE_DOUBLING_TREE_BCAST] = true; -} - -static inline __opal_attribute_always_inline__ -int bcol_iboffload_bcast_binomial_gather(mca_bcol_iboffload_module_t *iboffload_module, - struct mqe_task **last_send, struct mqe_task **last_wait, - mca_bcol_iboffload_collfrag_t *coll_fragment, - int count, int base_block_size, int radix_mask_pow) -{ - int rc; - int i; - int my_group_index = iboffload_module->ibnet->super.my_index; - int delta, rdelta; - - IBOFFLOAD_VERBOSE(10, ("bcol_iboffload_bcast_binomial_gather %d %d", - radix_mask_pow, my_group_index)); - - /* we assume the iteration #iteration already was completed with probe */ - for (i = 0; i < iboffload_module->power_of_2; i++) { - int pow2 = 1 << i; - int peer_index = my_group_index ^ pow2; - int slen, rlen, - send_offset, - recv_offset; - - if (i > radix_mask_pow) { - slen = rlen = pow2 * base_block_size; - send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); - recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); - - rdelta = count - recv_offset; - if (rdelta > 0) { - IBOFFLOAD_VERBOSE(10, ("Recv1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", - pow2, 1 << iboffload_module->power_of_2, - recv_offset, rlen, peer_index)); - - rc = mca_bcol_iboffload_send_rtr_setup(last_send, - peer_index, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); - return OMPI_ERROR; - } - } - - delta = count - send_offset; - if (delta > 0) { - if (delta < slen) { - /* recv the tail */ - slen = delta; - } - - IBOFFLOAD_VERBOSE(10, ("Send1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", - pow2, 1 << iboffload_module->power_of_2, - send_offset, slen, peer_index)); - rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); - return OMPI_ERROR; - } - } - - if (rdelta > 0) { - if (rdelta < rlen) { - /* recv the tail */ - rlen = rdelta; - } - - rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait, - SBUF, recv_offset, rlen, peer_index, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - return OMPI_ERROR; - } - } - - } else if (i == radix_mask_pow) { - /* only receive data */ - rlen = pow2 * base_block_size; - recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); - delta = count - recv_offset; - if (0 >= delta) { - /* we have nothing to send, skip the iteration */ - continue; - } - if (delta < rlen) { - /* 
recv the tail */ - rlen = delta; - } - /* receive data from the peer */ - IBOFFLOAD_VERBOSE(10, ("Recv2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", - pow2, - 1 << iboffload_module->power_of_2, - recv_offset, - rlen, peer_index)); - rc = mca_bcol_iboffload_send_rtr_setup(last_send, - peer_index, iboffload_module, - coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait, - SBUF, recv_offset, rlen, peer_index, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); - return OMPI_ERROR; - } - } else if (i < radix_mask_pow) { - /* Only send data */ - slen = pow2 * base_block_size; - send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); - delta = count - send_offset; - if (0 >= delta) { - /* we have nothing to send, skip the iteration */ - continue; - } - - if (delta < slen) { - slen = delta; - } - - IBOFFLOAD_VERBOSE(10, ("Send2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d", - pow2, - 1 << iboffload_module->power_of_2, - send_offset, - slen, - peer_index)); - - rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index, - iboffload_module, coll_fragment); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Failed to isend data")); - return OMPI_ERROR; - } - } - } - - return OMPI_SUCCESS; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.c b/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.c deleted file mode 100644 index dc447d879d..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include - -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" - -static void -collfrag_constructor(struct mca_bcol_iboffload_collfrag_t *collfrag) -{ - collfrag->n_sends = 0; - collfrag->n_sends_completed = 0; - - memset(collfrag->pre_posted_recvs, 0, - sizeof(struct mca_bcol_iboffload_task_t *) * MAX_MQE_TASKS); - - collfrag->signal_task_wr_id = (uint64_t) 0; - collfrag->complete = false; - - collfrag->seq_n = -1; - collfrag->coll_full_req = NULL; - - collfrag->unpack_size = 0; - - collfrag->tasks_posted = 0; - collfrag->to_post = NULL; - collfrag->task_next = NULL; - collfrag->tasks_to_release = NULL; - - collfrag->in_pending_list = false; -} - -static void -collfrag_destruct(struct mca_bcol_iboffload_collfrag_t *collfrag) -{ -} - -OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collfrag_t, - ompi_free_list_item_t, - collfrag_constructor, - collfrag_destruct); diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h b/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h deleted file mode 100644 index 3be53aacaa..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
 - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_COLLFRAG_H -#define MCA_BCOL_IBOFFLOAD_COLLFRAG_H - -#include "ompi_config.h" - -#include -#include -#include - -#include "bcol_iboffload.h" - -#include "opal/class/ompi_free_list.h" - -BEGIN_C_DECLS - -#define MAX_MQE_TASKS 128 /* Pasha - do we want to make it dynamic? */ - -struct mca_bcol_iboffload_task_t; -struct mca_bcol_iboffload_collreq_t; - -/* collective fragment descriptor */ -struct mca_bcol_iboffload_collfrag_t { - ompi_free_list_item_t super; - - /* number of asynchronous sends scheduled */ - uint32_t n_sends; - - /* number of sends completed */ - uint32_t n_sends_completed; - - /* Algorithm ID that was used for this fragment */ - int32_t alg; - - /* pre-posted receive sources */ - struct mca_bcol_iboffload_task_t *pre_posted_recvs[MAX_MQE_TASKS]; - - /* cache here pointer to signaled task */ - uint64_t signal_task_wr_id; - - /* mwr completion from the mcq */ - volatile bool complete; - - /* sequence number - we use it for - correct ordering of resource release */ - uint32_t seq_n; - - /* pointer to the full collective request descriptor */ - struct mca_bcol_iboffload_collreq_t *coll_full_req; - - size_t unpack_size; - - bool in_pending_list; - - /* Num of posted tasks */ - int tasks_posted; - - /* Pointer to head of the not-yet-posted elements list */ - struct mqe_task *to_post; - - /* Pointer to tail next */ - struct mqe_task **tail_next; - - /* List of all the tasks of this coll frag */ - struct mca_bcol_iboffload_task_t *tasks_to_release; - - /* Pointer to the next elem in the all-tasks list */ - struct mca_bcol_iboffload_task_t **task_next; - - /* Num of needed mq credits */ - int mq_credits; - - /* MQ index used for this frag */ - int mq_index; - - /* - * Last wait sequence number; zero means
- * there isn't any wait in the coll request - */ - int32_t last_wait_num; - /* fragment descriptor for non contiguous data */ - bcol_fragment_descriptor_t *bcol_frag_info; - /* frag-len of ml buffer */ - int frag_len; -}; -typedef struct mca_bcol_iboffload_collfrag_t mca_bcol_iboffload_collfrag_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collfrag_t); - -static inline __opal_attribute_always_inline__ - void mca_bcol_iboffload_collfrag_init( - mca_bcol_iboffload_collfrag_t *cf) -{ - /* init the request */ - cf->n_sends = 0; - cf->complete = false; - cf->n_sends_completed = 0; - cf->alg = -1; - cf->in_pending_list = false; - cf->tail_next = NULL; - cf->tasks_posted = 0; - cf->to_post = NULL; - cf->mq_credits = 0; - cf->mq_index = 0; - cf->tasks_to_release = NULL; - cf->task_next = &cf->tasks_to_release; - cf->last_wait_num = 0; -} - -static inline __opal_attribute_always_inline__ - struct mca_bcol_iboffload_collfrag_t * - mca_bcol_iboffload_get_collfrag(void) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_collfrag_t *cf; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - /* blocking allocation for collectives fragment */ - OMPI_FREE_LIST_GET_MT(&cm->collfrags_free, item); - if (OPAL_UNLIKELY(NULL == item)) { - IBOFFLOAD_ERROR(("Failed to allocated collfrag.\n")); - return NULL; - } - - cf = (mca_bcol_iboffload_collfrag_t*) item; - mca_bcol_iboffload_collfrag_init(cf); - - return cf; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.c b/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.c deleted file mode 100644 index bae677bb51..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "bcol_iboffload_collreq.h" - -static void -collreq_construct(struct mca_bcol_iboffload_collreq_t *collreq) -{ - int i; - collreq->n_fragments = 0; - collreq->n_frag_mpi_complete = 0; - collreq->n_frag_net_complete = 0; - collreq->user_handle_freed = false; - - for (i = 0; i < BCOL_IBOFFLOAD_BUFFERS; i++) { - collreq->buffer_info[i].buf = NULL; - collreq->buffer_info[i].offset = 0; - collreq->buffer_info[i].iboffload_reg = NULL; - } - - OBJ_CONSTRUCT(&collreq->work_requests, opal_list_t); - OBJ_CONSTRUCT(&collreq->first_collfrag, mca_bcol_iboffload_collfrag_t); - - OBJ_CONSTRUCT(&collreq->send_convertor, opal_convertor_t); - OBJ_CONSTRUCT(&collreq->recv_convertor, opal_convertor_t); -} - -static void -collreq_destruct(struct mca_bcol_iboffload_collreq_t *collreq) -{ - OBJ_DESTRUCT(&collreq->work_requests); - OBJ_DESTRUCT(&collreq->first_collfrag); - - OBJ_DESTRUCT(&collreq->send_convertor); - OBJ_DESTRUCT(&collreq->recv_convertor); -} - -OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collreq_t, - ompi_request_t, - collreq_construct, - collreq_destruct); diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.h b/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.h deleted file mode 100644 index 31344009d3..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_collreq.h +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_COLLREQ_H -#define MCA_BCOL_IBOFFLOAD_COLLREQ_H - -#include "ompi_config.h" - -#include -#include -#include - -#include "opal/class/ompi_free_list.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_collfrag.h" - -#define SBUF 0 -#define RBUF 1 - -#define BCOL_IBOFFLOAD_BUFFERS 2 - -BEGIN_C_DECLS - -struct mca_bcol_iboffload_reg_t; - -/* - * collective progress function - */ -typedef int (*collective_message_progress_function)( - struct mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *full_message_descriptor); -/* - * callback function to be called after the collective work request - * completes. This is invoked in user-space, and is typically where - * data may be copied out of library buffers, or when any other user- - * level protocol may be completed - * - * input: - * callback data: typically, this may be the work request just finished - */ -typedef int (*collective_message_completion_callback_function)( - void *callback_data); - -struct mca_bcol_iboffload_buff_info { - void *buf; - size_t offset; - uint32_t lkey; - struct mca_bcol_iboffload_reg_t *iboffload_reg; -}; -typedef struct mca_bcol_iboffload_buff_info mca_bcol_iboffload_buff_info; - -/* - * Collective message descriptor - * the mca_bcol_iboffload_message_desc_t was replaced with mca_bcol_iboffload_collreq_t - * ************************************************************************************************* - * - * Brief description of iboffload collective request dependencies: - * - * mca_bcol_iboffload_collreq_t <----<< Full coll request - * | - * --(0)-- mca_bcol_iboffload_collfrag_t <----<< Fragment of coll request ( for example - * | | 10MB Bcast maybe split to 2MB fragments ) - * | | - * | --(0)-- mca_bcol_iboffload_task_t---mqe_task - * | | | - * | | ---mca_bcol_iboffload_frag_t---ibv_sge - * | --(1)-- mca_bcol_iboffload_task_t---mqe_task - * | | | - * | | ---mca_bcol_iboffload_frag_t---ibv_sge - * | ..(M).. - * | - * --(1)-- mca_bcol_iboffload_collfrag_t - * | - * ..(N).. 
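 *
 * (annotation, not in the original: one collreq fans out into N collfrags,
 *  each collfrag chains M mqe_task work elements, and every task points at
 *  one fragment/ibv_sge pair that the HCA consumes)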
 - * - * ************************************************************************************************* - */ - -struct mca_bcol_iboffload_collreq_t { - ompi_request_t super; - - /* op type */ - struct ompi_op_t *op; - - /* Sometimes the operation that should be performed - by the IB is different from the mpi_op and is then set - by the pack_data_for_calc function */ - enum ibv_m_wr_calc_op actual_ib_op; - - /* Sometimes the data type that should be used by the IB - to perform the calc is different from the mpi dtype, - and is then set by the pack_data_for_calc function */ - enum ibv_m_wr_data_type actual_ib_dtype; - - /* data type */ - struct ompi_datatype_t *dtype; - - /* convertor for send operation */ - opal_convertor_t send_conv; - - /* convertor for recv operation */ - opal_convertor_t recv_conv; - - /* - * count (in data type units) - */ - uint64_t count; - - /* - * root of collective operation - */ - int root; - - /* number of message fragments */ - int n_fragments; - - /* number of fragments sent - all resources for a fragment are allocated - * or none at all are - */ - int n_frags_sent; - - /* number of fragments completed from the MPI perspective */ - int n_frag_mpi_complete; - - /* number of fragments completed from a network perspective */ - int n_frag_net_complete; - - /* collective is free and may be released - message complete from the - ** MPI perspective, the network perspective, and the user is done - ** with the message handle */ - volatile bool user_handle_freed; - - /* list of collective fragments - only 1 for now */ - opal_list_t work_requests; - - /* message progress function */ - collective_message_progress_function progress_fn; - - /* work request completion callback function */ - collective_message_completion_callback_function completion_cb_fn; - - /* index of qp with enough length of buffs for this collective */ - int qp_index; - - bool if_bcol_last; - - /* The flag is used for the last bcol to indicate if the calculation should be done by the cpu */ - bool do_calc_in_cpu; - - /* in Allreduce case, if (true == do_calc_in_cpu) => - the final result will be calculated on the local CPU */ - uint64_t l_operand; - uint64_t r_operand; - - /* caching ML-rdma buffer descriptor */ - mca_bcol_iboffload_rdma_buffer_desc_t *ml_rdma_desc; - - /* ML buffer index code */ - int ml_buffer_index; - - /* In the current implementation the coll request is connected to a single - iboffload module */ - struct mca_bcol_iboffload_module_t *module; - - mca_bcol_iboffload_collfrag_t first_collfrag; - - /* Send/recv buff info - user buffer registration, if needed, etc.
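(one entry per direction, indexed by SBUF / RBUF) */

/* Annotation, not part of the original source: a minimal sketch of how an
 * effective address would be derived from a buff_info entry; the helper is
 * hypothetical, not component API. */
static inline void *buff_info_addr(const mca_bcol_iboffload_buff_info *bi,
                                   size_t extra_offset)
{
    /* user base pointer + cached offset + caller-supplied offset */
    return (char *) bi->buf + bi->offset + extra_offset;
}
/* the array itself: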
*/ - mca_bcol_iboffload_buff_info buffer_info[BCOL_IBOFFLOAD_BUFFERS]; - - /* My bi nominal tree children in this collective */ - int *bi_nominal_tree_children; - - /* Convertors for send/recv if needed */ - opal_convertor_t send_convertor; - opal_convertor_t recv_convertor; - - /* Order info from upper layer */ - mca_bcol_base_order_info_t *order_info; -}; -typedef struct mca_bcol_iboffload_collreq_t mca_bcol_iboffload_collreq_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collreq_t); - -#define COLLREQ_IS_DONE(cr) (cr->user_handle_freed && \ - (cr->n_frag_mpi_complete == cr->n_fragments) && \ - (cr->n_frag_net_complete == cr->n_fragments)) - -#define RELEASE_COLLREQ(cr) \ -do { \ - (cr)->user_handle_freed = false; \ - OMPI_FREE_LIST_RETURN_MT(&mca_bcol_iboffload_component.collreqs_free, \ - (ompi_free_list_item_t *) (cr)); \ -} while (0) - -static inline __opal_attribute_always_inline__ - int mca_bcol_iboffload_free_resources_and_move_to_pending( - mca_bcol_iboffload_collfrag_t *coll_fragment, - mca_bcol_iboffload_module_t *iboffload) -{ - int rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_fragment, - iboffload->device->frags_free); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - - IBOFFLOAD_VERBOSE(10, ("iboffload - %p, coll_fragment - %p, " - "coll frag in_pending_list ? - %d, pending_list size - %d.\n", - iboffload, coll_fragment, coll_fragment->in_pending_list, - opal_list_get_size(&iboffload->collfrag_pending))); - - BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, coll_fragment->mq_index, coll_fragment->mq_credits); - - /* Remove coll frag from coll request opal_list */ - opal_list_remove_item(&coll_fragment->coll_full_req->work_requests, - (opal_list_item_t *) coll_fragment); - - if (false == coll_fragment->in_pending_list) { - /* Put the collfrag on pending list */ - coll_fragment->in_pending_list = true; - opal_list_append(&iboffload->collfrag_pending, - (opal_list_item_t *) coll_fragment); - } else { - /* The item is already on pending list => - insert it first that not break order - between frags on the list */ - opal_list_prepend(&iboffload->collfrag_pending, - (opal_list_item_t *) coll_fragment); - } - - return OMPI_SUCCESS; -} - -/* Forward declaration */ -struct mca_bcol_iboffload_reg_t; -static inline __opal_attribute_always_inline__ - int mca_bcol_iboffload_prepare_buffer( - void *buffer, - size_t size, - struct mca_bcol_iboffload_reg_t **registration_handler, - mca_bcol_iboffload_module_t *iboffload) -{ - int rc; - mca_mpool_base_registration_t *reg = NULL; - - assert(size > 0); - rc = iboffload->device->mpool->mpool_register( - iboffload->device->mpool, - buffer, size, - (uint32_t) 0 /* flags */, - ®); - - *registration_handler = - (struct mca_bcol_iboffload_reg_t *) reg; - - return rc; -} - -int mca_bcol_iboffload_coll_req_implement( - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request); - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_component.c b/ompi/mca/bcol/iboffload/bcol_iboffload_component.c deleted file mode 100644 index 01cee48fe7..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_component.c +++ /dev/null @@ -1,1075 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. 
- * Copyright (c) 2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include - -#include - -#include "ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "ompi/mca/common/ofacm/connect.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/common/ofacm/base.h" -#include "ompi/mca/common/verbs/common_verbs.h" - -#include "opal/util/argv.h" -#include "opal/include/opal/types.h" - -#include "bcol_iboffload_mca.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_qp_info.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" - -/* - * Public string showing the bcol ompi_sm V2 component version number - */ -const char *mca_bcol_iboffload_component_version_string = - "Open MPI bcol - iboffload collective MCA component version " OMPI_VERSION; - -/* - * Local functions - */ - -static int setup_qps(void); -static int iboffload_open(void); -static int iboffload_close(void); - -#define GET_IB_DTYPE_BY_CTYPE(ctype, is_int, ib_dtype) \ -do { \ - switch (sizeof(ctype)) { \ - case 1: \ - ib_dtype = ((is_int) ? IBV_M_DATA_TYPE_INT8 : IBV_M_DATA_TYPE_INVALID); \ - break; \ - case 2: \ - ib_dtype = ((is_int) ? IBV_M_DATA_TYPE_INT16 : IBV_M_DATA_TYPE_INVALID); \ - break; \ - case 4: \ - ib_dtype = ((is_int) ? IBV_M_DATA_TYPE_INT32 : IBV_M_DATA_TYPE_FLOAT32); \ - break; \ - case 8: \ - ib_dtype = ((is_int) ? 
IBV_M_DATA_TYPE_INT64 : IBV_M_DATA_TYPE_FLOAT64); \ - break; \ - default: \ - ib_dtype = IBV_M_DATA_TYPE_INVALID; \ - } \ -} while (0) - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ -mca_bcol_iboffload_component_t mca_bcol_iboffload_component = { - - /* First, fill in the super */ - - .super = { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .bcol_version = { - MCA_BCOL_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "iboffload", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - - .mca_open_component = iboffload_open, - .mca_close_component = iboffload_close, - .mca_register_component_params = mca_bcol_iboffload_register_params, - }, - - .collm_init_query = mca_bcol_iboffload_init_query, - .collm_comm_query = mca_bcol_iboffload_comm_query, - .coll_support = mca_bcol_iboffload_coll_supported, - .coll_support_all_types = mca_bcol_iboffload_coll_support_all_types, - .init_done = false, - .need_ordering = true, /* collective calls with iboffload should to be ordered */ - }, - /* iboffload-component specifc information */ - .verbose = 0, /* verbose */ - .num_qps = 0, /* number of qps to use */ - .warn_default_gid_prefix = false, /* warn_default_gid_prefix */ - .warn_nonexistent_if = false, /* warn_nonexistent_if */ - .free_list_num = 0, /* free_list_num */ - .free_list_max = 0, /* free_list_max */ - .free_list_inc = 0, /* free_list_inc */ - .mpool_name = NULL, /* mpool_name */ - .cq_size = 0, /* cq_size */ - .max_inline_data = 0, /* max_inline_data */ - .pkey_val = 0, /* pkey_val */ - .qp_ous_rd_atom = 0, /* qp_ous_rd_atom */ - .mtu = 0, /* mtu */ - .min_rnr_timer = 0, /* min_rnr_timer */ - .timeout = 0, /* timeout */ - .retry_count = 0, /* retry_count */ - .rnr_retry = 0, /* rnr_retry */ - .max_rdma_dst_ops = 0, /* max_rdma_dst_ops */ - .service_level = 0, /* service_level */ - .buffer_alignment = 0, /* buffer_alignment */ - .max_mqe_tasks = 0, /* max_mqe_tasks */ - .max_mq_size = 0, /* max_mq_size */ - .if_include = NULL, /* if_include */ - .if_include_list = NULL, /* if_include_list */ - .if_exclude = NULL, /* if_exclude */ - .if_exclude_list = NULL, /* if_exclude_list */ - .if_list = NULL, /* if_list */ - .ib_devs = NULL, /* ib_devs */ - .num_devs = 0, /* num_devs */ - .receive_queues = NULL, /* receive_queues */ -}; - -static int mca_bcol_iboffload_dummy_init_query( - bool enable_progress_threads, bool enable_mpi_threads) -{ - return OMPI_SUCCESS; -} - -static void mca_bcol_iboffload_device_constructor - (mca_bcol_iboffload_device_t *device) -{ - /* Init OFACM stuf */ - device->dev.ib_dev = NULL; - device->dev.ib_dev_context = NULL; - device->dev.capabilities = 0; - /* device->dev.type = MCA_COMMON_OFACM_COLL;*/ - /* Init other stuff */ - device->ib_pd = NULL; - device->ib_cq = NULL; - device->ports = NULL; - - device->mpool = NULL; - device->ib_mq_cq = NULL; - device->frags_free = NULL; - - device->activated = false; - device->num_act_ports = 0; - - memset(&device->ib_dev_attr, 0, sizeof(struct ibv_device_attr)); - memset(&device->dummy_reg, 0, sizeof( mca_bcol_iboffload_reg_t)); -} - -static void mca_bcol_iboffload_device_destructor - (mca_bcol_iboffload_device_t *device) -{ - int qp_index, num_qps = mca_bcol_iboffload_component.num_qps; - - IBOFFLOAD_VERBOSE(10, ("Device %s will be destroyed.\n", - 
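The GET_IB_DTYPE_BY_CTYPE macro above selects the IB calc data type from sizeof(ctype) rather than from the type's name, so a mapping such as 'long' stays correct on both LP64 and ILP32 ABIs. Here is a compilable sketch of the same sizeof-driven dispatch, with a made-up wire_dtype enum standing in for the vendor's ibv_m_wr_data_type.

    #include <stdio.h>

    /* Hypothetical stand-in for the vendor's calc data-type enum. */
    enum wire_dtype { WIRE_INVALID, WIRE_INT8, WIRE_INT16, WIRE_INT32,
                      WIRE_INT64, WIRE_FLOAT32, WIRE_FLOAT64 };

    /* Same idea as GET_IB_DTYPE_BY_CTYPE: the C type's size, not its
     * name, picks the wire encoding. */
    #define WIRE_DTYPE_BY_CTYPE(ctype, is_int)                          \
        (sizeof(ctype) == 1 ? ((is_int) ? WIRE_INT8  : WIRE_INVALID) :  \
         sizeof(ctype) == 2 ? ((is_int) ? WIRE_INT16 : WIRE_INVALID) :  \
         sizeof(ctype) == 4 ? ((is_int) ? WIRE_INT32 : WIRE_FLOAT32) :  \
         sizeof(ctype) == 8 ? ((is_int) ? WIRE_INT64 : WIRE_FLOAT64) :  \
                              WIRE_INVALID)

    int main(void)
    {
        printf("int    -> %d\n", WIRE_DTYPE_BY_CTYPE(int, 1));    /* 4 bytes on mainstream ABIs */
        printf("long   -> %d\n", WIRE_DTYPE_BY_CTYPE(long, 1));   /* size-dependent             */
        printf("double -> %d\n", WIRE_DTYPE_BY_CTYPE(double, 0)); /* WIRE_FLOAT64               */
        return 0;
    }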
ibv_get_device_name(device->dev.ib_dev))); - - if (NULL != device->frags_free) { - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - mca_bcol_iboffload_dealloc_qps_resource_fn_t dealloc_resource = - mca_bcol_iboffload_component.qp_infos[qp_index].dealloc_resource; - if (NULL != dealloc_resource) { - dealloc_resource(qp_index, device); - } - } - - free(device->frags_free); - } - - if (NULL != device->mpool) { - IBOFFLOAD_VERBOSE(10, ("Mpool destroy - %p.\n", device->mpool)); - if (OMPI_SUCCESS != mca_mpool_base_module_destroy(device->mpool)) { - IBOFFLOAD_ERROR(("Device %s, failed to destroy mpool", - ibv_get_device_name(device->dev.ib_dev))); - } - } - - if (NULL != device->dummy_reg.mr) { - IBOFFLOAD_VERBOSE(10, ("Dummy memory MR unregister - %p.\n", device->dummy_reg.mr)); - if (OMPI_SUCCESS != - mca_bcol_iboffload_deregister_mr((void *) device, &device->dummy_reg.base)) { - IBOFFLOAD_ERROR(("Device %s: failed to unregister dummy memory MR.", - ibv_get_device_name(device->dev.ib_dev))); - } - } - - if (NULL != device->ib_cq) { - if (ibv_destroy_cq(device->ib_cq)) { - IBOFFLOAD_ERROR(("Device %s, failed to destroy CQ, errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - } - } - - if (NULL != device->ib_mq_cq) { - if (ibv_destroy_cq(device->ib_mq_cq)) { - IBOFFLOAD_ERROR(("Device %s, failed to destroy mq CQ, errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - } - } - - /* Release IB PD if we have one */ - if (NULL != device->ib_pd) { - if(ibv_dealloc_pd(device->ib_pd)){ - IBOFFLOAD_ERROR(("Device %s, failed to release PD, errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - } - } - - /* close the device */ - if (NULL != device->dev.ib_dev_context) { - if (ibv_close_device(device->dev.ib_dev_context)) { - IBOFFLOAD_ERROR(("Device %s " - ", failed to close the device, errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - } - } - - /* release memory */ - if (NULL != device->ports) { - free(device->ports); - } -} - -OBJ_CLASS_INSTANCE(mca_bcol_iboffload_device_t, - opal_list_item_t, - mca_bcol_iboffload_device_constructor, - mca_bcol_iboffload_device_destructor); - -int mca_bcol_iboffload_coll_supported(int op, int dtype, bcol_elem_type elem_type) -{ - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - return (IBV_M_DATA_TYPE_INVALID != cm->map_ompi_to_ib_dt[dtype]) && - (IBV_M_CALC_OP_INVALID != cm->map_ompi_to_ib_calcs[op]) && - (BCOL_SINGLE_ELEM_TYPE == elem_type); -} - -int mca_bcol_iboffload_coll_support_all_types(bcol_coll coll_name) -{ - return BCOL_ALLREDUCE ^ coll_name; -} - -/* Unload devices */ -static int iboffload_release_devices(void) -{ - int i; - mca_bcol_iboffload_device_t *device = NULL; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - opal_pointer_array_t *devs = &cm->devices; - - IBOFFLOAD_VERBOSE(10, ("Destroy all devices.\n")); - - for (i = 0; i < cm->num_devs; i++) { - device = opal_pointer_array_get_item(devs, i); - - IBOFFLOAD_VERBOSE(10, ("Device %s with index %d will be destroyed.\n", - ibv_get_device_name(device->dev.ib_dev), i)); - if (NULL != device) { - OBJ_RELEASE(device); - } - } - - IBOFFLOAD_VERBOSE(10, ("All devices were destroyed.\n")); - - opal_pointer_array_remove_all(devs); - OBJ_DESTRUCT(devs); - - /* release device list */ - /*ibv_free_device_list_compat(cm->ib_devs);*/ - ompi_ibv_free_device_list(cm->ib_devs); - cm->ib_devs = NULL; - - IBOFFLOAD_VERBOSE(10, ("All devices destroyed.\n")); - - 
return OMPI_SUCCESS; -} - -/* Create list of IB HCA that have active port */ -static int iboffload_load_devices(void) -{ - int num_devs = 0, i; - mca_bcol_iboffload_device_t *device = NULL; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Entering to iboffload_load_devices")); - - /* Get list of devices */ - /*cm->ib_devs = ibv_get_device_list_compat(&num_devs);*/ - cm->ib_devs = ompi_ibv_get_device_list(&num_devs); - if (0 == num_devs || NULL == cm->ib_devs) { - IBOFFLOAD_ERROR(("No IB devices found")); - /* No hca error*/ - opal_show_help("help-mpi-btl-base.txt", "btl:no-nics", true); - return OMPI_ERROR; - } - - cm->num_devs = num_devs; - - for (i = 0; i < num_devs; i++) { - device = OBJ_NEW(mca_bcol_iboffload_device_t); - if (NULL != device) { - opal_pointer_array_set_item(&cm->devices, i, (void *) device); - device->dev.ib_dev = cm->ib_devs[i]; - - IBOFFLOAD_VERBOSE(10, ("Device %s with index %d was appended.\n", - ibv_get_device_name(device->dev.ib_dev), i)); - } - } - - if (0 == opal_pointer_array_get_size(&cm->devices)) { - /* No relevand devices were found, return error */ - IBOFFLOAD_ERROR(("No active devices found.\n")); - - return OMPI_ERROR; - } - - return OMPI_SUCCESS; -} - -static void map_ompi_to_ib_dtype(void) -{ - int dt; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - for (dt = 0; dt < OMPI_DATATYPE_MAX_PREDEFINED; ++dt) { - cm->map_ompi_to_ib_dt[dt] = IBV_M_DATA_TYPE_INVALID; - } - - GET_IB_DTYPE_BY_CTYPE(char, true, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_SIGNED_CHAR]); - - GET_IB_DTYPE_BY_CTYPE(short, true, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_SHORT]); - GET_IB_DTYPE_BY_CTYPE(int, true, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_INT]); - GET_IB_DTYPE_BY_CTYPE(long, true, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_LONG]); - GET_IB_DTYPE_BY_CTYPE(long long, true, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_LONG_LONG]); - GET_IB_DTYPE_BY_CTYPE(float, false, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_FLOAT]); - GET_IB_DTYPE_BY_CTYPE(double, false, cm->map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_DOUBLE]); - - /* Check (only in DEBUG mode) if size of double equal to 64 bit */ - assert(8 == sizeof(double)); -} - -static void map_ompi_to_ib_op_type(void) -{ - int op; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - for (op = 0; op < OMPI_OP_NUM_OF_TYPES; ++op) { - cm->map_ompi_to_ib_calcs[op] = IBV_M_CALC_OP_INVALID; - } - - cm->map_ompi_to_ib_calcs[OMPI_OP_MAX] = IBV_M_CALC_OP_MAX; - cm->map_ompi_to_ib_calcs[OMPI_OP_MIN] = IBV_M_CALC_OP_MIN; - cm->map_ompi_to_ib_calcs[OMPI_OP_SUM] = IBV_M_CALC_OP_ADD; - - cm->map_ompi_to_ib_calcs[OMPI_OP_LAND] = IBV_M_CALC_OP_LAND; - cm->map_ompi_to_ib_calcs[OMPI_OP_BAND] = IBV_M_CALC_OP_BAND; - cm->map_ompi_to_ib_calcs[OMPI_OP_LOR] = IBV_M_CALC_OP_LOR; - cm->map_ompi_to_ib_calcs[OMPI_OP_BOR] = IBV_M_CALC_OP_BOR; - cm->map_ompi_to_ib_calcs[OMPI_OP_LXOR] = IBV_M_CALC_OP_LXOR; - cm->map_ompi_to_ib_calcs[OMPI_OP_BXOR] = IBV_M_CALC_OP_BXOR; -} - -/* - * Open the component - */ -static int iboffload_open(void) -{ - int rc; - - /* local variables */ - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n")); - - (void) mca_bcol_iboffload_verify_params(); - - cm->super.priority = 100; - cm->super.n_net_contexts = 0; - cm->super.network_contexts = NULL; - - OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t); - - /* construct lists */ - OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t); - rc = 
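map_ompi_to_ib_dtype() and map_ompi_to_ib_op_type() above first default every table slot to INVALID and then whitelist the handful of supported entries, which reduces the later support query to two O(1) table probes. A reduced sketch under hypothetical enums and table sizes:

    #include <stdbool.h>

    enum { DT_TABLE = 8, OP_TABLE = 8 };       /* hypothetical table sizes */

    enum wire_dt { WIRE_DT_INVALID = 0, WIRE_DT_INT32, WIRE_DT_FLOAT64 };
    enum wire_op { WIRE_OP_INVALID = 0, WIRE_OP_ADD, WIRE_OP_MAX };

    /* Stand-ins for OMPI's predefined datatype/op indexes. */
    enum { DT_INT = 1, DT_DOUBLE = 2, OP_SUM = 1, OP_MAXIMUM = 2 };

    static enum wire_dt dt_map[DT_TABLE];      /* zero-init: all invalid */
    static enum wire_op op_map[OP_TABLE];

    static void build_maps(void)
    {
        /* Whitelist only what the offload engine can compute. */
        dt_map[DT_INT]     = WIRE_DT_INT32;
        dt_map[DT_DOUBLE]  = WIRE_DT_FLOAT64;
        op_map[OP_SUM]     = WIRE_OP_ADD;
        op_map[OP_MAXIMUM] = WIRE_OP_MAX;
    }

    /* Same shape as mca_bcol_iboffload_coll_supported(): two probes. */
    static bool coll_supported(int dt, int op)
    {
        return WIRE_DT_INVALID != dt_map[dt] &&
               WIRE_OP_INVALID != op_map[op];
    }

    int main(void)
    {
        build_maps();
        return coll_supported(DT_INT, OP_SUM) ? 0 : 1;
    }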
opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10); - if (OMPI_SUCCESS != rc) { - goto close_device; - } - - /* Check MCA parameters */ - if (0 != (mca_bcol_iboffload_component.exchange_tree_order & (mca_bcol_iboffload_component.exchange_tree_order - 1))) { - IBOFFLOAD_ERROR(("Warning: ibcol_iboffload_exchange_tree_order is %d which is not a power of 2, setting it to 2", - mca_bcol_iboffload_component.exchange_tree_order)); - mca_bcol_iboffload_component.exchange_tree_order = 2; - } - - /* Pasha: Since we do not have max inline check like in openib, - I will put some dummy check here. All mlnx devices support at least 512b */ - if (mca_bcol_iboffload_component.max_inline_data > 512) { - IBOFFLOAD_ERROR(("Warning the inline %d, is to big and unsupported", - mca_bcol_iboffload_component.max_inline_data)); - rc = OMPI_ERROR; - goto close_device; - } - - /* Register the progress function */ - rc = opal_progress_register(mca_bcol_iboffload_component_progress); - if (OMPI_SUCCESS != rc) { - IBOFFLOAD_ERROR(("Failed to register the progress function" - " for iboffload component.\n")); - goto close_device; - } - - map_ompi_to_ib_dtype(); - map_ompi_to_ib_op_type(); - - /* The init_done set to true on first component usage */ - cm->init_done = false; - - return OMPI_SUCCESS; - -close_device: - OBJ_DESTRUCT(&cm->devices); - OBJ_DESTRUCT(&cm->recv_wrs.lock); - return rc; -} - -/* - * Close the component - */ -static int iboffload_close(void) -{ - int rc; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Destroy component free lists.\n")); - - if (true == cm->init_done) { - OBJ_DESTRUCT(&cm->tasks_free); - OBJ_DESTRUCT(&cm->collreqs_free); - OBJ_DESTRUCT(&cm->collfrags_free); - OBJ_DESTRUCT(&cm->calc_tasks_free); - } - - /* Unregister the progress function */ - rc = opal_progress_unregister(mca_bcol_iboffload_component_progress); - if (OMPI_SUCCESS != rc) { - IBOFFLOAD_ERROR(("Failed to unregister the progress function" - " for iboffload component.\n")); - } - - rc = iboffload_release_devices(); - if (OMPI_SUCCESS != rc) { - return rc; - } - - if (NULL != cm->receive_queues) { - free(cm->receive_queues); - } - - OBJ_DESTRUCT(&cm->recv_wrs.lock); - - IBOFFLOAD_VERBOSE(10, ("The component closed.\n")); - - return OMPI_SUCCESS; -} - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_bcol_iboffload_init_query(bool enable_progress_threads, - bool enable_mpi_threads) -{ - int rc; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Init Iboffload component.\n")); - - /* Get list of HCAs and ports */ - rc = iboffload_load_devices(); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("Load devices error.\n")); - goto unload_devices; - } - - /* Setup the BSRQ QP's based on the final value of - mca_bcol_iboffload_component.receive_queues. */ - rc = setup_qps(); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("QPs setup error.\n")); - goto unload_devices; - } - - cm->super.collm_init_query = mca_bcol_iboffload_dummy_init_query; - - return OMPI_SUCCESS; - - /* done */ -unload_devices: - IBOFFLOAD_ERROR(("Release devices: an error occured.\n")); - - iboffload_release_devices(); - - return rc; -} - -static int32_t atoi_param(char *param, int32_t dflt) -{ - if (NULL == param || '\0' == param[0]) { - return dflt ? 
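The exchange_tree_order check above uses the classic x & (x - 1) trick: the expression is zero exactly when x has a single bit set. A small self-contained version of the same sanitize-or-fall-back pattern:

    #include <stdio.h>

    static int is_power_of_two(unsigned x)
    {
        return x != 0 && 0 == (x & (x - 1)); /* exactly one bit set */
    }

    /* Mirror of the MCA-parameter fixup: warn about a degenerate value
     * and fall back to a safe default instead of aborting. */
    static unsigned sanitize_tree_order(unsigned requested, unsigned dflt)
    {
        if (!is_power_of_two(requested)) {
            fprintf(stderr, "tree order %u is not a power of 2, using %u\n",
                    requested, dflt);
            return dflt;
        }
        return requested;
    }

    int main(void)
    {
        printf("%u\n", sanitize_tree_order(6, 2)); /* warns, prints 2 */
        printf("%u\n", sanitize_tree_order(8, 2)); /* prints 8        */
        return 0;
    }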
dflt : 1; - } - - return atoi(param); -} - -static int setup_qps(void) -{ - int ret = OMPI_SUCCESS, qp = 0; - int rd_num = 0, rd_low = 0, size = 0, - rd_win = 0, rd_rsv = 0, sd_max = 0; - - mca_bcol_iboffload_qp_type_t type = 0; - - char **queues = NULL, **params = NULL; - - queues = opal_argv_split(mca_bcol_iboffload_component.receive_queues, ':'); - if (0 == opal_argv_count(queues)) { - opal_show_help("help-mpi-btl-openib.txt", - "no qps in receive_queues", true, - ompi_process_info.nodename, - mca_bcol_iboffload_component.receive_queues); - - ret = OMPI_ERROR; - - goto exit; - } - - while (queues[qp] != NULL) { - if (0 == strncmp("P,", queues[qp], 2)) { - type = MCA_BCOL_IBOFFLOAD_PP_QP; - } else if (0 == strncmp("S,", queues[qp], 2)) { - type = MCA_BCOL_IBOFFLOAD_SRQ_QP; - } else if (0 == strncmp("X,", queues[qp], 2)) { -#if HAVE_XRC - type = MCA_BCOL_IBOFFLOAD_XRC_QP; -#else - opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true, - ompi_process_info.nodename, - mca_bcol_iboffload_component.receive_queues); - ret = OMPI_ERR_NOT_AVAILABLE; - goto exit; -#endif - } else { - opal_show_help("help-mpi-btl-openib.txt", - "invalid qp type in receive_queues", true, - ompi_process_info.nodename, - mca_bcol_iboffload_component.receive_queues, - queues[qp]); - - ret = OMPI_ERR_BAD_PARAM; - - goto exit; - } - - ++qp; - } - - mca_bcol_iboffload_component.num_qps = MCA_BCOL_IBOFFLOAD_QP_LAST; - - qp = 0; -#define P(N) (((N) > count) ? NULL : params[(N)]) - while (NULL != queues[qp]) { - int count; - - params = opal_argv_split_with_empty(queues[qp], ','); - count = opal_argv_count(params); - - if ('P' == params[0][0]) { - if (count < 3 || count > 6) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - ompi_process_info.nodename, queues[qp]); - - ret = OMPI_ERR_BAD_PARAM; - - goto exit; - } - - size = atoi_param(P(1), 0); - - rd_num = atoi_param(P(2), 256); - - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); - rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); - - - if ((rd_num - rd_low) > rd_win) { - opal_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", - true, rd_win, rd_num - rd_low); - } - } else { - if (count < 3 || count > 5) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid srq specification", true, - ompi_process_info.nodename, queues[qp]); - - ret = OMPI_ERR_BAD_PARAM; - - goto exit; - } - - size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - sd_max = atoi_param(P(4), rd_low / 4); - - IBOFFLOAD_VERBOSE(10, ("srq: rd_num is %d rd_low is %d sd_max is %d", - rd_num, rd_low, sd_max)); - - } - - if (rd_num <= rd_low) { - opal_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", - true, ompi_process_info.nodename, queues[qp]); - ret = OMPI_ERR_BAD_PARAM; - - goto exit; - } - - opal_argv_free(params); - - ++qp; - } - - params = NULL; - - for (qp = 0; qp < MCA_BCOL_IBOFFLOAD_QP_LAST; ++qp) { - mca_bcol_iboffload_component.qp_infos[qp].qp_index = qp; - - mca_bcol_iboffload_component.qp_infos[qp].type = type; - mca_bcol_iboffload_component.qp_infos[qp].size = size; - - mca_bcol_iboffload_component.qp_infos[qp].rd_num = rd_num; - mca_bcol_iboffload_component.qp_infos[qp].rd_low = rd_low; - - mca_bcol_iboffload_component.qp_infos[qp].rd_pp_win = rd_num - rd_low; - - if (MCA_BCOL_IBOFFLOAD_PP_QP == type) { - 
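The parsing pass above splits receive_queues twice: once on ':' to get one spec per QP, then each spec on ',' with atoi_param() supplying defaults for missing trailing fields. The sketch below reproduces that two-level parse with strtok_r; note that, unlike opal_argv_split_with_empty, strtok_r skips genuinely empty fields, so this is only an approximation, and the spec string is a hypothetical example.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Default-aware atoi, like atoi_param(): missing fields keep dflt. */
    static int field_or_default(const char *s, int dflt)
    {
        return (NULL == s || '\0' == s[0]) ? dflt : atoi(s);
    }

    int main(void)
    {
        char spec[] = "P,65536,256:S,128000,256,192";
        char *sp1, *qp = strtok_r(spec, ":", &sp1);

        while (NULL != qp) {
            char *sp2;
            char type = qp[0];
            (void) strtok_r(qp, ",", &sp2);                  /* skip type tag */
            int size   = field_or_default(strtok_r(NULL, ",", &sp2), 0);
            int rd_num = field_or_default(strtok_r(NULL, ",", &sp2), 256);
            int rd_low = field_or_default(strtok_r(NULL, ",", &sp2),
                                          rd_num - rd_num / 4); /* 3/4 rule  */

            printf("type=%c size=%d rd_num=%d rd_low=%d\n",
                   type, size, rd_num, rd_low);
            qp = strtok_r(NULL, ":", &sp1);
        }
        return 0;
    }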
mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; - mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; - } else { - mca_bcol_iboffload_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; - } - - if (NULL != setup_qps_fn[qp]) { - setup_qps_fn[qp](&mca_bcol_iboffload_component.qp_infos[qp]); - } - } - -exit: - if (NULL != params) { - opal_argv_free(params); - } - - if (NULL != queues) { - opal_argv_free(queues); - } - - return ret; -} - -static int progress_pending_collfrags(mca_bcol_iboffload_module_t *iboffload) -{ - mca_bcol_iboffload_collfrag_t *pending_collfrag; - int rc, size = opal_list_get_size(&iboffload->collfrag_pending); - - IBOFFLOAD_VERBOSE(10, ("Calling progress_pending_collfrags")); - - do { - pending_collfrag = (mca_bcol_iboffload_collfrag_t *) - opal_list_remove_first(&iboffload->collfrag_pending); - - IBOFFLOAD_VERBOSE(10, ("Get pending_collfrag - %p, iboffload - %p, " - "pending list size - %d.", pending_collfrag, iboffload, - opal_list_get_size(&iboffload->collfrag_pending))); - - /* Return back coll frag to coll request opal_list */ - opal_list_append(&pending_collfrag->coll_full_req->work_requests, - (opal_list_item_t *) pending_collfrag); - - rc = pending_collfrag->coll_full_req->progress_fn - (iboffload, pending_collfrag->coll_full_req); - if (OPAL_UNLIKELY(BCOL_FN_STARTED != rc && OMPI_SUCCESS != rc)) { - return OMPI_ERROR; - } - } while (--size > 0); - - return OMPI_SUCCESS; -} - - -/** - * Test - if we finished with the coll fragment descriptor, - * and free all resouces if so. - **/ -int -mca_bcol_iboffload_free_tasks_frags_resources( - mca_bcol_iboffload_collfrag_t *collfrag, - ompi_free_list_t *frags_free) -{ - int rc; - - mca_bcol_iboffload_task_t *task = collfrag->tasks_to_release; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - /* Support for multiple frags we will add later - * n_outstanding_frags = coll_req->n_frags_sent - coll_req->n_frag_net_complete; */ - - while (NULL != task) { - /* Return frag (is the reference counter is zero)*/ - rc = release_frags_on_task(task, frags_free); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - /* Return task: if the pointer is NULL => we assume the task - is a member of the common task list (tasks_free) */ - if (NULL == task->task_list) { - OMPI_FREE_LIST_RETURN_MT(&cm->tasks_free, - (ompi_free_list_item_t *) task); - } else { - OMPI_FREE_LIST_RETURN_MT(task->task_list, - (ompi_free_list_item_t *) task); - } - - task = task->next_task; - } - - return OMPI_SUCCESS; -} - -static void fatal_error(char *mesg) -{ - IBOFFLOAD_ERROR(("FATAL ERROR: %s", mesg)); - ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_INTERN); -} - -#define RELEASE_COLLFRAG(cf) \ - do { \ - opal_list_remove_item(&(cf)->coll_full_req->work_requests, \ - (opal_list_item_t *) (cf)); \ - if (&(cf)->coll_full_req->first_collfrag != (cf)) { \ - OMPI_FREE_LIST_RETURN_MT(&mca_bcol_iboffload_component.collfrags_free, \ - (ompi_free_list_item_t *) (cf)); \ - } \ - } while (0) - -#define COLLFRAG_IS_DONE(cf) ((cf)->complete && (cf)->n_sends_completed == (cf)->n_sends) - -/* Pasha: Need to modify the code to progress pending queue only if relevant -* resource was released */ -#define PROGRESS_PENDING_COLLFRAG(cf) \ - if (OPAL_UNLIKELY(opal_list_get_size(&(cf)->coll_full_req->module->collfrag_pending) > 0)) { \ - int rc; \ - IBOFFLOAD_VERBOSE(10, ("Calling for PROGRESS_PENDING_COLLFRAG")); \ - rc = progress_pending_collfrags((cf)->coll_full_req->module); \ - if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { \ 
- fatal_error("failed to progress_pending_collfrags\n"); \ - return 0; \ - } \ - } - - -static inline __opal_attribute_always_inline__ int - handle_collfrag_done(mca_bcol_iboffload_collfrag_t *coll_frag, - mca_bcol_iboffload_collreq_t *coll_request, - mca_bcol_iboffload_device_t *device) -{ - int rc; - - if (COLLFRAG_IS_DONE(coll_frag)) { - IBOFFLOAD_VERBOSE(10, ("Coll frag - %p already done.\n", coll_frag)); - - coll_request->n_frag_net_complete++; - IBOFFLOAD_VERBOSE(10, ("Free tasks resourse.\n")); - /* Check if we are done with this coll_frag and release resources if so. */ - rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_frag, device->frags_free); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("mca_bcol_iboffload_free_tasks_frags_resources FAILED")); - fatal_error("Failed to mca_bcol_iboffload_free_tasks_frags_resources\n"); - return -1; - } - - BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(coll_request->module, coll_frag->mq_index, coll_frag->mq_credits); - - RELEASE_COLLFRAG(coll_frag); - - PROGRESS_PENDING_COLLFRAG(coll_frag); - - IBOFFLOAD_VERBOSE(10, ("Alg %d: user_handle_freed - %d, n_frag_mpi_complete - %d, " - "n_fragments- %d, n_frag_net_complete - %d, n_fragments - %d.\n", - coll_frag->alg, - coll_request->user_handle_freed, - coll_request->n_frag_mpi_complete, - coll_request->n_fragments, - coll_request->n_frag_net_complete, - coll_request->n_fragments)); - - /* check for full message completion */ - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - } - - IBOFFLOAD_VERBOSE(10, ("Exit with success.\n")); - - return 0; -} - -/* - * IBOFFLOAD component progress. - */ - -static int progress_one_device(mca_bcol_iboffload_device_t *device) -{ - int ne, rc, count = 0; - - mca_bcol_iboffload_collfrag_t *coll_frag; - mca_bcol_iboffload_collreq_t *coll_request; - - struct ibv_wc wc; - memset(&wc, 0, sizeof(struct ibv_wc)); - - /* - * poll for collective completion - does not mean resources can - * be freed, as incomplete network level sends may still be pending - */ - - /* Poll for completion on completion on wait MQEs */ - if(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc))) { - do { - if (OPAL_UNLIKELY(0 > ne)) { - IBOFFLOAD_ERROR(("Device %s: " - "failed to poll MQ completion queue\n", - ibv_get_device_name(device->dev.ib_dev))); - fatal_error("failed to poll MQ completion queue\n"); - return count; - } - - if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) { - IBOFFLOAD_ERROR(("Device %s: " - "the completion with error on wait was gotten, status %d, opcode %d, " - "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev), - wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id)); - fatal_error("wc.status \n"); - return count; - } - - IBOFFLOAD_VERBOSE(10, ("The MQ completion was polled.\n")); - - ++count; - - /* get pointer to mca_bcol_iboffload_collfrag_t */ - coll_frag = (mca_bcol_iboffload_collfrag_t*) - (uint64_t) (uintptr_t) wc.wr_id; - - /* Only last MQ task of collective frag - sends completion signal, so if we got it => - all MQEs were done. 
*/ - coll_frag->complete = true; - - IBOFFLOAD_VERBOSE(10, ("MQ completion for algorithm %d coll_frag_addr %p ml buffer index %d", - coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index)); - - /* full request descriptor */ - coll_request = coll_frag->coll_full_req; - - coll_request->n_frag_mpi_complete++; - - /* - * at this stage all receives have been completed, so - * unpack the data to user buffer, the resources will be released when we will done with all - * element in the task list - */ - - if (NULL != coll_request->completion_cb_fn) { - if (OMPI_SUCCESS != - coll_request->completion_cb_fn(coll_frag)) { - fatal_error("coll_request->completion_cb_fn\n"); - return count; - } - } - - if (coll_request->n_frag_mpi_complete == - coll_request->n_fragments) { - OPAL_ATOMIC_SWAP_PTR(&coll_request->super.reg_complete, REQUEST_COMPLETED); - IBOFFLOAD_VERBOSE(10, ("After request completion.\n")); - } - - rc = handle_collfrag_done(coll_frag, coll_request, device); - if (0 != rc) { - return count; - } - } while(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc))); - - return count; - } - - /* poll the send completion queue */ - do { - ne = ibv_poll_cq(device->ib_cq, 1, &wc); - if (0 < ne) { - if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) { - IBOFFLOAD_ERROR(("Device %s, " - "the completion with error on send was gotten, status %d, opcode %d, " - "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev), - wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id)); - -#if OPAL_ENABLE_DEBUG - { - mca_bcol_iboffload_module_t *iboffload; - int i, qp_index, num_qps = mca_bcol_iboffload_component.num_qps; - - coll_frag = (mca_bcol_iboffload_collfrag_t*) - (uint64_t) (uintptr_t) wc.wr_id; - - iboffload = coll_frag->coll_full_req->module; - - for (i = 0; i < iboffload->num_endpoints; ++i) { - mca_bcol_iboffload_endpoint_t *ep = iboffload->endpoints[i]; - - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - if (NULL != ep->qps[qp_index].qp->lcl_qp && - wc.qp_num == ep->qps[qp_index].qp->lcl_qp->qp_num) { - IBOFFLOAD_ERROR(("Module - %p, coll_frag - %p, " - "destination %d, qp index - %d.", - iboffload, coll_frag, i, qp_index)); - } - } - } - } -#endif - fatal_error("Failed to ibv_poll_cq\n"); - return count; - } - - ++count; - - /* get pointer to mca_bcol_iboffload_collfrag_t */ - coll_frag = (mca_bcol_iboffload_collfrag_t*) - (uint64_t) (uintptr_t) wc.wr_id; - - /* update the number of completed sends */ - coll_frag->n_sends_completed++; - - IBOFFLOAD_VERBOSE(10, ("Send CQ completion for algorithm %d coll_frag_addr %p ml buffer index %d", - coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index)); - - IBOFFLOAD_VERBOSE(10, ("Alg %d coll_frag_addr %p: n_sends_completed - %d, n_sends - %d.\n", - coll_frag->alg, (void *)coll_frag, - coll_frag->n_sends_completed, - coll_frag->n_sends)); - - assert(coll_frag->n_sends_completed <= coll_frag->n_sends); - - /* full message descriptor */ - coll_request = coll_frag->coll_full_req; - - /* check to see if all sends are complete from the network - * perspective */ - rc = handle_collfrag_done(coll_frag, coll_request, device); - if (0 != rc) { - return count; - } - } else if (OPAL_UNLIKELY(0 > ne)) { - IBOFFLOAD_ERROR(("Device %s: " - "failed to poll send completion queue\n", - ibv_get_device_name(device->dev.ib_dev))); - fatal_error("failed to poll send completion queue\n"); - return count; - } - } while (0 != ne); - - return count; -} - -int mca_bcol_iboffload_component_progress(void) -{ - int i, 
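progress_one_device() above drains each CQ one entry at a time and recovers its collfrag pointer from wc.wr_id, where it was stashed at post time. Here is a reduced drain loop against the standard verbs API; the frag bookkeeping is a hypothetical stub, and error handling is trimmed to the two cases the component checks.

    #include <stdint.h>
    #include <infiniband/verbs.h>

    struct frag { int id; };                      /* hypothetical bookkeeping */
    static void frag_done(struct frag *f) { (void) f; }

    /* Drain one CQ until it is empty; returns completions handled, -1 on
     * error.  wr_id carries a pointer round-tripped through uintptr_t,
     * exactly as the component stores its collfrag when posting. */
    static int drain_cq(struct ibv_cq *cq)
    {
        struct ibv_wc wc;
        int ne, count = 0;

        while (0 != (ne = ibv_poll_cq(cq, 1, &wc))) {
            if (ne < 0) {
                return -1;                        /* CQ read failed         */
            }
            if (IBV_WC_SUCCESS != wc.status) {
                return -1;                        /* completion with error  */
            }
            frag_done((struct frag *) (uintptr_t) wc.wr_id);
            ++count;
        }
        return count;                             /* CQ drained             */
    }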
count = 0; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - opal_pointer_array_t *devs = &cm->devices; - - int devices_count = cm->num_devs; - - for(i = 0; i < devices_count; ++i) { - mca_bcol_iboffload_device_t *device = - opal_pointer_array_get_item(devs, i); - - if (OPAL_LIKELY(device->activated)) { - count += progress_one_device(device); - } - } - - return count; -} - -#if OPAL_ENABLE_DEBUG /* debug code */ -int task_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task) -{ - int i, j, num_qps = mca_bcol_iboffload_component.num_qps; - for (i = 0; i < iboffload->num_endpoints; i++) { - for (j = 0; j < num_qps; j++) { - if (task->post.qp == iboffload->endpoints[i]->qps[j].qp->lcl_qp) { - return i; - } - } - } - - return -1; /* not found ! */ -} - -int wait_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task) -{ - int i, j; - for (i = 0; i < iboffload->num_endpoints; i++) { - for (j = 0; j < IBOFFLOAD_CQ_LAST; j++) { - if (task->wait.cq == iboffload->endpoints[i]->recv_cq[j]) { - return i; - } - } - } - - return -1; /* not found ! */ -} - -#endif /* debug code */ diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_device.h b/ompi/mca/bcol/iboffload/bcol_iboffload_device.h deleted file mode 100644 index a7503df4f0..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_device.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_DEVICE_H -#define MCA_BCOL_IBOFFLOAD_DEVICE_H - -#include "ompi_config.h" - -#include -#include - -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" - -#define BCOL_IBOFFLOAD_DUMMY_MEM_SIZE 1 - -BEGIN_C_DECLS - -/* Device OBJ */ -struct mca_bcol_iboffload_device_t { - opal_list_item_t super; - - bool activated; - - struct ompi_common_ofacm_base_dev_desc_t dev; - struct ibv_pd *ib_pd; - struct ibv_device_attr ib_dev_attr; - - int num_act_ports; - - struct mca_bcol_iboffload_port_t *ports; - struct ibv_cq *ib_cq; - - /* CQ for MQs of all iboffload modules on this device */ - struct ibv_cq *ib_mq_cq; - - /* The free list of registered buffers - * since the registration depends on PD, it is - * most resonable place to keep the frags */ - ompi_free_list_t *frags_free; - mca_mpool_base_module_t *mpool; - - /* netowrk context */ - bcol_base_network_context_t *net_context; - - /* We keep dummy frags for all QPs on each device, - possibly some of QPs don't need it but anyway we distribute dummy - for them. All dummies point to a same byte of memory. */ - mca_bcol_iboffload_frag_t dummy_frags[MCA_BCOL_IBOFFLOAD_QP_LAST]; - - /* Registred memory for the dummy frags */ - char dummy_mem[BCOL_IBOFFLOAD_DUMMY_MEM_SIZE]; - - /* Registration info of the dummy memory */ - mca_bcol_iboffload_reg_t dummy_reg; -}; - -typedef struct mca_bcol_iboffload_device_t mca_bcol_iboffload_device_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_device_t); - -END_C_DECLS - -#endif /* MCA_BCOL_IBOFFLOAD_DEVICE_H */ - diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.c b/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.c deleted file mode 100644 index 50d0eeeb8c..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
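mca_bcol_iboffload_component_progress() above follows the runtime's progress-engine contract: the component registers one callback via opal_progress_register() and returns the amount of work it retired, and the core loop sums those counts across components. A minimal sketch of that contract, with hypothetical names:

    #include <stdio.h>

    #define MAX_CBS 8

    typedef int (*progress_cb_t)(void *ctx);   /* returns items handled */

    static progress_cb_t cbs[MAX_CBS];
    static void         *ctxs[MAX_CBS];
    static int           n_cbs;

    /* Analogue of opal_progress_register(): remember the callback. */
    static int progress_register(progress_cb_t cb, void *ctx)
    {
        if (n_cbs >= MAX_CBS) {
            return -1;
        }
        cbs[n_cbs] = cb;
        ctxs[n_cbs] = ctx;
        n_cbs++;
        return 0;
    }

    /* One engine pass: poll every source, report total work retired. */
    static int progress_all(void)
    {
        int count = 0;
        for (int i = 0; i < n_cbs; i++) {
            count += cbs[i](ctxs[i]);
        }
        return count;
    }

    static int demo_cb(void *ctx)
    {
        (void) ctx;
        return 1;                               /* pretend one completion */
    }

    int main(void)
    {
        progress_register(demo_cb, NULL);
        printf("handled %d\n", progress_all()); /* handled 1 */
        return 0;
    }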
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include - -#include "ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/common/ofacm/connect.h" - -#include "opal/threads/mutex.h" -#include "opal/class/opal_object.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_endpoint.h" - -static void mca_bcol_iboffload_endpoint_construct(mca_bcol_iboffload_endpoint_t *ep) -{ - ep->iboffload_module = NULL; - ep->ibnet_proc = NULL; - - ep->qps = (mca_bcol_iboffload_endpoint_qp_t *) - calloc(mca_bcol_iboffload_component.num_qps, - sizeof(mca_bcol_iboffload_endpoint_qp_t)); - - ep->index = 0; - OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t); - - memset(ep->recv_cq, 0, IBOFFLOAD_CQ_LAST * sizeof(ep->recv_cq[0])); - memset(&ep->qp_config, 0, sizeof(ompi_common_ofacm_base_qp_config_t)); - - ep->cpc_context = NULL; - - memset(&ep->remote_zero_rdma_addr, 0, sizeof(mca_bcol_iboffload_rdma_info_t)); - memset(&ep->remote_rdma_block, 0, sizeof(mca_bcol_iboffload_rem_rdma_block_t)); - - ep->need_toset_remote_rdma_info = false; -} - -static void mca_bcol_iboffload_endpoint_destruct(mca_bcol_iboffload_endpoint_t *ep) -{ - int qp_index, num_qps, i; - ompi_free_list_item_t *item; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - num_qps = cm->num_qps; - - IBOFFLOAD_VERBOSE(10, ("Destruct: ep - %p, ep->index - %d", ep, ep->index)); - - if (NULL != ep->qps) { - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - do { - item = (ompi_free_list_item_t *) - opal_list_remove_first(&ep->qps[qp_index].preposted_frags); - if(OPAL_LIKELY(NULL != item)) { - OMPI_FREE_LIST_RETURN_MT(&ep->device->frags_free[qp_index], item); - } - } while (NULL != item); - - OBJ_DESTRUCT(&ep->qps[qp_index].preposted_frags); - } - - free(ep->qps); - } - - OBJ_DESTRUCT(&ep->endpoint_lock); - OBJ_DESTRUCT(&ep->pending_frags); - - /* If the CPC has an endpoint_finalize function, call it */ - if (NULL != ep->endpoint_cpc->cbm_endpoint_finalize) { - ep->endpoint_cpc->cbm_endpoint_finalize(ep->cpc_context); - } - - for (i = 0; i < IBOFFLOAD_CQ_LAST; i++) { - if (NULL != ep->recv_cq[i]) { - if (ibv_destroy_cq(ep->recv_cq[i])) { - IBOFFLOAD_ERROR(("Endpoint %x " - ", failed to destroy CQ, errno says %s", - ep, strerror(errno))); - } - } - } -} - -OBJ_CLASS_INSTANCE(mca_bcol_iboffload_endpoint_t, - opal_list_item_t, - mca_bcol_iboffload_endpoint_construct, - mca_bcol_iboffload_endpoint_destruct); - -/* Pasha: Add some error message here */ - -/* - * Called when the CPC has established a connection on an endpoint - */ -static void mca_bcol_iboffload_endpoint_invoke_error(void *context) -{ - mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context; - IBOFFLOAD_ERROR(("Getting error on endpoint - %p!", endpoint)); -} - - -/* Pasha: Need to add more logic here */ -static void mca_bcol_iboffload_endpoint_cpc_complete(void *context) -{ - mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context; - - IBOFFLOAD_VERBOSE(10, ("Endpoint - %p for comm rank %d: CPC complete.\n", - endpoint, endpoint->iboffload_module->ibnet->super.group_list[endpoint->index])); - - if (OMPI_SUCCESS != - mca_bcol_iboffload_exchange_rem_addr(endpoint)) { - IBOFFLOAD_ERROR(("endpoint - %p, " - 
"remote addr exchange error.\n", endpoint)); - } - /* The connection is correctly setup. Now we can decrease the - event trigger. */ - opal_progress_event_users_decrement(); -} - -/* Vasily: Need to add more logic here */ -int mca_bcol_iboffload_endpoint_post_recvs(void *context) -{ - int qp_index, rc, num_qps; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - mca_bcol_iboffload_endpoint_t *endpoint = - (mca_bcol_iboffload_endpoint_t *) context; - - IBOFFLOAD_VERBOSE(10, ("endpoint - %p, post of %d recvs !", - endpoint, cm->qp_infos[0].rd_num)); - /* TODO Pasha - fix later */ - num_qps = cm->num_qps; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index, - cm->qp_infos[qp_index].rd_num); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - /* Pasha: Need to add more failure logic */ - IBOFFLOAD_ERROR(("Failed to prepost recv fragments " - "on qp index %d, return code - %d", - qp_index, rc)); - - return OMPI_ERROR; - } - } - - return OMPI_SUCCESS; -} - -/* The function go over each ibnet proc and creates endpoint for each one */ -int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup, - mca_bcol_iboffload_module_t *module) { - uint32_t i; - mca_bcol_iboffload_endpoint_t *ep; - - if (NULL == cgroup || NULL == module) { - IBOFFLOAD_ERROR(("Bad parameters for create endpoints function.")); - return OMPI_ERROR; - } - - module->num_endpoints = cgroup->num_procs; - module->endpoints = (mca_bcol_iboffload_endpoint_t **) - calloc(module->num_endpoints, - sizeof(mca_bcol_iboffload_endpoint_t *)); - if (NULL == module->endpoints) { - IBOFFLOAD_ERROR(("Error memory allocation for endpoints array" - ", errno says %s", strerror(errno))); - return OMPI_ERROR; - } - - IBOFFLOAD_VERBOSE(10, ("iboffload - %p, num of endpoints - %d.\n", - module, module->num_endpoints)); -/* Ishai: No need to open so many endpoints. 
-    /* Ishai: No need to open so many endpoints;
-       we are not talking with all procs */
-    for (i = 0; i < cgroup->num_procs; i++) {
-        ep = OBJ_NEW(mca_bcol_iboffload_endpoint_t);
-        /* check the qp memory allocation */
-        if (NULL == ep->qps) {
-            IBOFFLOAD_ERROR(("Failed to allocate memory for qps"));
-            return OMPI_ERROR;
-        }
-        /* init the new endpoint */
-        ep->index = i;
-        ep->iboffload_module = module;
-        /* saving the device for the destruction - the iboffload module
-           may not exist then */
-        ep->device = ep->iboffload_module->device;
-        ep->ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-            opal_pointer_array_get_item(cgroup->ibnet_procs, i);
-        if (NULL == ep->ibnet_proc) {
-            IBOFFLOAD_ERROR(("Failed to get the proc pointer for index %d", i));
-            return OMPI_ERROR;
-        }
-
-        if (OMPI_SUCCESS !=
-                mca_bcol_iboffload_endpoint_init(ep)) {
-            IBOFFLOAD_ERROR(("Failed to init endpoint - %p", ep));
-            return OMPI_ERROR;
-        }
-
-        IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, ep index - %d, iboffload - %p, "
-                               "cpc context - %p.\n", ep, ep->index,
-                               ep->iboffload_module, ep->cpc_context));
-
-        /* Add the new endpoint to the array of endpoints */
-        module->endpoints[i] = ep;
-    }
-
-    /* Pasha: Need to add better clean-up here */
-    return OMPI_SUCCESS;
-}
-
-static int config_qps(mca_bcol_iboffload_endpoint_t *ep)
-{
-    int qp_index;
-    int ret = OMPI_SUCCESS;
-
-    ompi_common_ofacm_base_qp_config_t *qp_config = &ep->qp_config;
-    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
-
-    qp_config->num_srqs = 0;
-    qp_config->srq_num = NULL;
-
-    qp_config->num_qps = cm->num_qps;
-
-    qp_config->init_attr = (struct ibv_qp_init_attr *)
-        calloc(qp_config->num_qps, sizeof(struct ibv_qp_init_attr));
-    if (NULL == qp_config->init_attr) {
-        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp init attributes"));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-
-        goto config_qps_exit;
-    }
-
-    qp_config->attr = (struct ibv_qp_attr *)
-        calloc(qp_config->num_qps, sizeof(struct ibv_qp_attr));
-    if (OPAL_UNLIKELY(NULL == qp_config->attr)) {
-        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp attributes"));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-
-        goto config_qps_exit;
-    }
-
-    /* we must specify that the qps are special */
-    qp_config->init_attr_mask = (uint32_t *)
-        calloc(qp_config->num_qps, sizeof(uint32_t));
-    if (OPAL_UNLIKELY(NULL == qp_config->init_attr_mask)) {
-        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp mask."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-
-        goto config_qps_exit;
-    }
-
-    /* qp_config->rtr_attr_mask = qp_config->rts_attr_mask = NULL; */
-
-    qp_config->rtr_attr_mask = (uint32_t *)
-        calloc(qp_config->num_qps, sizeof(uint32_t));
-    if (OPAL_UNLIKELY(NULL == qp_config->rtr_attr_mask)) {
-        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp rtr attributes mask."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-
-        goto config_qps_exit;
-    }
-
-    qp_config->rts_attr_mask = (uint32_t *)
-        calloc(qp_config->num_qps, sizeof(uint32_t));
-    if (OPAL_UNLIKELY(NULL == qp_config->rts_attr_mask)) {
-        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp rts attributes mask."));
-        ret = OMPI_ERR_OUT_OF_RESOURCE;
-
-        goto config_qps_exit;
-    }
-
-    for (qp_index = 0; qp_index < qp_config->num_qps; ++qp_index) {
-        mca_bcol_iboffload_config_qps_fn_t config_qp =
-            cm->qp_infos[qp_index].config_qp;
-
-        if (NULL != config_qp) {
-            config_qp(qp_index, ep, qp_config);
-        }
-    }
-
-config_qps_exit:
-    return ret;
-}
-
-/* This function is called only for endpoints in the
- * MCA_COMMON_OFACM_USER_CUSTOM state; the caller must hold
- * OPAL_THREAD_LOCK before calling it */
-int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t
*ep) -{ - int qp_index, cq_index, num_qps; - ompi_common_ofacm_base_module_t *cpc; - - mca_bcol_iboffload_device_t *device = ep->iboffload_module->device; - - mca_sbgp_ibnet_connection_group_info_t *cgroup = - &ep->iboffload_module->ibnet->cgroups[ep->iboffload_module->cgroup_index]; - - for (cq_index = 0; cq_index < IBOFFLOAD_CQ_LAST; cq_index++) { - if (OMPI_SUCCESS != - mca_bcol_iboffload_adjust_cq(device, &ep->recv_cq[cq_index])) { - IBOFFLOAD_ERROR(("Error creating CQ for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */ - return OMPI_ERROR; - } - } - - if (OPAL_UNLIKELY(OMPI_SUCCESS != config_qps(ep))) { - IBOFFLOAD_ERROR(("Error configure QPs for endpoint %x errno says %s", - ep, strerror(errno))); - return OMPI_ERROR; - } - - /* Adding here one more redirection in critical path. Need to think - * what is the best way to prevent it */ - - IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, rem port - %d", ep, - ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].id)); - - cpc = ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].local_cpc; - ep->endpoint_cpc = cpc; /* caching pointer to cpc */ - - if (NULL != cpc->cbm_endpoint_init) { - ep->cpc_context = cpc->cbm_endpoint_init( - ep->ibnet_proc->ompi_proc, - &ep->qp_config, - device->ib_pd, - ep->iboffload_module->subnet_id, - ep->iboffload_module->ibnet->group_id, - ep->iboffload_module->lid, - /* Remote lid of target module */ - ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].lid, - ep->index, /* user context index */ - (void *) ep, /* user context */ - cpc, - mca_bcol_iboffload_endpoint_cpc_complete, - mca_bcol_iboffload_endpoint_invoke_error, - mca_bcol_iboffload_endpoint_post_recvs); - - if (OPAL_UNLIKELY(NULL == ep->cpc_context)) { - IBOFFLOAD_ERROR(("Endpoint - %p, failed to init context", ep)); - /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */ - return OMPI_ERROR; - } - - /* Updating remote port info */ - num_qps = mca_bcol_iboffload_component.num_qps; - - ep->remote_info = &ep->cpc_context->remote_info; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - ep->qps[qp_index].qp = &ep->cpc_context->qps[qp_index]; - } - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.h b/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.h deleted file mode 100644 index 7a57b57a7b..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_endpoint.h +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
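config_qps() above funnels every allocation failure through a single exit label. The sketch below keeps that shape and, at the label, additionally frees whatever was already built - free(NULL) is a no-op, so no flags are needed; the struct names are hypothetical.

    #include <stdlib.h>

    struct qp_init_attr_stub { int placeholder; };   /* hypothetical */

    struct qp_cfg {
        struct qp_init_attr_stub *init_attr;
        unsigned *init_attr_mask;
        unsigned *rtr_attr_mask;
        unsigned *rts_attr_mask;
    };

    /* Every allocation jumps to one label on failure, so there is a
     * single cleanup path to audit. */
    static int config_arrays(struct qp_cfg *c, size_t n)
    {
        c->init_attr = calloc(n, sizeof(*c->init_attr));
        if (NULL == c->init_attr)      goto error;

        c->init_attr_mask = calloc(n, sizeof(*c->init_attr_mask));
        if (NULL == c->init_attr_mask) goto error;

        c->rtr_attr_mask = calloc(n, sizeof(*c->rtr_attr_mask));
        if (NULL == c->rtr_attr_mask)  goto error;

        c->rts_attr_mask = calloc(n, sizeof(*c->rts_attr_mask));
        if (NULL == c->rts_attr_mask)  goto error;

        return 0;                      /* caller owns all four arrays */

    error:
        free(c->rtr_attr_mask);
        free(c->init_attr_mask);
        free(c->init_attr);
        return -1;
    }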
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_ENDPOINT_H -#define MCA_BCOL_IBOFFLOAD_ENDPOINT_H - -#include "ompi_config.h" -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" - -#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h" - -#define BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) (ep)->ibnet_proc->use_port[(cgroup)->index] -#define BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep) (BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) - 1) - -BEGIN_C_DECLS - -struct mca_bcol_iboffload_endpoint_qp_t { - struct ompi_common_ofacm_base_qp_t *qp; - size_t ib_inline_max; - int32_t sd_wqe; /* Number of available send wqe entries */ - int32_t rd_wqe; /* Number of available recv wqe entries */ - opal_list_t preposted_frags; /* List of preposted frags */ - /* opal_mutex_t lock; */ /* Do I need lock here ? */ -}; - -typedef struct mca_bcol_iboffload_endpoint_qp_t mca_bcol_iboffload_endpoint_qp_t; - -enum { - IBOFFLOAD_CQ_SMALL_MESSAGES = 0, - IBOFFLOAD_CQ_SYNC, - IBOFFLOAD_CQ_LARGE_MESSAGES, - IBOFFLOAD_CQ_LAST -}; - -/* Endpoint object */ -struct mca_bcol_iboffload_endpoint_t { - opal_list_item_t super; - - /** BTL module that created this connection */ - mca_bcol_iboffload_module_t *iboffload_module; - - /** proc structure corresponding to endpoint */ - mca_sbgp_ibnet_proc_t *ibnet_proc; - - /** lock for concurrent access to endpoint state */ - opal_mutex_t endpoint_lock; - - /** Penging frag list */ - opal_list_t pending_frags; - - /** QPs information */ - mca_bcol_iboffload_endpoint_qp_t *qps; - - /** endpoint index on array */ - int32_t index; - - /** CQ for receive queues on this endpoint */ - struct ibv_cq *recv_cq[IBOFFLOAD_CQ_LAST]; - - /** QP configuration information */ - ompi_common_ofacm_base_qp_config_t qp_config; - - /** cpc context */ - ompi_common_ofacm_base_local_connection_context_t *cpc_context; - - /** caching pointer to remote info */ - ompi_common_ofacm_base_remote_connection_context_t *remote_info; - - /** caching pointer to cpc */ - ompi_common_ofacm_base_module_t *endpoint_cpc; - - /** The struct is used for zero RDMA with immediate - in some collectives, in barrier for example. */ - mca_bcol_iboffload_rdma_info_t remote_zero_rdma_addr; - mca_bcol_iboffload_rem_rdma_block_t remote_rdma_block; - - /** The pointer to device - In the destruction function - the iboffload module may not exist any more - caching the device */ - struct mca_bcol_iboffload_device_t *device; - - bool need_toset_remote_rdma_info; - - mca_bcol_iboffload_rdma_info_t remote_rdma_info[MAX_REMOTE_RDMA_INFO]; -}; -typedef struct mca_bcol_iboffload_endpoint_t mca_bcol_iboffload_endpoint_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_endpoint_t); - -/* Function declaration */ -int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep); - -static inline __opal_attribute_always_inline__ - int check_endpoint_state(mca_bcol_iboffload_endpoint_t *ep, - mca_bcol_base_descriptor_t *des, - opal_list_t *pending_list) -{ - int rc = OMPI_ERR_RESOURCE_BUSY; - - OPAL_THREAD_LOCK(&ep->cpc_context->context_lock); - /* Adding here one more redirection in critical path. Need to think - * what is the best way to prevent it */ - switch(ep->cpc_context->state) { - case MCA_COMMON_OFACM_CLOSED: - rc = ep->endpoint_cpc->cbm_start_connect(ep->cpc_context); - if (OMPI_SUCCESS == rc) { - rc = OMPI_ERR_RESOURCE_BUSY; - } - /* - * As long as we expect a message from the peer (in order - * to setup the connection) let the event engine pool the - * OOB events. 
Note: we increment it once peer active - * connection. - */ - opal_progress_event_users_increment(); - /* fall through */ - default: - /* opal_list_append(pending_list, (opal_list_item_t *)des); */ /* Vasily: will be uncomment later */ - break; - case MCA_COMMON_OFACM_FAILED: - rc = OMPI_ERR_UNREACH; - break; - case MCA_COMMON_OFACM_CONNECTED: - rc = OMPI_SUCCESS; - break; - } - - OPAL_THREAD_UNLOCK(&ep->cpc_context->context_lock); - return rc; -} - -int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup, - mca_bcol_iboffload_module_t *module); - -int mca_bcol_iboffload_endpoint_post_recvs(void *context); - -static inline __opal_attribute_always_inline__ int - mca_bcol_iboffload_prepost_recv( - mca_bcol_iboffload_endpoint_t *endpoint, - int qp_index, int num_to_prepost) -{ - mca_bcol_iboffload_prepost_qps_fn_t prepost_recv = - mca_bcol_iboffload_component.qp_infos[qp_index].prepost_recv; - if (NULL != prepost_recv) { - return prepost_recv(endpoint, qp_index, num_to_prepost); - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int - mca_bcol_iboffload_post_ml_scatter_recv_frag( - int qp_index, uint32_t dest_rank, - int nitems, struct iovec *buff_iovec, - uint32_t lkey, - struct ibv_sge *sg_entries, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_module_t *iboffload) -{ - int ret, start_wr_index; - struct ibv_recv_wr *recv_wr, *recv_bad; - int i; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank]; - - mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; - mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device; - - IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d", - (void *) endpoint, qp_index)); - - /* make sure that we do not overrun number of rd_wqe */ - if (0 >= endpoint->qps[qp_index].rd_wqe) { - IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d", - endpoint->qps[qp_index].rd_wqe)); - - return 0; - } - - OPAL_THREAD_LOCK(&recv_wrs->lock); - - /* Calculate start index in array - * of pre-allocated work requests */ - start_wr_index = cm->qp_infos[qp_index].rd_num - 1; - recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, " - "start index of WRs - %d", (void *) endpoint, - qp_index, start_wr_index)); - - for (i = 0; i < nitems; i++) { - sg_entries[i].length = buff_iovec[i].iov_len; - sg_entries[i].addr = (uint64_t)buff_iovec[i].iov_base; - sg_entries[i].lkey = lkey; - - IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , length %d , address %p", - i, sg_entries[i].length, sg_entries[i].addr)); - - IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , iovec length %d", - i, buff_iovec[i].iov_len)); - } - - recv_wr->num_sge = nitems; - recv_wr->sg_list = sg_entries; - - /* Set the tail */ - recv_wr->next = NULL; - - /* post the list of recvs */ - ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); - if (OPAL_UNLIKELY(0 != ret)) { - IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], " - "qp_index - %d.\n", - ibv_get_device_name(device->dev.ib_dev), - strerror(errno), ret, qp_index)); - - return -1; - } - - /* decresing numbers of free recv wqe */ - --endpoint->qps[qp_index].rd_wqe; - - OPAL_THREAD_UNLOCK(&recv_wrs->lock); - - IBOFFLOAD_VERBOSE(10, ("Return success: " - "endpoint %p, qp_index %d, dest_rank %d", - endpoint, qp_index, dest_rank)); - - return 1; -} - -static inline 
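mca_bcol_iboffload_post_ml_scatter_recv_frag() above turns an iovec into an SGE list and posts it as a single receive work request, so one completion scatters data into several buffers. A reduced sketch against the standard verbs API, assuming the caller owns a connected QP and one memory registration (lkey) covering all the buffers:

    #include <stdint.h>
    #include <sys/uio.h>
    #include <infiniband/verbs.h>

    /* Post one receive whose completion scatters into nitems buffers.
     * Returns 0 on success, like ibv_post_recv() itself.  The sge array
     * only needs to stay valid for the duration of the post call. */
    static int post_scatter_recv(struct ibv_qp *qp, uint64_t wr_id,
                                 const struct iovec *iov, int nitems,
                                 uint32_t lkey, struct ibv_sge *sge)
    {
        struct ibv_recv_wr wr = { 0 }, *bad_wr = NULL;

        for (int i = 0; i < nitems; i++) {
            sge[i].addr   = (uint64_t) (uintptr_t) iov[i].iov_base;
            sge[i].length = (uint32_t) iov[i].iov_len;
            sge[i].lkey   = lkey;              /* one MR covers all buffers */
        }

        wr.wr_id   = wr_id;                    /* recovered from the CQE */
        wr.next    = NULL;                     /* single WR, no chain    */
        wr.sg_list = sge;
        wr.num_sge = nitems;

        return ibv_post_recv(qp, &wr, &bad_wr);
    }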
__opal_attribute_always_inline__ int - mca_bcol_iboffload_prepost_ml_recv_frag( - int qp_index, uint32_t dest_rank, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_module_t *iboffload) -{ - int ret, start_wr_index; - struct ibv_recv_wr *recv_wr, *recv_bad; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank]; - - mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; - mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device; - - IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d", - (void *) endpoint, qp_index)); - - /* make sure that we do not overrun number of rd_wqe */ - if (0 >= endpoint->qps[qp_index].rd_wqe) { - IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d", - endpoint->qps[qp_index].rd_wqe)); - - return 0; - } - - OPAL_THREAD_LOCK(&recv_wrs->lock); - - /* Calculate start index in array - * of pre-allocated work requests */ - start_wr_index = cm->qp_infos[qp_index].rd_num - 1; - recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, " - "start index of WRs - %d", (void *) endpoint, - qp_index, start_wr_index)); - - recv_wr->sg_list = &frag->sg_entry; - - /* Set the tail */ - recv_wr->next = NULL; - - /* post the list of recvs */ - ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); - if (OPAL_UNLIKELY(0 != ret)) { - IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], " - "qp_index - %d.\n", - ibv_get_device_name(device->dev.ib_dev), - strerror(errno), ret, qp_index)); - - return -1; - } - - /* decresing numbers of free recv wqe */ - --endpoint->qps[qp_index].rd_wqe; - - OPAL_THREAD_UNLOCK(&recv_wrs->lock); - - IBOFFLOAD_VERBOSE(10, ("Return success: " - "endpoint %p, qp_index %d, dest_rank %d", - endpoint, qp_index, dest_rank)); - - return 1; -} - -static inline __opal_attribute_always_inline__ - mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_preposted_recv_frag( - mca_bcol_iboffload_module_t *iboffload, - int source, int qp_index) -{ - mca_bcol_iboffload_frag_t *frag; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source]; - - frag = mca_bcol_iboffload_component.qp_infos[qp_index].get_preposted_recv(endpoint, qp_index); - - /* do we want to run prepost */ - IBOFFLOAD_VERBOSE(10, ("source - %d, qp_index - %d; " - "allocating preposted addr %p.\n", - source, qp_index, (void *) frag->sg_entry.addr)); - - if (OPAL_LIKELY(NULL != frag)) { - frag->next = NULL; - } - - return frag; -} - -END_C_DECLS - -#endif /* MCA_BCOL_IBOFFLOAD_ENDPOINT_H */ diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_fanin.c b/ompi/mca/bcol/iboffload/bcol_iboffload_fanin.c deleted file mode 100644 index 49f771d46b..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_fanin.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -static int mca_bcol_iboffload_fanin_leader_progress( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc = OMPI_SUCCESS, leader_rank = 0, rank, - sbgp_size = iboffload->ibnet->super.group_size; - - struct mqe_task *last_wait = NULL; - - mca_bcol_iboffload_task_t *wait_task = NULL; - mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - for (rank = leader_rank + 1; rank < sbgp_size; ++rank) { - /* post wait */ - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, rank, coll_request->qp_index); - if(NULL == preposted_recv_frag) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if(NULL == wait_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - } - - /* end of list */ - *mqe_ptr_to_set = NULL; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if(OMPI_SUCCESS != rc) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int mca_bcol_iboffload_fanin_proxy_progress( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc = OMPI_SUCCESS, leader_rank = 0; - - struct mqe_task *last_send = NULL; - mca_bcol_iboffload_task_t *send_task = NULL; - mca_bcol_iboffload_frag_t *send_fragment = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - /* post send */ - send_fragment = 
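The fan-in progress paths above first ask BCOL_IBOFFLOAD_MQ_HAVE_CREDITS whether the management queue can absorb every task of the fragment, and requeue the whole fragment when it cannot - posting is all-or-nothing. The reservation logic, reduced to plain C with a hypothetical mq type:

    #include <stdbool.h>

    struct mq { int credits; };                /* hypothetical MQ handle */

    /* All-or-nothing reservation: either the whole fragment's task chain
     * fits on the MQ now, or nothing is posted and the caller parks the
     * fragment on the pending list (appended the first time, prepended
     * on a re-queue so the retry order of older fragments is kept). */
    static bool mq_try_reserve(struct mq *q, int needed)
    {
        if (q->credits < needed) {
            return false;
        }
        q->credits -= needed;
        return true;
    }

    /* Completions hand credits back, mirroring MQ_RETURN_CREDITS. */
    static void mq_return_credits(struct mq *q, int n)
    {
        q->credits += n;
    }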
mca_bcol_iboffload_get_send_frag(coll_request, - leader_rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - if(NULL == send_fragment) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - goto out_of_resources; - } - - send_task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER, - send_fragment, coll_fragment, INLINE); - if(NULL == send_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - - /* end of list */ - *mqe_ptr_to_set = NULL; - assert(NULL != last_send); - - last_send->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if(OMPI_SUCCESS != rc) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int mca_bcol_iboffload_fanin_init( - bcol_function_args_t *input_args, - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t **coll_request) -{ - ompi_free_list_item_t *item = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = NULL; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init")); - - OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); - if(OPAL_UNLIKELY(NULL == item)) { - IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - (*coll_request)->progress_fn = iboffload->fanin_algth; - - (*coll_request)->completion_cb_fn = NULL; - (*coll_request)->order_info = &input_args->order_info; - - (*coll_request)->module = iboffload; - (*coll_request)->ml_buffer_index = input_args->buffer_index; - (*coll_request)->buffer_info[SBUF].offset = 0; - (*coll_request)->buffer_info[RBUF].offset = 0; - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; - - input_args->bcol_opaque_data = (void *) (*coll_request); - - /* finish initializing full message descriptor */ - (*coll_request)->n_fragments = 1; - (*coll_request)->n_frags_sent = 1; - - (*coll_request)->n_frag_mpi_complete = 0; - (*coll_request)->n_frag_net_complete = 0; - - (*coll_request)->user_handle_freed = false; - - /* - * setup collective work request - */ - - /* get collective frag */ - coll_fragment = &(*coll_request)->first_collfrag; - mca_bcol_iboffload_collfrag_init(coll_fragment); - - coll_fragment->alg = FANIN_ALG; - coll_fragment->mq_index = COLL_MQ; - - /* Set mq credits */ - coll_fragment->mq_credits = iboffload->alg_task_consump[FANIN_ALG]; - - /* set pointers for (coll frag) <-> (coll full request) */ - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment); - - return OMPI_SUCCESS; -} - -/************************************************************************ - ************************ New style Fan-In 
****************************** - ***********************************************************************/ -static int mca_bcol_iboffload_new_style_fanin_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_iboffload_collreq_t *coll_request = - (mca_bcol_iboffload_collreq_t *) - input_args->bcol_opaque_data; - - if (BCOL_IS_COMPLETED(coll_request)) { - coll_request->user_handle_freed = true; - if (COLLREQ_IS_DONE(coll_request)) { - IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); - RELEASE_COLLREQ(coll_request); - } - - IBOFFLOAD_VERBOSE(10, ("Fan-In already done.\n")); - return BCOL_FN_COMPLETE; - } - - return BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_new_style_fanin_first_call( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int i = 0, leader_rank = 0, /* We always suppose - the lowest index is a leader */ - my_rank = iboffload->ibnet->super.my_index, - sbgp_size = iboffload->ibnet->super.group_size; - - mca_bcol_iboffload_endpoint_t *ep = NULL; - mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc; - - assert(NULL != my_ibnet_proc); - - if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) { - iboffload->fanin_algth = mca_bcol_iboffload_fanin_leader_progress; - iboffload->alg_task_consump[FANIN_ALG] += sbgp_size; - - for (i = leader_rank + 1; i < sbgp_size; ++i) { - ep = iboffload->endpoints[i]; - while (OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - } else { - iboffload->fanin_algth = mca_bcol_iboffload_fanin_proxy_progress; - iboffload->alg_task_consump[FANIN_ALG] += 1; - - ep = iboffload->endpoints[leader_rank]; - while(OMPI_SUCCESS != - check_endpoint_state(ep, NULL, NULL)) { - opal_progress(); - } - } - - return iboffload->fanin_algth(iboffload, coll_request); -} - -static int mca_bcol_iboffload_new_style_fanin_intra( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int rc = OMPI_SUCCESS; - - struct mca_bcol_iboffload_collreq_t *coll_request = NULL; - mca_bcol_iboffload_module_t *iboffload = - (mca_bcol_iboffload_module_t *) const_args->bcol_module; - - assert(NULL != iboffload); - - MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args); - - /* Init Fan-In collective reqeust */ - rc = mca_bcol_iboffload_fanin_init(input_args, iboffload, &coll_request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n")); - return BCOL_FN_NOT_STARTED; - } - - rc = iboffload->fanin_algth(iboffload, coll_request); - if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { - return BCOL_FN_NOT_STARTED; - } - - return BCOL_FN_STARTED; -} - -int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n")); - - comm_attribs.bcoll_type = BCOL_FANIN; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_new_style_fanin_intra, - mca_bcol_iboffload_new_style_fanin_progress); - - 
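
The attribute pair handed to mca_bcol_base_set_attributes() above is what lets the upper ML layer match a collective invocation to this Fan-In implementation: comm_attribs bounds the communicator size, inv_attribs bounds the message size and the accepted datatype/op sets, and the two function pointers are the start and progress entry points. A minimal sketch of the selection test this implies, using hypothetical struct and field names (the real matching logic lives in the bcol/ml base, not in this file):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct comm_attr { int size_min, size_max; };
    struct inv_attr  { size_t msg_min, msg_max;
                       uint32_t dtype_bitmap, op_bitmap; };

    /* Would a function registered with these attributes accept the call?
     * Assumes datatype/op ids are < 32, matching the 0xffffffff bitmaps. */
    static bool fn_matches(const struct comm_attr *c, const struct inv_attr *v,
                           int comm_size, size_t msg_len, int dtype_id, int op_id)
    {
        return comm_size >= c->size_min && comm_size <= c->size_max &&
               msg_len   >= v->msg_min  && msg_len   <= v->msg_max  &&
               0 != (v->dtype_bitmap & (UINT32_C(1) << dtype_id))   &&
               0 != (v->op_bitmap    & (UINT32_C(1) << op_id));
    }
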
return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_fanout.c b/ompi/mca/bcol/iboffload/bcol_iboffload_fanout.c deleted file mode 100644 index 9ac93d16e7..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_fanout.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -static int mca_bcol_iboffload_fanout_leader_progress( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc = OMPI_SUCCESS, leader_rank = 0, rank, - sbgp_size = iboffload->ibnet->super.group_size; - - struct mqe_task *last_send = NULL; - mca_bcol_iboffload_task_t *send_task = NULL; - mca_bcol_iboffload_frag_t *send_fragment = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - for (rank = leader_rank + 1; rank < sbgp_size; ++rank) { - /* post send */ - send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, - rank, coll_request->qp_index, 0, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY); - if(NULL == send_fragment) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); - goto out_of_resources; - } - - send_task = mca_bcol_iboffload_get_send_task(iboffload, rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER, - send_fragment, coll_fragment, INLINE); - if(NULL == send_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - } - - /* end of list */ - *mqe_ptr_to_set = NULL; - assert(NULL != last_send); - - last_send->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_send->wr_id; - last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if(OMPI_SUCCESS != rc) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int mca_bcol_iboffload_fanout_proxy_progress( - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc = OMPI_SUCCESS, leader_rank = 0; - - struct mqe_task *last_wait = NULL; - 
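
The leader path above, and the proxy path that follows, both build a NULL-terminated chain of MQE tasks, mark only the last task with MQE_WR_FLAG_SIGNAL so the whole chain raises a single completion, and stash the collective-fragment pointer in the 64-bit wr_id so the completion handler can recover it. A minimal sketch of that chaining idiom with stand-in types (APPEND_TO_TASKLIST and the CQE-side recovery are defined elsewhere in this component):

    #include <stdint.h>
    #include <stddef.h>

    struct task {                /* stand-in for struct mqe_task */
        struct task *next;
        uint64_t     wr_id;
        int          flags;
    };
    #define FLAG_SIGNAL 0x1      /* stand-in for MQE_WR_FLAG_SIGNAL */

    /* Append through a "pointer to the next-pointer", tracking the tail. */
    static void append_task(struct task ***link, struct task *t,
                            struct task **last)
    {
        **link = t;
        *link  = &t->next;
        *last  = t;
    }

    /* Terminate the chain and arm exactly one completion on its tail. */
    static void finalize_chain(struct task **link, struct task *last,
                               void *coll_frag, uint64_t *saved_wr_id)
    {
        *link = NULL;                                    /* end of list */
        last->flags |= FLAG_SIGNAL;                      /* one CQE per chain */
        *saved_wr_id = last->wr_id;                      /* remember original id */
        last->wr_id  = (uint64_t) (uintptr_t) coll_frag; /* round-trip pointer */
    }
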
mca_bcol_iboffload_task_t *wait_task = NULL; - mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL; - - struct mqe_task **mqe_ptr_to_set; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - coll_fragment = (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mqe_ptr_to_set = &coll_fragment->to_post; - - if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( - iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - goto out_of_resources; - } - - /* post wait */ - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - iboffload, leader_rank, coll_request->qp_index); - if(NULL == preposted_recv_frag) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(iboffload, leader_rank, 1, - preposted_recv_frag, coll_request->qp_index, NULL); - if(NULL == wait_task) { - IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); - goto out_of_resources; - } - - APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - - /* end of list */ - *mqe_ptr_to_set = NULL; - - last_wait->flags |= MQE_WR_FLAG_SIGNAL; - - coll_fragment->signal_task_wr_id = last_wait->wr_id; - last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); - if(OMPI_SUCCESS != rc) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); -} - -static int mca_bcol_iboffload_fanout_init( - bcol_function_args_t *input_args, - mca_bcol_iboffload_module_t *iboffload, - struct mca_bcol_iboffload_collreq_t **coll_request) -{ - ompi_free_list_item_t *item = NULL; - mca_bcol_iboffload_collfrag_t *coll_fragment = NULL; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init")); - - OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); - if(NULL == item) { - IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; - (*coll_request)->progress_fn = iboffload->fanout_algth; - - (*coll_request)->completion_cb_fn = NULL; - (*coll_request)->order_info = &input_args->order_info; - - (*coll_request)->module = iboffload; - (*coll_request)->ml_buffer_index = input_args->buffer_index; - (*coll_request)->buffer_info[SBUF].offset = 0; - (*coll_request)->buffer_info[RBUF].offset = 0; - (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; - - /* finish initializing full message descriptor */ - (*coll_request)->n_fragments = 1; - (*coll_request)->n_frags_sent = 1; - - (*coll_request)->n_frag_mpi_complete = 0; - (*coll_request)->n_frag_net_complete = 0; - - (*coll_request)->user_handle_freed = false; - - input_args->bcol_opaque_data = (void *) (*coll_request); - - /* - * setup collective work request - */ - - /* get collective frag */ - coll_fragment = &(*coll_request)->first_collfrag; - 
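
For this single-fragment operation, the counters set a few lines above (n_fragments, n_frags_sent, n_frag_mpi_complete, n_frag_net_complete) are the entire completion state. One plausible reading of the BCOL_IS_COMPLETED and COLLREQ_IS_DONE tests used by the progress function below, written out as predicates (the real macros live in bcol_iboffload_collreq.h, which is not part of this excerpt, so treat the exact conditions as an assumption):

    #include <stdbool.h>

    struct collreq_counts {
        int n_fragments;         /* total fragments in the message       */
        int n_frag_mpi_complete; /* fragments completed at the MPI level */
        int n_frag_net_complete; /* fragments completed by the network   */
    };

    /* MPI semantics are satisfied once every fragment is MPI-complete. */
    static bool coll_is_completed(const struct collreq_counts *r)
    {
        return r->n_frag_mpi_complete == r->n_fragments;
    }

    /* The request may be recycled only when the network is done too. */
    static bool collreq_is_done(const struct collreq_counts *r)
    {
        return coll_is_completed(r) &&
               r->n_frag_net_complete == r->n_fragments;
    }
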
mca_bcol_iboffload_collfrag_init(coll_fragment);
-
-    coll_fragment->alg = FANOUT_ALG;
-    coll_fragment->mq_index = COLL_MQ;
-
-    /* Set mq credits */
-    coll_fragment->mq_credits = iboffload->alg_task_consump[FANOUT_ALG];
-
-    /* set pointers for (coll frag) <-> (coll full request) */
-    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);
-
-    return OMPI_SUCCESS;
-}
-
-/************************************************************************
- ************************ New style Fan-Out *****************************
- ***********************************************************************/
-static int mca_bcol_iboffload_new_style_fanout_progress(
-                bcol_function_args_t *input_args,
-                struct mca_bcol_base_function_t *const_args)
-{
-    mca_bcol_iboffload_collreq_t *coll_request =
-                (mca_bcol_iboffload_collreq_t *)
-                                input_args->bcol_opaque_data;
-
-    if (BCOL_IS_COMPLETED(coll_request)) {
-        coll_request->user_handle_freed = true;
-        if (COLLREQ_IS_DONE(coll_request)) {
-            IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
-            RELEASE_COLLREQ(coll_request);
-        }
-
-        IBOFFLOAD_VERBOSE(10, ("Fan-Out already done.\n"));
-        return BCOL_FN_COMPLETE;
-    }
-
-    return BCOL_FN_STARTED;
-}
-
-int mca_bcol_iboffload_new_style_fanout_first_call(
-                mca_bcol_iboffload_module_t *iboffload,
-                struct mca_bcol_iboffload_collreq_t *coll_request)
-{
-    int i = 0, leader_rank = 0, /* We always assume
-                                   the lowest index is the leader */
-        my_rank = iboffload->ibnet->super.my_index,
-        sbgp_size = iboffload->ibnet->super.group_size;
-
-    mca_bcol_iboffload_endpoint_t *ep = NULL;
-    mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc;
-
-    assert(NULL != my_ibnet_proc);
-
-    if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) {
-        iboffload->fanout_algth = mca_bcol_iboffload_fanout_leader_progress;
-        iboffload->alg_task_consump[FANOUT_ALG] += sbgp_size;
-
-        for (i = leader_rank + 1; i < sbgp_size; ++i) {
-            ep = iboffload->endpoints[i];
-            while (OMPI_SUCCESS !=
-                          check_endpoint_state(ep, NULL, NULL)) {
-                opal_progress();
-            }
-        }
-    } else {
-        iboffload->fanout_algth = mca_bcol_iboffload_fanout_proxy_progress;
-        iboffload->alg_task_consump[FANOUT_ALG] += 1;
-
-        ep = iboffload->endpoints[leader_rank];
-        while (OMPI_SUCCESS !=
-                      check_endpoint_state(ep, NULL, NULL)) {
-            opal_progress();
-        }
-    }
-
-    return iboffload->fanout_algth(iboffload, coll_request);
-}
-
-static int mca_bcol_iboffload_new_style_fanout_intra(
-                bcol_function_args_t *input_args,
-                struct mca_bcol_base_function_t *const_args)
-{
-    int rc = OMPI_SUCCESS;
-
-    struct mca_bcol_iboffload_collreq_t *coll_request = NULL;
-    mca_bcol_iboffload_module_t *iboffload =
-                    (mca_bcol_iboffload_module_t *) const_args->bcol_module;
-
-    assert(NULL != iboffload);
-
-    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);
-
-    /* Init the Fan-Out collective request */
-    rc = mca_bcol_iboffload_fanout_init(input_args, iboffload, &coll_request);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanout_init.\n"));
-        return BCOL_FN_NOT_STARTED;
-    }
-
-    rc = iboffload->fanout_algth(iboffload, coll_request);
-    if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
-        return BCOL_FN_NOT_STARTED;
-    }
-
-    return BCOL_FN_STARTED;
-}
-
-int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super)
-{
-    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
-    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
-
-    IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-Out.\n"));
-
-    comm_attribs.bcoll_type = BCOL_FANOUT;
-
- comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, - &comm_attribs, &inv_attribs, - mca_bcol_iboffload_new_style_fanout_intra, - mca_bcol_iboffload_new_style_fanout_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c b/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c deleted file mode 100644 index 0ecf1ef62e..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "opal/include/opal/types.h" -#include "opal/datatype/opal_convertor.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_endpoint.h" - -static void frag_constructor(mca_bcol_iboffload_frag_t *frag) -{ - mca_bcol_iboffload_reg_t* reg = - (mca_bcol_iboffload_reg_t*) frag->super.registration; - - memset(&frag->sg_entry, 0, sizeof(struct ibv_sge)); - frag->sg_entry.addr = (uint64_t) (uintptr_t) frag->super.ptr; - - frag->registration = reg; - - if (NULL != reg) { - frag->sg_entry.lkey = reg->mr->lkey; - } - - frag->next = NULL; - frag->type = MCA_BCOL_IBOFFLOAD_NONE_OWNER; - frag->ref_counter = 0; - frag->qp_index = -1; -} - -OBJ_CLASS_INSTANCE( - mca_bcol_iboffload_frag_t, - ompi_free_list_item_t, - frag_constructor, - NULL); - - -static mca_bcol_iboffload_frag_t* - mca_bcol_iboffload_get_ml_frag_calc(mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collreq_t *coll_request, - size_t len, size_t src_offset) -{ - int rc; - - mca_bcol_iboffload_frag_t *fragment; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - uint64_t sbuff = (uint64_t) (uintptr_t) coll_request->buffer_info[SBUF].buf + - src_offset; - - /* The buffer was allocated on ML level, - no need to allocate local buffer */ - rc = pack_data_for_calc(iboffload->device->dev.ib_dev_context, - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - cm->map_ompi_to_ib_dt[coll_request->dtype->id], - false /* host order */, - (void *) sbuff, 0, - &coll_request->actual_ib_op, - &coll_request->actual_ib_dtype, - (void *) sbuff); - if (OPAL_UNLIKELY(0 != rc)) { - IBOFFLOAD_VERBOSE(10, ("pack_data_for_calc failed, op: %s, type: %s\n", - coll_request->op->o_name, coll_request->dtype->name)); - return NULL; - } - - fragment = mca_bcol_iboffload_get_ml_frag( - iboffload, coll_request->qp_index, len, - coll_request->buffer_info[SBUF].lkey, - sbuff); - - return fragment; -} - -static mca_bcol_iboffload_frag_t * -mca_bcol_iboffload_get_packed_frag(mca_bcol_iboffload_module_t *iboffload, - uint32_t destination, int qp_index, size_t len, - struct opal_convertor_t *convertor) -{ - /* local variables */ - int rc; - uint32_t out_size; - size_t max_size = 0; - - struct iovec payload_iovec; - - ompi_free_list_item_t *item; - 
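
The pack path below hands opal_convertor_pack() a single iovec pointing into the fragment's registered buffer, so a possibly non-contiguous user datatype is gathered into one contiguous, pinned region the HCA can send directly. The calling convention, reduced to a self-contained helper (pack_into_pinned is illustrative and not part of this file; error handling is minimal):

    #include <sys/types.h>
    #include <sys/uio.h>
    #include <stdint.h>

    #include "opal/datatype/opal_convertor.h"

    /* Pack up to "len" bytes described by "conv" into the pinned buffer
     * at "dst"; returns the number of bytes packed, or -1 on error. */
    static ssize_t pack_into_pinned(opal_convertor_t *conv,
                                    void *dst, size_t len)
    {
        struct iovec iov;
        uint32_t iov_count = 1;  /* one contiguous destination buffer */
        size_t   max_data  = 0;  /* set by the convertor to bytes packed */

        iov.iov_base = dst;
        iov.iov_len  = len;

        if (opal_convertor_pack(conv, &iov, &iov_count, &max_data) < 0) {
            return -1;
        }
        return (ssize_t) max_data;
    }
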
mca_bcol_iboffload_frag_t *frag; - - mca_bcol_iboffload_device_t *device = iboffload->device; - - /* Get frag from free list */ - OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item); - if (OPAL_UNLIKELY(NULL == item)) { - return NULL; - } - - frag = (mca_bcol_iboffload_frag_t *) item; - - /* Pack data into the buffer */ - out_size = 1; - payload_iovec.iov_len = len; - - payload_iovec.iov_base = (void *) (uintptr_t) frag->sg_entry.addr; - - rc = opal_convertor_pack(convertor, &(payload_iovec), - &out_size, &max_size); - if (OPAL_UNLIKELY(rc < 0)) { - /* Error: put the fragment back */ - OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], item); - return NULL; - } - - return frag; -} - -static mca_bcol_iboffload_frag_t * -mca_bcol_iboffload_get_calc_frag(mca_bcol_iboffload_module_t *iboffload, int qp_index, - struct mca_bcol_iboffload_collreq_t *coll_request) -{ - int rc; - - ompi_free_list_item_t *item; - mca_bcol_iboffload_frag_t *frag; - - mca_bcol_iboffload_device_t *device = iboffload->device; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Start to pack frag.\n")); - - /* Get frag from free list */ - OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item); - if (OPAL_UNLIKELY(NULL == item)) { - return NULL; - } - - frag = (mca_bcol_iboffload_frag_t *) item; - - /* Pack data into the buffer */ - rc = pack_data_for_calc(device->dev.ib_dev_context, - cm->map_ompi_to_ib_calcs[coll_request->op->op_type], - cm->map_ompi_to_ib_dt[coll_request->dtype->id], false, - coll_request->buffer_info[SBUF].buf, 0, - &coll_request->actual_ib_op, - &coll_request->actual_ib_dtype, - (void *) (uintptr_t) frag->sg_entry.addr); - if (OPAL_UNLIKELY(0 != rc)) { - IBOFFLOAD_ERROR(("pack_data_for_calc failed, op: %s, type: %s\n", - coll_request->op->o_name, coll_request->dtype->name)); - return NULL; - } - - return frag; -} - -mca_bcol_iboffload_frag_t* -mca_bcol_iboffload_get_send_frag(mca_bcol_iboffload_collreq_t *coll_request, - uint32_t destination, int qp_index, size_t len, - size_t src_offset, int buf_index, int send_frag_type) -{ - /* local variables */ - mca_bcol_iboffload_frag_t *frag; - mca_bcol_iboffload_module_t *iboffload = coll_request->module; - - mca_bcol_iboffload_endpoint_t *endpoint = - iboffload->endpoints[destination]; - - IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_get_send_frag qp_index %d", - qp_index)); - - if ((endpoint->qps[qp_index].sd_wqe) <= 0) { - IBOFFLOAD_VERBOSE(10, ("No send wqe %d", - endpoint->qps[qp_index].sd_wqe)); - return NULL; - } - - --endpoint->qps[qp_index].sd_wqe; - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p: qp_index %d, destination %d, sd_wqe %d", - endpoint, qp_index, destination, endpoint->qps[qp_index].sd_wqe)); - - switch (send_frag_type) { - case MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY: - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY")); - assert(NULL != &iboffload->device->dummy_frags[qp_index]); - return &iboffload->device->dummy_frags[qp_index]; - - case MCA_BCOL_IBOFFLOAD_SEND_FRAG: - { - ompi_free_list_item_t *item; - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG")); - - /* Get frag from free list */ - OMPI_FREE_LIST_GET_MT(&iboffload->device->frags_free[qp_index], item); - - frag = (mca_bcol_iboffload_frag_t *) item; - } - - break; - case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT: - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT")); - frag = mca_bcol_iboffload_get_packed_frag(iboffload, destination, - qp_index, len, 
&coll_request->send_convertor); - - break; - case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC: - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC")); - frag = mca_bcol_iboffload_get_calc_frag(iboffload, qp_index, coll_request); - - break; - case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML: - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML")); - frag = mca_bcol_iboffload_get_ml_frag( - iboffload, qp_index, len, coll_request->buffer_info[buf_index].lkey, - (uint64_t)(uintptr_t) coll_request->buffer_info[buf_index].buf + src_offset); - - break; - case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC: - frag = mca_bcol_iboffload_get_ml_frag_calc(iboffload, coll_request, len, src_offset); - IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC")); - - break; - default: - IBOFFLOAD_VERBOSE(10, ("Getting default")); - frag = NULL; - IBOFFLOAD_ERROR(("Unknown send frag type %d for QP index %d", - send_frag_type, qp_index)); - } - - if (OPAL_UNLIKELY(NULL == frag)) { - IBOFFLOAD_VERBOSE(10, ("Getting NULL")); - return NULL; - } - - frag->sg_entry.length = len; - frag->next = NULL; - - return frag; -} - -void -mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx) -{ - int qp_index = *(int *) ctx; - mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item; - - frag->qp_index = qp_index; - frag->type = MCA_BCOL_IBOFFLOAD_BCOL_OWNER; -} - -void -mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx) -{ - mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item; - - frag->qp_index = -1; - frag->type = MCA_BCOL_IBOFFLOAD_ML_OWNER; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.h b/ompi/mca/bcol/iboffload/bcol_iboffload_frag.h deleted file mode 100644 index fffc33f293..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_FRAG_H -#define MCA_BCOL_IBOFFLOAD_FRAG_H - -#include "ompi_config.h" - -#include - -#include "opal/datatype/opal_convertor.h" - -#include "opal/mca/mpool/mpool.h" -#include "opal/class/ompi_free_list.h" - -#include "bcol_iboffload.h" - -BEGIN_C_DECLS - -/* forward declarations */ -struct mca_bcol_iboffload_collreq_t; - -struct mca_bcol_iboffload_reg_t { - mca_mpool_base_registration_t base; - struct ibv_mr *mr; -}; -typedef struct mca_bcol_iboffload_reg_t mca_bcol_iboffload_reg_t; - -typedef enum { - MCA_BCOL_IBOFFLOAD_NONE_OWNER = -1, - MCA_BCOL_IBOFFLOAD_DUMMY_OWNER, - MCA_BCOL_IBOFFLOAD_BCOL_OWNER, - MCA_BCOL_IBOFFLOAD_ML_OWNER -} frag_type; - -typedef enum { - MCA_BCOL_IBOFFLOAD_SEND_FRAG, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC, - MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY -} send_frag_type; - -struct mca_bcol_iboffload_frag_t { - ompi_free_list_item_t super; - - struct mca_bcol_iboffload_frag_t *next; - struct mca_bcol_iboffload_reg_t *registration; - - struct ibv_sge sg_entry; - - frag_type type; - - int ref_counter; - int qp_index; -}; -typedef struct mca_bcol_iboffload_frag_t mca_bcol_iboffload_frag_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_frag_t); - -/* The same fragment maybe shared by multiple task. - * In order to manage right release and allocation flow - * we use reference counter on each fragment and the follow - * wrapper allocation and release function that hides - * the counter */ - -#define IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(fragment, task) \ - do { \ - ++((fragment)->ref_counter); \ - (task)->frag = (fragment); \ - } while(0) - -#define IBOFFLOAD_SET_FRAGS_ON_TASK(fragment, task) \ - do { \ - struct mca_bcol_iboffload_frag_t *temp_frag = fragment; \ - while (NULL != temp_frag) { \ - ++(temp_frag->ref_counter); \ - temp_frag = temp_frag->next; \ - } \ - (task)->frag = fragment; \ - } while(0) - -/* function declarations */ -mca_bcol_iboffload_frag_t * -mca_bcol_iboffload_get_send_frag(struct mca_bcol_iboffload_collreq_t *coll_request, - uint32_t destination, int qp_index, size_t len, - size_t src_offset, int buff_index, int send_frag_type); - -void -mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx); -void -mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx); - -static inline __opal_attribute_always_inline__ -mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_empty_frag( - mca_bcol_iboffload_module_t *iboffload, - int qp_index) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_frag_t *frag; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - /* Get frag from free list */ - OMPI_FREE_LIST_GET_MT(&cm->ml_frags_free, item); - if (OPAL_UNLIKELY(NULL == item)) { - return NULL; - } - - frag = (mca_bcol_iboffload_frag_t *) item; - - frag->qp_index = qp_index; - frag->next = NULL; - - return frag; -} - -static inline __opal_attribute_always_inline__ -mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_frag( - mca_bcol_iboffload_module_t *iboffload, - int qp_index, size_t len, uint32_t lkey, uint64_t addr) -{ - /* local variables */ - mca_bcol_iboffload_frag_t *frag; - - IBOFFLOAD_VERBOSE(10, ("Call for get ML frag - addr 0x%x", addr)); - - frag = mca_bcol_iboffload_get_ml_empty_frag(iboffload, qp_index); - - frag->sg_entry.addr = addr; - frag->sg_entry.lkey = lkey; - frag->sg_entry.length = len; - 
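
The ref_counter manipulated by the IBOFFLOAD_SET_*_FRAG* macros above is what lets several tasks share one fragment: every task that attaches a fragment takes a reference, and the release side may only return the fragment to its free list once the count drops to zero (the release half is inferred from the comment above the macros; it lives in the task code, not in this header). A function-style restatement of the attach macros:

    /* Attach a chain of fragments to a task, taking one reference per
     * fragment; mirrors IBOFFLOAD_SET_FRAGS_ON_TASK above. */
    static inline void task_attach_frags(mca_bcol_iboffload_task_t *task,
                                         mca_bcol_iboffload_frag_t *frags)
    {
        mca_bcol_iboffload_frag_t *f;

        for (f = frags; NULL != f; f = f->next) {
            ++f->ref_counter;
        }
        task->frag = frags;
    }

    /* Drop one reference; a zero result means the fragment may be
     * returned to its free list. */
    static inline int frag_release_ref(mca_bcol_iboffload_frag_t *frag)
    {
        return --frag->ref_counter;
    }
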
- IBOFFLOAD_VERBOSE(10, ("Setting ml frag lkey %u, " - "addr %p, qp_index %d, send value - %lf", - frag->sg_entry.lkey, frag->sg_entry.addr, - qp_index, *(double *) frag->sg_entry.addr)); - - return frag; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c b/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c deleted file mode 100644 index eb28525f36..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_mca.h" - -#include "ompi/constants.h" -#include "ompi/mca/common/ofacm/base.h" -#include "ompi/communicator/communicator.h" - -#include "opal/util/show_help.h" - -/* - * Local flags - */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - REGINT_MAX = 0x88 -}; - -enum { - REGSTR_EMPTY_OK = 0x01, - REGSTR_MAX = 0x88 -}; - -mca_base_var_enum_value_t mtu_values[] = { - {IBV_MTU_256, "256B"}, - {IBV_MTU_512, "512B"}, - {IBV_MTU_1024, "1k"}, - {IBV_MTU_4096, "4k"}, - {0, NULL} -}; - -/* - * utility routine for string parameter registration - */ -static int reg_string(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - const char* default_value, char **storage, - int flags) -{ - int index; - - /* the MCA variable system will not attempt to modify this value */ - *storage = (char *) default_value; - index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGSTR_EMPTY_OK) && 0 == strlen(*storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -/* - * utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OMPI_SUCCESS; - } - - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -/* - * utility routine for integer parameter 
registration - */ -static int reg_bool(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - bool default_value, bool *storage) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - return OMPI_SUCCESS; -} - -int mca_bcol_iboffload_verify_params(void) -{ - if (mca_bcol_iboffload_component.min_rnr_timer > 31) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_min_rnr_timer > 31", - "bcol_iboffload_ib_min_rnr_timer reset to 31"); - mca_bcol_iboffload_component.min_rnr_timer = 31; - } else if (mca_bcol_iboffload_component.min_rnr_timer < 0){ - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_min_rnr_timer < 0", - "bcol_iboffload_ib_min_rnr_timer reset to 0"); - mca_bcol_iboffload_component.min_rnr_timer = 0; - } - - if (mca_bcol_iboffload_component.timeout > 31) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_timeout > 31", - "bcol_iboffload_ib_timeout reset to 31"); - mca_bcol_iboffload_component.timeout = 31; - } else if (mca_bcol_iboffload_component.timeout < 0) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_timeout < 0", - "bcol_iboffload_ib_timeout reset to 0"); - mca_bcol_iboffload_component.timeout = 0; - } - - if (mca_bcol_iboffload_component.retry_count > 7) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_retry_count > 7", - "bcol_iboffload_ib_retry_count reset to 7"); - mca_bcol_iboffload_component.retry_count = 7; - } else if (mca_bcol_iboffload_component.retry_count < 0) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_retry_count < 0", - "bcol_iboffload_ib_retry_count reset to 0"); - mca_bcol_iboffload_component.retry_count = 0; - } - - if (mca_bcol_iboffload_component.max_rdma_dst_ops > 7) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_rnr_retry > 7", - "bcol_iboffload_ib_rnr_retry reset to 7"); - mca_bcol_iboffload_component.max_rdma_dst_ops = 7; - } else if (mca_bcol_iboffload_component.max_rdma_dst_ops < 0) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_rnr_retry < 0", - "bcol_iboffload_ib_rnr_retry reset to 0"); - mca_bcol_iboffload_component.max_rdma_dst_ops = 0; - } - - if (mca_bcol_iboffload_component.service_level > 15) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_service_level > 15", - "bcol_iboffload_ib_service_level reset to 15"); - mca_bcol_iboffload_component.service_level = 15; - } else if (mca_bcol_iboffload_component.service_level < 0) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "bcol_iboffload_ib_service_level < 0", - "bcol_iboffload_ib_service_level reset to 0"); - mca_bcol_iboffload_component.service_level = 0; - } - - if(mca_bcol_iboffload_component.buffer_alignment <= 1 || - (mca_bcol_iboffload_component.buffer_alignment & 
(mca_bcol_iboffload_component.buffer_alignment - 1))) {
-        opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
-                       true, mca_bcol_iboffload_component.buffer_alignment, ompi_process_info.nodename, 64);
-        mca_bcol_iboffload_component.buffer_alignment = 64;
-    }
-
-    return OMPI_SUCCESS;
-}
-
-int mca_bcol_iboffload_register_params(void)
-{
-    mca_base_var_enum_t *new_enum;
-    char *msg;
-    int ret = OMPI_SUCCESS, tmp;
-
-#define CHECK(expr) do {                    \
-        tmp = (expr);                       \
-        if (OMPI_SUCCESS != tmp) ret = tmp; \
-    } while (0)
-
-    /* register openib component parameters */
-    CHECK(reg_int("k_nomial_radix", NULL,
-                  "The radix of the K-nomial tree for scatter-gather type algorithms "
-                  "(starts from 2)", 2, &mca_bcol_iboffload_component.k_nomial_radix,
-                  REGINT_GE_ONE));
-
-    CHECK(reg_int("priority", NULL,
-                  "IB offload component priority "
-                  "(from 0 (low) to 90 (high))", 90,
-                  &mca_bcol_iboffload_component.super.priority, 0));
-
-    CHECK(reg_int("verbose", NULL,
-                  "Output some verbose IB offload BTL information "
-                  "(0 = no output, nonzero = output)", 0,
-                  &mca_bcol_iboffload_component.verbose, 0));
-
-    CHECK(reg_bool("warn_default_gid_prefix", NULL,
-                   "Warn when there is more than one active port and at least one of them is connected to a network with only the default GID prefix configured (0 = do not warn; any other value = warn)",
-                   true, &mca_bcol_iboffload_component.warn_default_gid_prefix));
-
-    CHECK(reg_bool("warn_nonexistent_if", NULL,
-                   "Warn if non-existent devices and/or ports are specified in the bcol_iboffload_if_[in|ex]clude MCA parameters (0 = do not warn; any other value = warn)",
-                   true, &mca_bcol_iboffload_component.warn_nonexistent_if));
-
-    CHECK(reg_int("max_pipeline_depth", NULL,
-                  "The maximal number of fragments of the same collective request that can be transferred in parallel", 3,
-                  (int *) &mca_bcol_iboffload_component.max_pipeline_depth, 0));
-
-    CHECK(reg_int("max_mqe_tasks", NULL,
-                  "Maximum number of MQEs for each iboffload module",
-                  1024, &mca_bcol_iboffload_component.max_mqe_tasks, 0));
-    CHECK(reg_int("max_mq_size", NULL,
-                  "Maximum size of each MQ for each iboffload module",
-                  1024, &mca_bcol_iboffload_component.max_mq_size, 0));
-    CHECK(reg_int("free_list_num", NULL,
-                  "Initial size of free lists (must be >= 1)",
-                  256, &mca_bcol_iboffload_component.free_list_num,
-                  REGINT_GE_ONE));
-    CHECK(reg_int("free_list_max", NULL,
-                  "Maximum size of free lists "
-                  "(-1 = infinite, otherwise must be >= 0)",
-                  -1, &mca_bcol_iboffload_component.free_list_max,
-                  REGINT_NEG_ONE_OK | REGINT_GE_ONE));
-    CHECK(reg_int("free_list_inc", NULL,
-                  "Increment size of free lists (must be >= 1)",
-                  32, &mca_bcol_iboffload_component.free_list_inc,
-                  REGINT_GE_ONE));
-    /* the rdma mpool no longer exists - the grdma mpool component must be
-     * used instead; this should resolve errors in mtt testing
-     */
-    /*
-    CHECK(reg_string("mpool", NULL,
-                     "Name of the memory pool to be used (it is unlikely that you will ever want to change this)",
-                     "rdma", &mca_bcol_iboffload_component.mpool_name,
-                     0));
-    */
-    CHECK(reg_string("mpool", NULL,
-                     "Name of the memory pool to be used (it is unlikely that you will ever want to change this)",
-                     "grdma", &mca_bcol_iboffload_component.mpool_name,
-                     0));
-    CHECK(reg_int("cq_size", "cq_size",
-                  "Size of the OpenFabrics completion "
-                  "queue (will automatically be set to a minimum of "
-                  "(2 * number_of_peers * bcol_iboffload_rd_num))",
-                  1024, &mca_bcol_iboffload_component.cq_size, REGINT_GE_ONE));
-
-    CHECK(reg_int("exchange_tree_order", NULL,
-                  "The order of the exchange tree. 
" - "Must be power of two.", - 2, &mca_bcol_iboffload_component.exchange_tree_order, REGINT_GE_ONE)); - - CHECK(reg_int("knomial_tree_order", NULL, - "The order of the knomial exchange tree. ", - 3, &mca_bcol_iboffload_component.knomial_tree_order, REGINT_GE_ONE)); - - - CHECK(reg_int("max_inline_data", "max_inline_data", - "Maximum size of inline data segment " - "(-1 = run-time probe to discover max value, " - "otherwise must be >= 0). " - "If not explicitly set, use max_inline_data from " - "the INI file containing device-specific parameters", - 128, (int *) &mca_bcol_iboffload_component.max_inline_data, - REGINT_NEG_ONE_OK | REGINT_GE_ZERO)); - -#if 0 - CHECK(reg_string("pkey", "ib_pkey_val", - "OpenFabrics partition key (pkey) value. " - "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB paritition key value (0x7fff)", - "0", &pkey, 0)); - /* Pasha - mca_bcol_iboffload_component.pkey_val = - ompi_btl_openib_ini_intify(pkey) & MCA_BTL_IB_PKEY_MASK; - free(pkey); - */ -#endif - - CHECK(reg_string("receive_queues", NULL, - "Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4", - "P,512,256,192,128", &mca_bcol_iboffload_component.receive_queues, - 0)); - - CHECK(reg_int("qp_ous_rd_atom", NULL, - "InfiniBand outstanding atomic reads (must be >= 0)", 4, - (int *) &mca_bcol_iboffload_component.qp_ous_rd_atom, REGINT_GE_ZERO)); - - asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes", - IBV_MTU_256, - IBV_MTU_512, - IBV_MTU_1024, - IBV_MTU_2048, - IBV_MTU_4096); - if (NULL == msg) { - /* Don't try to recover from this */ - return OMPI_ERR_OUT_OF_RESOURCE; - } - CHECK(mca_base_var_enum_create("infiniband mtu", mtu_values, &new_enum)); - mca_bcol_iboffload_component.mtu = IBV_MTU_1024; - tmp = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version, - "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_bcol_iboffload_component.mtu); - OBJ_RELEASE(new_enum); - free(msg); - - if (0 > tmp) ret = tmp; - - tmp = mca_base_var_register_synonym(tmp, "ompi", "bcol", "iboffload", "ib_mtu", - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - if (0 > tmp) ret = tmp; - - CHECK(reg_int("ib_min_rnr_timer", NULL, "InfiniBand minimum " - "\"receiver not ready\" timer, in seconds " - "(must be >= 0 and <= 31)", - 1 , &mca_bcol_iboffload_component.min_rnr_timer, 0)); - - CHECK(reg_int("ib_timeout", NULL, "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * " - "(2^bcol_iboffload_ib_timeout) (must be >= 0 and <= 31)", - 20, &mca_bcol_iboffload_component.timeout, 0)); - - CHECK(reg_int("ib_retry_count", NULL, "InfiniBand transmit retry count " - "(must be >= 0 and <= 7)", - 7, &mca_bcol_iboffload_component.retry_count, 0)); - - CHECK(reg_int("ib_rnr_retry", NULL, "InfiniBand \"receiver not ready\" " - "retry count; applies *only* to SRQ/XRC queues. 
PP queues "
-                  "use RNR retry values of 0 because Open MPI performs "
-                  "software flow control to guarantee that RNRs never occur "
-                  "(must be >= 0 and <= 7; 7 = \"infinite\")",
-                  7, &mca_bcol_iboffload_component.rnr_retry, 0));
-
-    CHECK(reg_int("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA "
-                  "destination operations "
-                  "(must be >= 0)",
-                  4, &mca_bcol_iboffload_component.max_rdma_dst_ops, REGINT_GE_ZERO));
-
-    CHECK(reg_int("ib_service_level", NULL, "InfiniBand service level "
-                  "(must be >= 0 and <= 15)",
-                  0, &mca_bcol_iboffload_component.service_level, 0));
-
-    CHECK(reg_int("buffer_alignment", NULL,
-                  "Preferred communication buffer alignment, in bytes "
-                  "(must be > 0 and a power of two)",
-                  64, &mca_bcol_iboffload_component.buffer_alignment, REGINT_GE_ZERO));
-
-    /* register parameters controlling message fragmentation */
-    CHECK(reg_int("min_frag_size", NULL,
-                  "Minimum fragment size",
-                  getpagesize(), &mca_bcol_iboffload_component.super.min_frag_size,
-                  REGINT_GE_ONE));
-
-    CHECK(reg_int("max_frag_size", NULL,
-                  "Maximum fragment size",
-                  FRAG_SIZE_NO_LIMIT, &mca_bcol_iboffload_component.super.max_frag_size,
-                  REGINT_NONZERO));
-
-    CHECK(reg_bool("can_use_user_buffers", NULL,
-                   "User memory can be used by the collective algorithms",
-                   true, &mca_bcol_iboffload_component.super.can_use_user_buffers));
-
-    CHECK(reg_int("barrier_mode", NULL,
-                  "Barrier mode: 0 - Recursive doubling; 1 - Recursive K-ing",
-                  0, &mca_bcol_iboffload_component.barrier_mode, REGINT_GE_ZERO));
-
-    CHECK(reg_int("max_progress_pull", NULL,
-                  "Max number of progress pull checks",
-                  8, &mca_bcol_iboffload_component.max_progress_pull, REGINT_GE_ZERO));
-
-    CHECK(reg_int("use_brucks_smsg_alltoall_rdma", NULL,
-                  "Use the Bruck algorithm for small-message alltoall with RDMA semantics: "
-                  "1 = Alg with no Temp Buffer Recycling (faster), 2 = Alg with Temp Buffer Recycling (slower)",
-                  0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_rdma, 0));
-
-    CHECK(reg_int("use_brucks_smsg_alltoall_sr", NULL,
-                  "Use the Bruck algorithm for small-message alltoall with Send/Recv semantics: "
-                  "1 = Alg with RTR (faster), 2 = Alg with RNR (slower)",
-                  0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_sr, 0));
-
-    CHECK(reg_int("alltoall_bruck_radix", NULL,
-                  "Radix for the Bruck algorithm for small-message alltoall",
-                  3, &mca_bcol_iboffload_component.k_alltoall_bruck_radix, 0));
-
-    CHECK(reg_int("k_alltoall_bruck_radix", NULL,
-                  "Temp buffer alignment for the Bruck algorithm for small-message alltoall",
-                  64, &mca_bcol_iboffload_component.tmp_buf_alignment, 0));
-
-    /*
-    CHECK(reg_string("if_include", NULL,
-                     "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with bcol_iboffload_if_exclude.",
-                     NULL, &mca_bcol_iboffload_component.if_include,
-                     0));
-
-    CHECK(reg_string("if_exclude", NULL,
-                     "Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports). 
Mutually exclusive with bcol_iboffload_if_include.", - NULL, &mca_bcol_iboffload_component.if_exclude, - 0)); - */ - - CHECK(mca_bcol_iboffload_verify_params()); - - /* Register any MCA params for the connect pseudo-components */ - if (OMPI_SUCCESS == ret) { - ret = ompi_common_ofacm_base_register(&mca_bcol_iboffload_component.super.bcol_version); - } - - return ret; -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.h b/ompi/mca/bcol/iboffload/bcol_iboffload_mca.h deleted file mode 100644 index 95e1ec7ee2..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - /** @file */ - -#ifndef MCA_BCOL_IBOFFLOAD_MCA_H -#define MCA_BCOL_IBOFFLOAD_MCA_H - -#include "ompi_config.h" - -int mca_bcol_iboffload_register_params(void); -int mca_bcol_iboffload_verify_params(void); - -#endif diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_module.c b/ompi/mca/bcol/iboffload/bcol_iboffload_module.c deleted file mode 100644 index 0e90fac944..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_module.c +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "opal/util/arch.h" -#include "opal/include/opal/types.h" -#include "opal/datatype/opal_datatype.h" - -#include "ompi/mca/bcol/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "ompi/communicator/communicator.h" -#include "opal/mca/mpool/grdma/mpool_grdma.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" -#include "bcol_iboffload_bcast.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_collfrag.h" -#include "bcol_iboffload_endpoint.h" - -static int init_rdma_buf_desc(mca_bcol_iboffload_rdma_buffer_desc_t **desc, void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size); - -static int set_endpoint_remote_rdma_info(mca_bcol_iboffload_endpoint_t *ep, mca_bcol_iboffload_rdma_info_t *remote_rdma_info); - -static void -mca_bcol_iboffload_module_construct(mca_bcol_iboffload_module_t *module) -{ - int i; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - /* set all to zero */ - module->group_size = 0; - module->segment_size = 0; - module->collective_tag = 0; - module->ibnet = NULL; - module->cgroup_index = 0; - - module->num_endpoints = 0; - module->endpoints = NULL; - - /* initi the previous sequence number */ - module->prev_sequence_num = -1; - - switch (cm->barrier_mode) { - case (0): module->barrier_algth = - mca_bcol_iboffload_barrier_intra_recursive_doubling_start; - break; - case (1): module->barrier_algth = - mca_bcol_iboffload_barrier_intra_recursive_knomial_start; - break; - default: module->barrier_algth = NULL; - } - - module->allreduce_algth = 
NULL; - module->fanin_algth = mca_bcol_iboffload_new_style_fanin_first_call; - module->fanout_algth = mca_bcol_iboffload_new_style_fanout_first_call; - module->memsync_algth = mca_bcol_iboffload_nb_memory_service_barrier_start; - - memset(module->mq, 0, sizeof(module->mq[0]) * BCOL_IBOFFLOAD_MQ_NUM); - memset(module->alg_task_consump, 0, sizeof(uint32_t) * LAST_ALG); - memset(module->connection_status, 0, sizeof(bool) * LAST_ALG); - - for (i = 0; i < BCOL_IBOFFLOAD_MQ_NUM; i++) { - module->mq_credit[i] = mca_bcol_iboffload_component.max_mqe_tasks; - } - - module->super.bcol_component = - (mca_bcol_base_component_t *) &mca_bcol_iboffload_component; - - /* We need two MQ's tasks for exchange with remote addresses */ - module->alg_task_consump[REMOTE_EXCHANGE_ALG] += 2; - - module->power_of_2_ranks = 0; - /* it is safe to set all the remote block to zero */ - memset(&module->rdma_block, 0, sizeof(mca_bcol_iboffload_local_rdma_block_t)); - - module->super.list_n_connected = NULL; - - OBJ_CONSTRUCT(&module->collfrag_pending, opal_list_t); -} - -static void -mca_bcol_iboffload_module_destruct(mca_bcol_iboffload_module_t *module) -{ - int i = 0; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("Module - %p: start to destroy; " - "pending queue size - %d.\n", - module, opal_list_get_size(&module->collfrag_pending))); - - /* Make sure that we done with all pending collective frags */ - while (opal_list_get_size(&module->collfrag_pending) > 0) { - opal_progress(); - } - - OBJ_DESTRUCT(&module->collfrag_pending); - - IBOFFLOAD_VERBOSE(10, ("module->mq_credit - %d, cm->max_mqe_tasks - %d.\n", - module->mq_credit[0], cm->max_mqe_tasks)); - /* Make sure that you got completion on all outstanding collectives */ - for (i = 0; i < BCOL_IBOFFLOAD_MQ_NUM; i++) { - while (module->mq_credit[i] != (int) cm->max_mqe_tasks) { - opal_progress(); - } - } - - IBOFFLOAD_VERBOSE(10, ("All credits were returned.\n")); - - if (NULL != module && NULL != module->mq) { - for (i = 0; i < BCOL_IBOFFLOAD_MQ_NUM; i++) { - if (0 != mqe_context_destroy(module->mq[i])) { - IBOFFLOAD_ERROR(("Error destroying MQ for device (%s), error: %s\n", - ibv_get_device_name(module->device->dev.ib_dev), strerror(errno))); - } - } - - IBOFFLOAD_VERBOSE(10, ("MQ %d was destroyed.\n", i)); - } - - if (NULL != module->endpoints) { - mca_bcol_iboffload_endpoint_t *ep; - int qp_index, num_qps = cm->num_qps; - - for (i = 0; i < module->num_endpoints; ++i) { - if (NULL != module->endpoints[i]) { - /* Make sure that we get completions on all outstanding send requests */ - ep = module->endpoints[i]; - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - IBOFFLOAD_VERBOSE(10, ("qp_index - %d, ep->index - %d, " - "ep->qps[qp_index].sd_wqe - %d, " - "cm->qp_infos[qp_index].rd_num - %d.\n", - qp_index, ep->index, - ep->qps[qp_index].sd_wqe, - cm->qp_infos[qp_index].rd_num)); - - while (ep->qps[qp_index].sd_wqe != cm->qp_infos[qp_index].rd_num) { - opal_progress(); - } - - IBOFFLOAD_VERBOSE(10, ("qp_index - %d, ep->index - %d; " - "All sends were sent.\n", - qp_index, ep->index)); - } - - OBJ_RELEASE(ep); - } - } - - free(module->endpoints); - } - - netpatterns_free_recursive_doubling_tree_node(&module->n_exchange_tree); - netpatterns_free_recursive_doubling_tree_node(&module->recursive_doubling_tree); - - OBJ_RELEASE(module->device->net_context); - OBJ_RELEASE(module->device); - - if (NULL != module->super.list_n_connected) { - free(module->super.list_n_connected); - module->super.list_n_connected = NULL; - 
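
Note the drain idiom used throughout this destructor: before any MQ, QP, or endpoint is torn down, the code spins on opal_progress() until every MQ credit and send WQE has been returned, because destroying verbs objects that still have work requests in flight is an error. The pattern, reduced to its core (the predicate and progress hooks stand in for the module-specific checks):

    #include <stdbool.h>

    /* Spin the progress engine until all outstanding resources
     * (credits, WQEs) have been returned to their owner. */
    static void drain_until(bool (*all_returned)(void *), void *ctx,
                            void (*progress)(void))
    {
        while (!all_returned(ctx)) {
            progress();  /* e.g. opal_progress(): polls CQs, returns credits */
        }
    }
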
} - - OBJ_DESTRUCT(&module->iovec_tasks_free); - - IBOFFLOAD_VERBOSE(10, ("module - %p was successfully destructed.\n", module)); -} - -OBJ_CLASS_INSTANCE(mca_bcol_iboffload_module_t, - mca_bcol_base_module_t, - mca_bcol_iboffload_module_construct, - mca_bcol_iboffload_module_destruct); - -static int iboffload_init_port(struct mca_bcol_iboffload_device_t *device, - struct mca_bcol_iboffload_port_t *p) -{ - union ibv_gid gid; - struct ibv_port_attr ib_port_attr; - - if (ibv_query_port(device->dev.ib_dev_context, p->id, &ib_port_attr)){ - IBOFFLOAD_ERROR(("Error getting port attributes for device %s " - "port number %d errno says %s", - ibv_get_device_name(device->dev.ib_dev), p->id, strerror(errno))); - return OMPI_ERR_NOT_FOUND; - } - - /* Set port data */ - p->lmc = (1 << ib_port_attr.lmc); - p->lid = ib_port_attr.lid; - p->stat = ib_port_attr.state; - p->mtu = ib_port_attr.active_mtu; - - IBOFFLOAD_VERBOSE(10, (" Setting port data (%s:%d) lid=%d, lmc=%d, stat=%d, mtu=%d\n", - ibv_get_device_name(device->dev.ib_dev), p->id, p->lid, - p->lmc, p->stat, p->mtu)); - - if (0 != ibv_query_gid(device->dev.ib_dev_context, p->id, 0, &gid)) { - IBOFFLOAD_ERROR(("ibv_query_gid failed (%s:%d)\n", - ibv_get_device_name(device->dev.ib_dev), p->id)); - return OMPI_ERR_NOT_FOUND; - } - - /* set subnet data */ - p->subnet_id = ntoh64(gid.global.subnet_prefix); - IBOFFLOAD_VERBOSE(10, ("my IB-only subnet_id for HCA %s port %d is %lx", - ibv_get_device_name(device->dev.ib_dev), p->id, p->subnet_id)); - - return OMPI_SUCCESS; -} - -/* mpool allocation maybe changed in future, so lets keep it as separate function */ -static int prepare_mpool(mca_bcol_iboffload_device_t *device) -{ - int ret = OMPI_SUCCESS; - mca_mpool_base_resources_t resources; - - resources.reg_data = (void *) device; - resources.sizeof_reg = sizeof(mca_bcol_iboffload_reg_t); - - resources.register_mem = mca_bcol_iboffload_register_mr; - resources.deregister_mem = mca_bcol_iboffload_deregister_mr; - - device->mpool = - mca_mpool_base_module_create(mca_bcol_iboffload_component.mpool_name, - device, &resources); - if (NULL == device->mpool){ - opal_output(0, "error creating IB memory pool for %s errno says %s\n", - ibv_get_device_name(device->dev.ib_dev), strerror(errno)); - ret = OMPI_ERROR; - } - - return ret; -} - -/* Allocate device related resources: mpool, pd, cq, free_lists */ -static int allocate_device_resources(mca_bcol_iboffload_device_t *device) -{ - int qp_index, num_qps, rc; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - void* dummy_mem = (void *) &device->dummy_mem[0]; - - num_qps = cm->num_qps; - - /* We have some active ports, alloce pd */ - device->ib_pd = ibv_alloc_pd(device->dev.ib_dev_context); - if (NULL == device->ib_pd){ - IBOFFLOAD_ERROR(("Error allocating protection domain for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - - /* Pasha: allocate mpool here */ - if (OMPI_SUCCESS != prepare_mpool(device)) { - return OMPI_ERROR; - } - - /* Allocating free list of memory registered fragments */ - device->frags_free = (ompi_free_list_t *) calloc( - num_qps, sizeof(ompi_free_list_t)); - - if (NULL == device->frags_free) { - IBOFFLOAD_ERROR(("Error allocating memory for " - "frags array, dev: %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), - strerror(errno))); - - return OMPI_ERROR; - } - - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - mca_bcol_iboffload_alloc_qps_resource_fn_t alloc_resource = - 
cm->qp_infos[qp_index].alloc_resource; - - if (NULL != alloc_resource) { - if (OMPI_SUCCESS != alloc_resource(qp_index, device)) { - return OMPI_ERROR; - } - } - - } - - if (OMPI_SUCCESS != - mca_bcol_iboffload_adjust_cq(device, &device->ib_cq)) { - IBOFFLOAD_ERROR(("Error creating CQ for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - - if (OMPI_SUCCESS != - mca_bcol_iboffload_adjust_cq(device, &device->ib_mq_cq)) { - IBOFFLOAD_ERROR(("Error creating mq CQ for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - - rc = mca_bcol_iboffload_register_mr((void *) device, dummy_mem, - sizeof(char) * BCOL_IBOFFLOAD_DUMMY_MEM_SIZE, - &device->dummy_reg.base); - - if (OMPI_SUCCESS != rc) { - IBOFFLOAD_ERROR(("Dummy memory registration failed for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - - for (qp_index = 0; qp_index < num_qps; ++qp_index) { - mca_bcol_iboffload_frag_t *frag = &device->dummy_frags[qp_index]; - - memset(&frag->super.registration, 0, sizeof(mca_mpool_base_registration_t)); - OBJ_CONSTRUCT(frag, mca_bcol_iboffload_frag_t); - - frag->qp_index = qp_index; - frag->type = MCA_BCOL_IBOFFLOAD_DUMMY_OWNER; - - frag->registration = &device->dummy_reg; - - frag->super.ptr = dummy_mem; - frag->super.registration = &device->dummy_reg.base; - - frag->sg_entry.length = 0; - frag->sg_entry.lkey = device->dummy_reg.mr->lkey; - frag->sg_entry.addr = (uint64_t) (uintptr_t) dummy_mem; - } - - return OMPI_SUCCESS; -} - -/* Register memory */ -int mca_bcol_iboffload_register_mr(void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg) -{ - mca_bcol_iboffload_device_t *device = (mca_bcol_iboffload_device_t *) reg_data; - mca_bcol_iboffload_reg_t *iboffload_reg = (mca_bcol_iboffload_reg_t *) reg; - - iboffload_reg->mr = ibv_reg_mr(device->ib_pd, base, size, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE | - IBV_ACCESS_REMOTE_READ); - - if (NULL == iboffload_reg->mr) { - IBOFFLOAD_ERROR(("Device %s: %p addr, %d bytes registration failed.", - ibv_get_device_name(device->dev.ib_dev), base, size)); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - IBOFFLOAD_VERBOSE(10, ("Device %s: memory register addr=%p, len=%d, mr - %p.", - ibv_get_device_name(device->dev.ib_dev), base, size, iboffload_reg->mr)); - - return OMPI_SUCCESS; -} - -/* Deregister memory */ -int mca_bcol_iboffload_deregister_mr(void *reg_data, mca_mpool_base_registration_t *reg) -{ - mca_bcol_iboffload_device_t *device = (mca_bcol_iboffload_device_t *) reg_data; - mca_bcol_iboffload_reg_t *iboffload_reg = (mca_bcol_iboffload_reg_t *) reg; - - IBOFFLOAD_VERBOSE(10, ("Device %s: mr - %p.", - ibv_get_device_name(device->dev.ib_dev), iboffload_reg->mr)); - - if (NULL != iboffload_reg->mr) { - if (ibv_dereg_mr(iboffload_reg->mr)) { - IBOFFLOAD_ERROR(("Device %s: error unpinning iboffload memory errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - } - - IBOFFLOAD_VERBOSE(10, ("Device %s: memory deregister succeeded.", - ibv_get_device_name(device->dev.ib_dev))); - - iboffload_reg->mr = NULL; - - return OMPI_SUCCESS; -} - -/* We need to keep separate registration function for - ML list memory managment */ -static int mca_bcol_iboffload_lmngr_register(void *context_data, - void *base, size_t size, - void **reg_desc) -{ - struct ibv_mr *mr; - mca_bcol_iboffload_device_t *device = - 
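[Editor's note: the registration and deregistration hooks in this file all reduce to one verbs pattern. A sketch with hypothetical helper names, using the same access mask that mca_bcol_iboffload_register_mr() requests.]

    #include <stddef.h>
    #include <infiniband/verbs.h>

    static struct ibv_mr *pin_region(struct ibv_pd *pd, void *buf, size_t len)
    {
        /* Returns NULL on failure; the MR carries the lkey/rkey used later. */
        return ibv_reg_mr(pd, buf, len,
                          IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ);
    }

    static int unpin_region(struct ibv_mr *mr)
    {
        return (NULL != mr) ? ibv_dereg_mr(mr) : 0;  /* 0 on success */
    }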
(mca_bcol_iboffload_device_t *) context_data; - - mr = ibv_reg_mr(device->ib_pd, base, size, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE | - IBV_ACCESS_REMOTE_READ); - - if (NULL == mr) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - IBOFFLOAD_VERBOSE(10, ("Device %s: memory register addr=%p, len=%d", - ibv_get_device_name(device->dev.ib_dev), base, size)); - - *reg_desc = (void *) mr; - - /* Make sure that the addr stays the same */ - assert(mr->addr == base); - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_lmngr_deregister(void *context_data, void *reg_desc) -{ - struct ibv_mr *mr = (struct ibv_mr *) reg_desc; - mca_bcol_iboffload_device_t *device = - (mca_bcol_iboffload_device_t *) context_data; - - if (mr != NULL) { - if (ibv_dereg_mr(mr)) { - IBOFFLOAD_ERROR(("Device %s: error unpinning iboffload memory errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - return OMPI_ERROR; - } - } - - return OMPI_SUCCESS; -} - -static int iboffload_start_device(mca_bcol_iboffload_device_t *device) -{ - int port_cnt, port, ret; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - -#if HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE - if (IBV_TRANSPORT_IB != device->dev.ib_dev->transport_type) { - IBOFFLOAD_VERBOSE(10, ("Skipping non IB device %s", - ibv_get_device_name(device->dev.ib_dev))); - goto error; - } -#endif - - /* Open device context */ - IBOFFLOAD_VERBOSE(10, ("Open IB device - %p", device->dev.ib_dev)); - - device->dev.ib_dev_context = ibv_open_device(device->dev.ib_dev); - if (NULL == device->dev.ib_dev_context) { - IBOFFLOAD_ERROR(("Error obtaining device context for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - goto error; - } - - if (ibv_query_device(device->dev.ib_dev_context, &device->ib_dev_attr)) { - IBOFFLOAD_ERROR(("error obtaining device attributes for %s errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - goto error; - } - - port_cnt = device->ib_dev_attr.phys_port_cnt; - if (0 == port_cnt) { - goto error; - } - - device->ports = (mca_bcol_iboffload_port_t *) - calloc(port_cnt, sizeof(mca_bcol_iboffload_port_t)); - if (NULL == device->ports) { - goto error; - } - - /* Note ports are 1 based (i >= 1) */ - for (port = 1; port <= port_cnt; port++) { - int pi = port - 1; /* port array index starts from zero */ - - struct ibv_port_attr ib_port_attr; - memset(&ib_port_attr, 0, sizeof(ib_port_attr)); - - if (ibv_query_port(device->dev.ib_dev_context, (uint8_t) port, &ib_port_attr)) { - IBOFFLOAD_ERROR(("Error getting port attributes for device %s " - "port number %d errno says %s", - ibv_get_device_name(device->dev.ib_dev), port, strerror(errno))); - continue; - } - - if (IBV_PORT_ACTIVE == ib_port_attr.state) { - /* Pasha: Need to think how we want to handle MTUs - if (ib_port_attr.active_mtu < mca_bcol_iboffload_component.mtu){ - device->mtu = ib_port_attr.active_mtu; - } - */ - /* start to put port info */ - ++device->num_act_ports; - device->ports[pi].id = port; - device->ports[pi].stat = ib_port_attr.state; - device->ports[pi].mtu = ib_port_attr.active_mtu; - - if (0 == cm->pkey_val) { - ret = iboffload_init_port(device, &device->ports[pi]); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Device %s " - "port number %d , failed to init port, errno says %s", - ibv_get_device_name(device->dev.ib_dev), - port, strerror(errno))); - continue; - } - } else { - uint16_t pkey, j; - for (j = 0; j < device->ib_dev_attr.max_pkeys; j++) { - if 
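[Editor's note: iboffload_start_device() (continuing below) operates on a struct ibv_device that was chosen earlier in component setup. For context, the standard enumerate/open/query sequence looks roughly like this -- a hedged sketch, error paths trimmed.]

    #include <infiniband/verbs.h>

    static struct ibv_context *open_first_hca(int *phys_ports)
    {
        int n = 0;
        struct ibv_context *ctx = NULL;
        struct ibv_device **list = ibv_get_device_list(&n);
        struct ibv_device_attr attr;

        if (NULL != list && n > 0) {
            ctx = ibv_open_device(list[0]);        /* open before freeing the list */
            if (NULL != ctx && 0 == ibv_query_device(ctx, &attr)) {
                *phys_ports = attr.phys_port_cnt;  /* ports are numbered from 1 */
            }
        }
        if (NULL != list) {
            ibv_free_device_list(list);
        }
        return ctx;
    }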
(ibv_query_pkey(device->dev.ib_dev_context, (uint8_t) port, j, &pkey)) { - IBOFFLOAD_ERROR(("error getting pkey for index %d, device %s " - "port number %d errno says %s", - j, ibv_get_device_name(device->dev.ib_dev), port, strerror(errno))); - continue; - } - - pkey = ntohs(pkey) & MCA_BCOL_IBOFFLOAD_PKEY_MASK; - if (pkey == cm->pkey_val) { - ret = iboffload_init_port(device, &device->ports[pi]); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Device %s " - "port number %d , failed to init port, errno says %s", - ibv_get_device_name(device->dev.ib_dev), - port, strerror(errno))); - continue; - } - } - } - } - } - } - - if (0 == device->num_act_ports) { - goto error; - } - - if (OMPI_SUCCESS != allocate_device_resources(device)) { - goto error; - } - - /* setup network context on device */ - device->net_context = OBJ_NEW(bcol_base_network_context_t); - - device->net_context->context_data = (void *) device; - - device->net_context->register_memory_fn = mca_bcol_iboffload_lmngr_register; - device->net_context->deregister_memory_fn = mca_bcol_iboffload_lmngr_deregister; - - /* the device is ready now */ - device->activated = true; - return OMPI_SUCCESS; - -error: - /* Pasha: need to add nice resource clean up */ - return OMPI_ERROR; -} -static void mca_bcol_iboffload_set_small_msg_thresholds(struct mca_bcol_base_module_t *super) -{ - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) super; - - /* Set the Bcast threshold, for IB it equals to ML buffer size */ - super->small_message_thresholds[BCOL_BCAST] = - iboffload_module->rdma_block.ml_mem_desc->size_buffer; - - if ((mca_bcol_iboffload_component.use_brucks_smsg_alltoall_rdma) - || (mca_bcol_iboffload_component.use_brucks_smsg_alltoall_sr)) { - /* Set the Alltoall threshold, for Bruck's algth we use 1.5 of the buff size */ - super->small_message_thresholds[BCOL_ALLTOALL] = - (iboffload_module->rdma_block.ml_mem_desc->size_buffer / 3) * 2; - } else { - /* Set the Alltoall threshold, for this case it equals to a half of the ML buffer size */ - super->small_message_thresholds[BCOL_ALLTOALL] = - iboffload_module->rdma_block.ml_mem_desc->size_buffer / 2; - } - - /* Set the Allreduce threshold, for IB it equals to ML buffer size */ - super->small_message_thresholds[BCOL_ALLREDUCE] = - iboffload_module->rdma_block.ml_mem_desc->size_buffer; - - /* Set the Allgather threshold, for IB it equals to ML buffer size */ - super->small_message_thresholds[BCOL_ALLGATHER] = - iboffload_module->rdma_block.ml_mem_desc->size_buffer / - ompi_comm_size(iboffload_module->super.sbgp_partner_module->group_comm); -} - -static int mca_bcol_iboffload_init_buffer_memory(struct mca_coll_ml_module_t *ml_module, - struct mca_bcol_base_module_t *bcol, - void *reg_data) -{ - mca_bcol_iboffload_module_t *iboffload_module = (mca_bcol_iboffload_module_t *) bcol; - mca_bcol_iboffload_local_rdma_block_t *rdma_block = &iboffload_module->rdma_block; - - struct mca_bcol_base_memory_block_desc_t *desc = ml_module->payload_block; - struct ibv_mr *mr = (struct ibv_mr *) desc->block->lmngr->reg_desc[bcol->context_index]; - int i; - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_init_buffer_memory was called")); - - /* Set rdma block data */ - rdma_block->ib_info.rkey = mr->rkey; - rdma_block->ib_info.lkey = mr->lkey; - - rdma_block->ib_info.addr = (uint64_t) (uintptr_t) desc->block->base_addr; - IBOFFLOAD_VERBOSE(10, ("Caching rkey %u lkey %u addr %p", - rdma_block->ib_info.rkey, - rdma_block->ib_info.lkey, - rdma_block->ib_info.addr)); - - /* cache ml mem 
desc tunings locally */
- rdma_block->bdesc.num_banks = desc->num_banks;
- rdma_block->bdesc.num_buffers_per_bank = desc->num_buffers_per_bank;
- rdma_block->bdesc.size_buffer = desc->size_buffer;
- rdma_block->bdesc.data_offset = ml_module->data_offset;
-
- IBOFFLOAD_VERBOSE(10, ("RDMA buffer configuration num banks %d num_per_bank %d size %d base addr %p",
- desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, mr->addr));
-
- /* pointer to ml level descriptor */
- rdma_block->ml_mem_desc = desc;
-
- rdma_block->sync_counter = 0; /* reset the counter */
- /* Allocate and set bank block counters */
- for (i = 0; i < MCA_BCOL_IBOFFLOAD_BK_LAST; i++) {
- rdma_block->bank_buffer_counter[i] = (int *) calloc(rdma_block->bdesc.num_banks,
- sizeof(int));
- if (NULL == rdma_block->bank_buffer_counter[i]) {
- IBOFFLOAD_VERBOSE(10, ("Failed to allocate bank_buffer_counter\n"));
- return OMPI_ERROR;
- }
- }
-
- if (OMPI_SUCCESS != init_rdma_buf_desc(&rdma_block->bdesc.rdma_desc,
- desc->block->base_addr,
- rdma_block->bdesc.num_banks,
- rdma_block->bdesc.num_buffers_per_bank,
- rdma_block->bdesc.size_buffer,
- ml_module->data_offset)) {
- IBOFFLOAD_VERBOSE(10, ("Failed to allocate rdma memory descriptor\n"));
- return OMPI_ERROR;
- }
-
- /* All the data is now cached at the module level. The
- real data exchange will happen during QP creation and
- data exchange */
-
- IBOFFLOAD_VERBOSE(10, ("ml_module = %p, iboffload_module = %p, ml_mem_desc = %p.\n",
- ml_module, iboffload_module, rdma_block->ml_mem_desc));
-
- for (i = 0; i < iboffload_module->num_endpoints; ++i) {
- mca_bcol_iboffload_endpoint_t *ep = iboffload_module->endpoints[i];
-
- if (true == ep->need_toset_remote_rdma_info) {
- IBOFFLOAD_VERBOSE(10, ("ep %p index %d: postponed remote rdma block init.", ep, ep->index));
- if (OPAL_UNLIKELY(OMPI_SUCCESS !=
- set_endpoint_remote_rdma_info(ep, ep->remote_rdma_info))) {
- return OMPI_ERROR;
- }
- }
- }
-
- /* Hack:
- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- Workaround for a deadlock caused by the connection setup
- for the async service barrier. The async service barrier uses its own set of
- MQs and QPs, _BUT_ the exchange operation uses the MQ that is used for the
- primary set of collective operations (Allgather, Barrier, etc.).
- As a result, the exchange wait operation could be pushed to the primary MQ and
- cause a deadlock.
- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- Create connection for service barrier and memory address exchange - for ml buffers and asyc service barrier - */ - /* This nasty hack was moved to ml discovery - rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - */ - - return OMPI_SUCCESS; -} - -static void load_func(mca_bcol_base_module_t *super) -{ - int fnc; - - /* Loading Memory managment functions */ - /* NULL means that mpool may decide about prefered memory allocate functions */ - /* super->memory_management_functions.malloc_fn = NULL;*/ - /* NULL means that mpool may decide about prefered memory release functions */ - /* super->memory_management_functions.free_fn = NULL; */ - - /* JSL: setting the bcol_memory_init function to NULL, not sure what ib needs to do with - * the ml_memory_block - */ - super->bcol_memory_init = NULL; - - - /* Loading collective functions */ - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; ++fnc) { - super->bcol_function_table[fnc] = NULL; - } - - super->bcol_function_init_table[BCOL_FANIN] = mca_bcol_iboffload_fanin_register; - super->bcol_function_init_table[BCOL_FANOUT] = mca_bcol_iboffload_fanout_register; - - super->bcol_function_init_table[BCOL_BARRIER] = mca_bcol_iboffload_barrier_register; - super->bcol_function_init_table[BCOL_BCAST] = mca_bcol_iboffload_bcast_register; - /*super->bcol_function_init_table[BCOL_ALLTOALL] = mca_bcol_iboffload_alltoall_register;*/ - super->bcol_function_init_table[BCOL_ALLGATHER] = mca_bcol_iboffload_allgather_register; - super->bcol_function_init_table[BCOL_SYNC] = mca_bcol_iboffload_memsync_register; - super->bcol_function_init_table[BCOL_ALLREDUCE] = mca_bcol_iboffload_allreduce_register; - - super->bcol_memory_init = mca_bcol_iboffload_init_buffer_memory; - - /* Set thresholds */ - super->set_small_msg_thresholds = mca_bcol_iboffload_set_small_msg_thresholds; - - super->k_nomial_tree = mca_bcol_iboffload_setup_knomial_tree; -} - -int mca_bcol_iboffload_setup_knomial_tree(mca_bcol_base_module_t *super) -{ - int rc; - mca_bcol_iboffload_module_t *ib_module = (mca_bcol_iboffload_module_t *) super; - rc = netpatterns_setup_recursive_knomial_allgather_tree_node( - ib_module->super.sbgp_partner_module->group_size, - ib_module->super.sbgp_partner_module->my_index, - mca_bcol_iboffload_component.k_nomial_radix, - super->list_n_connected, - &ib_module->knomial_allgather_tree); - - return rc; -} - -static inline struct ibv_cq *ibv_create_cq_compat(struct ibv_context *context, - int cqe, void *cq_context, struct ibv_comp_channel *channel, - int comp_vector) -{ -#if OPAL_IBV_CREATE_CQ_ARGS == 3 - return ibv_create_cq(context, cqe, channel); -#else - return ibv_create_cq(context, cqe, cq_context, channel, comp_vector); -#endif -} - -int mca_bcol_iboffload_adjust_cq(mca_bcol_iboffload_device_t *device, - struct ibv_cq **ib_cq) -{ - uint32_t cq_size = (uint32_t) mca_bcol_iboffload_component.cq_size; - - if (NULL == *ib_cq) { - *ib_cq = ibv_create_cq_compat(device->dev.ib_dev_context, cq_size, -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - device, device->ib_channel, -#else - NULL, NULL, -#endif - 0); - - if (NULL == *ib_cq) { - IBOFFLOAD_ERROR(("Device %s " - ", failed to create CQ, errno says %s", - ibv_get_device_name(device->dev.ib_dev), strerror(errno))); - - return OMPI_ERROR; - } - } - - return OMPI_SUCCESS; -} - -static int init_recv_wr_manager(mca_bcol_iboffload_recv_wr_manager *recv_wr_manager) -{ - - struct ibv_recv_wr *recv_wr = NULL; - int ret = OMPI_SUCCESS, qp, wr, num_qps; - - 
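[Editor's note: init_recv_wr_manager() (continuing below) pre-links one array of work requests per QP so that any tail of the array can be posted with a single ibv_post_recv() call. The chaining itself, isolated as a sketch:]

    #include <stddef.h>
    #include <infiniband/verbs.h>

    /* Link wr[0..n-1] into a singly linked chain; each entry carries one
     * SGE whose address is filled in at prepost time. */
    static void chain_recv_wrs(struct ibv_recv_wr *wr, int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            wr[i].next    = (i + 1 < n) ? &wr[i + 1] : NULL;
            wr[i].num_sge = 1;
            wr[i].sg_list = NULL;   /* set to the real SGE before posting */
        }
    }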
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - num_qps = cm->num_qps; - OPAL_THREAD_LOCK(&recv_wr_manager->lock); - - recv_wr_manager->recv_work_requests = - (struct ibv_recv_wr **) calloc(num_qps, sizeof(struct ibv_recv_wr *)); - if (NULL == recv_wr_manager->recv_work_requests) { - IBOFFLOAD_ERROR(("Failed to allocate memory for recv_wr_manager->recv_work_requests")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto error; - } - - for (qp = 0; qp < num_qps; ++qp) { - int recv_queue_size = cm->qp_infos[qp].rd_num; - - recv_wr_manager->recv_work_requests[qp] = - (struct ibv_recv_wr *) calloc(recv_queue_size, sizeof(struct ibv_recv_wr)); - if (NULL == recv_wr_manager->recv_work_requests[qp]) { - IBOFFLOAD_ERROR(("Failed to allocate memory for recv_wr_manager->recv_work_requests")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto error; - } - - for (wr = 0; wr < recv_queue_size - 1; ++wr) { - recv_wr = &recv_wr_manager->recv_work_requests[qp][wr]; - recv_wr->next = &recv_wr_manager->recv_work_requests[qp][wr + 1]; - /* init receive work request. - * Real sg_list value we fill during receive prepost flow. - * recv_wr->wr_id and recv_wr->sg_list is zero by default */ - recv_wr->wr_id = 0; - recv_wr->sg_list = NULL; - recv_wr->num_sge = 1; /* single sge will be filled later */ - } - - recv_wr->next->num_sge = 1; /* for the last entry everything is null except the num_sge */ - } - -error: - OPAL_THREAD_UNLOCK(&recv_wr_manager->lock); - return ret; -} - -/* On first access to the component - allocate all memory resources */ -static int component_first_usage(void) -{ - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - int ret = OMPI_SUCCESS; - - /* creating collfrag free list */ - OBJ_CONSTRUCT(&cm->collfrags_free, ompi_free_list_t); - ret = ompi_free_list_init_new(&cm->collfrags_free, - sizeof(mca_bcol_iboffload_collfrag_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_collfrag_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num, - cm->free_list_max, - cm->free_list_inc, - NULL); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Failed to allocate mwr_free %s:%d\n", __FILE__, __LINE__)); - return ret; - } - - /* allocate free list of collective message requests */ - OBJ_CONSTRUCT(&cm->collreqs_free, ompi_free_list_t); - ret = ompi_free_list_init_new(&cm->collreqs_free, - sizeof(mca_bcol_iboffload_collreq_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_collreq_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num * 2, - cm->free_list_max * 2, - cm->free_list_inc * 2, - NULL); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Error creating free list, error: %s\n", strerror(errno))); - goto release_collfrag; - } - - OBJ_CONSTRUCT(&cm->tasks_free, ompi_free_list_t); - ret = ompi_free_list_init_new(&cm->tasks_free, - sizeof(mca_bcol_iboffload_task_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_task_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num * 2, - cm->free_list_max * 2, - cm->free_list_inc * 2, - NULL); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Error creating free list, error: %s\n", strerror(errno))); - goto release_collreq; - } - - OBJ_CONSTRUCT(&cm->calc_tasks_free, ompi_free_list_t); - ret = ompi_free_list_init_ex_new(&cm->calc_tasks_free, - sizeof(mca_bcol_iboffload_task_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_task_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num * 2, - cm->free_list_max * 2, - cm->free_list_inc * 2, - NULL, - 
mca_bcol_iboffload_calc_task_init, - &cm->calc_tasks_free); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Error creating free list, error: %s\n", strerror(errno))); - goto release_collreq; - } - - /* Initialization for frags that handle ML allocated memory, - it is NO registration is required ! - */ - - OBJ_CONSTRUCT(&cm->ml_frags_free, ompi_free_list_t); - ret = ompi_free_list_init_ex_new(&cm->ml_frags_free, - sizeof(mca_bcol_iboffload_frag_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_frag_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num * 2, - cm->free_list_max * 2, - cm->free_list_inc * 2, - NULL, - mca_bcol_iboffload_ml_frag_init, - NULL); - if (OMPI_SUCCESS != ret) { - IBOFFLOAD_ERROR(("Error creating free list, error: %s\n", strerror(errno))); - goto release_collreq; - } - - ret = init_recv_wr_manager(&cm->recv_wrs); - if (OMPI_SUCCESS != ret){ - IBOFFLOAD_ERROR(("Failed to prepare recv wrs")); - goto release_tasks; - } - - cm->init_done = true; - - return OMPI_SUCCESS; - -release_tasks: - OBJ_DESTRUCT(&cm->tasks_free); -release_collreq: - OBJ_DESTRUCT(&cm->collreqs_free); -release_collfrag: - OBJ_DESTRUCT(&cm->collfrags_free); - return ret; -} - - -/* query to see if some modules are available for use on the given - * communicator, and if so, what it's priority is. - */ -mca_bcol_base_module_t ** -mca_bcol_iboffload_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules) -{ - /* local variables */ - int i, mq_index, rc, my_rank = 0; - struct mqe_context_attr mqe_attr; - - mca_sbgp_ibnet_module_t *ibnet = NULL; - mca_bcol_base_module_t **iboffload_modules = NULL; - mca_bcol_iboffload_module_t *iboffload_module = NULL; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - /* Bruck's alltoall iovec */ - size_t iovec_size; - - if (OPAL_UNLIKELY(false == cm->init_done)) { - if (OMPI_SUCCESS != component_first_usage()) { - return NULL; - } - } - - /* No group - no modules*/ - if (OPAL_UNLIKELY(NULL == sbgp)) { - return NULL; - } - /* - * This is activated only for intra-communicators - */ - if (OPAL_UNLIKELY(OMPI_COMM_IS_INTER(sbgp->group_comm))) { - return NULL; - } - - ibnet = (mca_sbgp_ibnet_module_t *) sbgp; - if (OPAL_UNLIKELY(0 == ibnet->num_cgroups)) { - /* we have no connection group */ - return NULL; - } - - my_rank = sbgp->my_index; - - iboffload_modules = (mca_bcol_base_module_t **) calloc - (ibnet->num_cgroups, sizeof(mca_bcol_base_module_t *)); - if (OPAL_UNLIKELY(NULL == iboffload_modules)) { - return NULL; - } - - /* Go through list of connection groups that we have on ibnet - * and create bcol module for each one */ - *num_modules = 0; - for (i = 0; i < ibnet->num_cgroups; i++) { - mca_sbgp_ibnet_connection_group_info_t *cgroup = - &ibnet->cgroups[i]; - - iboffload_module = OBJ_NEW(mca_bcol_iboffload_module_t); - - iboffload_modules[i] = &(iboffload_module->super); - - /* - * In fact the value == ibnet->num_cgroups in the end - * of the loop, but we need always to know how many modules - * release in the error case (under CLEANUP label) - */ - - (*num_modules)++; - - iboffload_module->cgroup_index = i; - iboffload_module->group_size = ibnet->super.group_size; - iboffload_module->log_group_size = lognum(iboffload_module->group_size); - /* Put pointer to sbgp module */ - iboffload_module->super.sbgp_partner_module = sbgp; - /* Put cgroup information on module */ - iboffload_module->ibnet = ibnet; - - iboffload_module->device = opal_pointer_array_get_item(&cm->devices, cgroup->device_index); - - IBOFFLOAD_VERBOSE(10, 
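[Editor's note: component_first_usage() above tears down partially built state with the classic C unwind ladder -- one label per constructed object, with failures jumping to the deepest label reached. The shape of that idiom, reduced to a generic sketch:]

    #include <stdlib.h>

    static int build_three(void **a, void **b, void **c)
    {
        if (NULL == (*a = malloc(16))) goto fail_a;
        if (NULL == (*b = malloc(16))) goto fail_b;
        if (NULL == (*c = malloc(16))) goto fail_c;
        return 0;          /* everything constructed */

    fail_c:
        free(*b);
    fail_b:
        free(*a);
    fail_a:
        return -1;         /* nothing leaked on any path */
    }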
("Iboffload module - %p uses " - "device - %p with index - %d.\n", - iboffload_module, - iboffload_module->device->dev.ib_dev, - cgroup->device_index)); - - OBJ_RETAIN(iboffload_module->device); - /* Pasha: Need to print NICE error in future */ - assert(NULL != iboffload_module->device); - iboffload_module->port = cgroup->port; - - IBOFFLOAD_VERBOSE(10, ("Iboffload module - %p on local port %d.\n", - iboffload_module, iboffload_module->port)); - - if (OPAL_UNLIKELY(!iboffload_module->device->activated)) { - /* this device was never used before, need to activate it */ - if (OMPI_SUCCESS != iboffload_start_device(iboffload_module->device)) { - OBJ_RELEASE(iboffload_module->device); - goto CLEANUP; - } - } - /* Set pointer to network contest on bcol base, we need it for ml - memory managment */ - OBJ_RETAIN(iboffload_module->device->net_context); - iboffload_module->super.network_context = iboffload_module->device->net_context; - - iboffload_module->subnet_id = iboffload_module->device->ports[iboffload_module->port - 1].subnet_id; - iboffload_module->lid = iboffload_module->device->ports[iboffload_module->port - 1].lid; - - load_func(&iboffload_module->super); - - IBOFFLOAD_VERBOSE(10, ("Call for create endpoints for iboffload module %p," - " cgroup num (index) %d.\n", iboffload_module, i)); - - /* create endpoints and store its in the endpoints pointer of iboffload_module structer */ - if (OMPI_SUCCESS != - mca_bcol_iboffloads_create_endpoints(cgroup, iboffload_module)) { - goto CLEANUP; - } - - memset(&mqe_attr, 0, sizeof(mqe_attr)); - mqe_attr.max_mqe_tasks = (uint32_t)mca_bcol_iboffload_component.max_mqe_tasks; - mqe_attr.max_mq_size = (uint32_t)mca_bcol_iboffload_component.max_mq_size; - mqe_attr.cq = iboffload_module->device->ib_mq_cq; - - /* ALL MQs have the same configuration */ - for (mq_index = 0; mq_index < BCOL_IBOFFLOAD_MQ_NUM; mq_index++) { - iboffload_module->mq[mq_index] = - mqe_context_create(iboffload_module->device->dev.ib_dev_context, - iboffload_module->device->ib_pd, &mqe_attr); - if (OPAL_UNLIKELY(NULL == iboffload_module->mq[mq_index])) { - IBOFFLOAD_ERROR(("Error creating MQ for device (%s), error: %s\n", - ibv_get_device_name(iboffload_module->device->dev.ib_dev), strerror(errno))); - goto CLEANUP; - } - } - - /* Barrier initialization - recuresive doubling */ -#if 1 - if (OMPI_SUCCESS != - netpatterns_setup_recursive_doubling_tree_node( - iboffload_module->group_size, my_rank, - &iboffload_module->recursive_doubling_tree)) { - IBOFFLOAD_ERROR(("Failed to setup recursive doubling tree," - " error: %s\n", strerror(errno))); - goto CLEANUP; - } -#endif - - /* Barrier initialization - N exchange tree */ - if (OMPI_SUCCESS != - netpatterns_setup_recursive_doubling_n_tree_node( - iboffload_module->group_size, my_rank, - cm->exchange_tree_order, - &iboffload_module->n_exchange_tree)) { - IBOFFLOAD_ERROR(("Failed to setup recursive doubling tree," - " error: %s\n", strerror(errno))); - goto CLEANUP; - } - - - /* Recursive K-ing initialization - Knomial exchange tree */ - if (OMPI_SUCCESS != - netpatterns_setup_recursive_knomial_tree_node( - iboffload_module->group_size, my_rank, - cm->knomial_tree_order, - &iboffload_module->knomial_exchange_tree)) { - IBOFFLOAD_ERROR(("Failed to setup recursive Knomial tree," - " error: %s\n", strerror(errno))); - goto CLEANUP; - } - - /* Manju Brucks alltoall temp iovec list */ - iovec_size = iboffload_module->group_size / 2 + iboffload_module->group_size % 2; - iboffload_module->alltoall_iovec = (struct iovec *) malloc(sizeof(struct 
iovec) - * iovec_size); - iboffload_module->alltoall_recv_iovec = (struct iovec *) malloc(sizeof(struct iovec) - * iovec_size); - - - iboffload_module->k_alltoall_bruck_radix=cm->k_alltoall_bruck_radix; - iboffload_module->tmp_buf_alignment=cm->tmp_buf_alignment; - -#if 1 /* Disabling this code since it brakes all iboffload functionality */ - /* Sorry Pasha, gotta do this. Recursive K-ing allgather initialization - Knomial exchange tree */ - /*Pretty sure I need to pass in the communicator rank */ - /* I need to reindex this mess */ - /* this looks silly, I know but it allows for minimal changes to existing code */ - iboffload_module->comm_to_ibnet_map = sbgp->group_list; - - -#endif -#if 0 - if ( NULL == iboffload_module->comm_to_ibnet_map ) { - IBOFFLOAD_ERROR(("Out of resources\n")); - goto CLEANUP; - } - for( i = 0; i < iboffload_module->group_size; i++) { - int j = 0; - while( sbgp->group_list[j] != i){ - j++; - } - iboffload_module->comm_to_ibnet_map[i] = j; - } - /* that should take care of that */ - if (OMPI_SUCCESS != - netpatterns_setup_recursive_knomial_allgather_tree_node( - iboffload_module->group_size, sbgp->group_list[my_rank], - cm->k_nomial_radix, iboffload_module->super.list_n_connected, - &iboffload_module->knomial_allgather_tree)) { - IBOFFLOAD_ERROR(("Failed to setup recursive Knomial tree," - " error: %s\n", strerror(errno))); - goto CLEANUP; - } -#endif - - iboffload_module->power_of_2 = - mca_bcol_iboffload_fls(iboffload_module->num_endpoints); - iboffload_module->power_of_2_ranks = - (1 << iboffload_module->power_of_2); - - /* header into ml buffer, we don't support header for anyone other than shared memory - * at the moment - */ - iboffload_module->super.header_size = 0; - - iboffload_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY | - MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG | - MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER; - - rc = mca_bcol_base_bcol_fns_table_init(&(iboffload_module->super)); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - goto CLEANUP; - } - - OBJ_CONSTRUCT(&iboffload_module->iovec_tasks_free, ompi_free_list_t); - rc = ompi_free_list_init_ex_new(&iboffload_module->iovec_tasks_free, - sizeof(mca_bcol_iboffload_task_t), - MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_task_t), - 0, MCA_IBOFFLOAD_CACHE_LINE_SIZE, - cm->free_list_num * 2, - cm->free_list_max * 2, - cm->free_list_inc * 2, - NULL, - mca_bcol_iboffload_iovec_task_init, - iboffload_module); - if (OMPI_SUCCESS != rc) { - IBOFFLOAD_ERROR(("Error creating free list, error: %s\n", strerror(errno))); - goto CLEANUP; - } - } - - IBOFFLOAD_VERBOSE(10, ("Finished with success, num of cgroups is %d, num of modules is %d.\n", - ibnet->num_cgroups, *num_modules)); - - return iboffload_modules; - -CLEANUP: - for (i = 0; i < *num_modules; i++) { - if (NULL != iboffload_modules[i]) { - OBJ_RELEASE(iboffload_modules[i]); - } - } - free(iboffload_modules); - return NULL; -} - -static int init_rdma_buf_desc(mca_bcol_iboffload_rdma_buffer_desc_t **desc, void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size) -{ - uint32_t i, j, ci; - mca_bcol_iboffload_rdma_buffer_desc_t *tmp_desc; - - IBOFFLOAD_VERBOSE(10, ("init_rdma_buf_desc base addr %p, num_n %d , " - "num_per_bank %d, size %d, header size %d", - base_addr, num_banks, num_buffers_per_bank, - size_buffer, header_size)); - *desc = (mca_bcol_iboffload_rdma_buffer_desc_t *) - calloc(num_banks * num_buffers_per_bank, - sizeof(mca_bcol_iboffload_rdma_buffer_desc_t)); - if 
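[Editor's note: the power_of_2 fields above come from mca_bcol_iboffload_fls(), whose definition lives elsewhere in this tree; below is an illustrative floor-log2 equivalent. The original's exact rounding may differ -- this version is an assumption.]

    /* floor(log2(n)) for n > 0, e.g. 12 endpoints -> 3, so 1 << 3 = 8 ranks. */
    static int floor_log2(unsigned int n)
    {
        int p = 0;
        while (n >>= 1) {
            ++p;
        }
        return p;
    }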
(OPAL_UNLIKELY(NULL == *desc)) {
- IBOFFLOAD_ERROR(("Failed to allocate memory"));
- return OMPI_ERROR;
- }
-
- tmp_desc = *desc;
-
- for (i = 0; i < num_banks; i++) {
- for (j = 0; j < num_buffers_per_bank; j++) {
- ci = i * num_buffers_per_bank + j;
- tmp_desc[ci].generation_number = 0;
- tmp_desc[ci].bank_index = i;
- tmp_desc[ci].buffer_index = j;
- /*
- * iboffload doesn't have any header, but other bcols may. So
- * we need to take it into account.
- */
- tmp_desc[ci].data_addr = (void *)
- ((unsigned char *) base_addr + ci * size_buffer + header_size);
- IBOFFLOAD_VERBOSE(10, ("RDMA setup %d %d - %p", i, j, tmp_desc[ci].data_addr));
- }
- }
-
- return OMPI_SUCCESS;
-}
-
-static int set_endpoint_remote_rdma_info(mca_bcol_iboffload_endpoint_t *ep, mca_bcol_iboffload_rdma_info_t *remote_rdma_info)
-{
- mca_bcol_iboffload_rem_rdma_block_t *rem_block = &ep->remote_rdma_block;
-
- /* We'll continue only if:
-
- 1. The module rdma_block has already been initialized at this stage.
- 2. All peers have the same rdma block configuration, which is actually
- defined at the ML level.
-
- Otherwise, set a flag so it is initialized later.
- */
- if (NULL == ep->iboffload_module->rdma_block.ml_mem_desc) {
- IBOFFLOAD_VERBOSE(10, ("RDMA block information hasn't been initialized yet."));
- ep->need_toset_remote_rdma_info = true;
- return OMPI_SUCCESS;
- }
-
- /* set the rdma addr for barrier */
- ep->remote_zero_rdma_addr = remote_rdma_info[0];
-
- IBOFFLOAD_VERBOSE(10, ("RDMA block information %p %d",
- remote_rdma_info[0].addr, remote_rdma_info[0].rkey));
-
- /* set the rdma block memory structs */
- rem_block->ib_info = remote_rdma_info[1];
-
-
- /* if we got some real data, let's init the memory address structures */
- if (0 != rem_block->ib_info.addr) {
- if (OMPI_SUCCESS != init_rdma_buf_desc(&rem_block->rdma_desc, (void *)rem_block->ib_info.addr,
- ep->iboffload_module->rdma_block.bdesc.num_banks,
- ep->iboffload_module->rdma_block.bdesc.num_buffers_per_bank,
- ep->iboffload_module->rdma_block.bdesc.size_buffer,
- /* remember, we use lkey to pass the data offset value */
- rem_block->ib_info.lkey)) {
- IBOFFLOAD_VERBOSE(10, ("Failed to allocate RDMA buffer descriptor"));
- return OMPI_ERROR;
- }
- }
-
- IBOFFLOAD_VERBOSE(10, ("endpoint - %p, recv barrier rdma: rem addr - %p, rem rkey - %d.\n",
- ep, ep->remote_zero_rdma_addr.addr, ep->remote_zero_rdma_addr.rkey));
- IBOFFLOAD_VERBOSE(10, ("endpoint - %p, recv ml rdma: rem addr - %p, rem rkey - %d.\n",
- ep, ep->remote_rdma_block.ib_info.addr, ep->remote_rdma_block.ib_info.rkey));
-
- return OMPI_SUCCESS;
-}
-
-static int unpack_endpoint_rdma_addr(void *callback_data)
-{
- int rc;
- struct iovec payload_iovec;
-
- size_t max_size = 0;
- uint32_t out_size = 1;
-
- mca_bcol_iboffload_collfrag_t *coll_frag = (mca_bcol_iboffload_collfrag_t *) callback_data;
- mca_bcol_iboffload_collreq_t* collreq = coll_frag->coll_full_req;
-
- mca_bcol_iboffload_task_t *wait_task = (mca_bcol_iboffload_task_t *) coll_frag->signal_task_wr_id;
-
- mca_bcol_iboffload_frag_t *recv_frag = wait_task->frag;
- mca_bcol_iboffload_endpoint_t *ep = wait_task->endpoint;
-
- rc = opal_convertor_copy_and_prepare_for_recv(
- ompi_mpi_local_convertor,
- &opal_datatype_uint1,
- sizeof(mca_bcol_iboffload_rdma_info_t) * MAX_REMOTE_RDMA_INFO,
- ep->remote_rdma_info, 0,
- &collreq->recv_convertor);
- if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
- return OMPI_ERROR;
- }
-
- payload_iovec.iov_base = (void*) (uintptr_t)
- recv_frag->sg_entry.addr;
-
- payload_iovec.iov_len = sizeof(mca_bcol_iboffload_rdma_info_t) * 
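[Editor's note: the descriptor table built by init_rdma_buf_desc() above is plain row-major addressing over (bank, buffer) pairs; the equivalent lookup as a standalone sketch, field names borrowed from the struct:]

    #include <stdint.h>
    #include <stddef.h>

    /* Buffer (bank i, slot j) sits at a fixed offset from the block base;
     * header_size skips space that other bcols may reserve before the data. */
    static inline void *rdma_buf_addr(void *base, uint32_t i, uint32_t j,
                                      uint32_t bufs_per_bank,
                                      uint32_t size_buffer, uint32_t header_size)
    {
        uint32_t ci = i * bufs_per_bank + j;   /* flat index */
        return (unsigned char *) base + (size_t) ci * size_buffer + header_size;
    }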
MAX_REMOTE_RDMA_INFO; - - if (0 > opal_convertor_unpack(&collreq->recv_convertor, - &payload_iovec, &out_size, &max_size)) { - return OMPI_ERROR; - } - - if (OMPI_SUCCESS != set_endpoint_remote_rdma_info(ep, ep->remote_rdma_info)) { - return OMPI_ERROR; - } - - opal_convertor_cleanup(&collreq->send_convertor); - opal_convertor_cleanup(&collreq->recv_convertor); - - return OMPI_SUCCESS; -} - -/* RDMA addr exchange with rem proc */ -int mca_bcol_iboffload_exchange_rem_addr(mca_bcol_iboffload_endpoint_t *ep) -{ - int rc; - /* the [0] used for constant barrier rdma operations - the [1] used for rdma block inforation exchange. The rdma - block is used for RDMA operation over ML allocated memory */ - mca_bcol_iboffload_rdma_info_t remote_rdma_addr[MAX_REMOTE_RDMA_INFO]; - - mca_bcol_iboffload_task_t *send_task, - *wait_task; - - mca_bcol_iboffload_frag_t *send_fragment, - *preposted_recv_frag; - - ompi_free_list_item_t *item; - - mca_bcol_iboffload_collreq_t *coll_request; - mca_bcol_iboffload_collfrag_t *coll_fragment; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item); - if (NULL == item) { - IBOFFLOAD_ERROR(("Failing for coll request free list waiting.\n")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - coll_request = (mca_bcol_iboffload_collreq_t *) item; - - coll_request->completion_cb_fn = unpack_endpoint_rdma_addr; - /* For the exchange the progress_fn should be never used */ - coll_request->progress_fn = NULL; - coll_request->module = ep->iboffload_module; - coll_request->ml_buffer_index = MCA_COLL_ML_NO_BUFFER; - coll_request->buffer_info[SBUF].offset = 0; - coll_request->buffer_info[RBUF].offset = 0; - coll_request->qp_index = MCA_BCOL_IBOFFLOAD_QP_REGULAR; - /* - * setup collective work request - */ - - /* get collective frag */ - coll_fragment = &coll_request->first_collfrag; - mca_bcol_iboffload_collfrag_init(coll_fragment); - - coll_fragment->mq_credits = 2; - coll_fragment->mq_index = COLL_MQ; - coll_fragment->tail_next = &coll_fragment->to_post; - /* overwrite mq index to run over service setup */ - - /* Update the algorithm type in order to support credit mechanism */ - coll_fragment->alg = REMOTE_EXCHANGE_ALG; - if (OPAL_UNLIKELY(false == - BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(ep->iboffload_module, - coll_fragment->mq_index, 2))) { - IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); - - goto out_of_resources; - } - - /* set pointers for (coll frag) <-> (coll full request) */ - MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(coll_request, coll_fragment); - - remote_rdma_addr[0].addr = - ep->iboffload_module->device->dummy_frags[MCA_BCOL_IBOFFLOAD_QP_BARRIER].sg_entry.addr; - remote_rdma_addr[0].rkey = - ep->iboffload_module->device->dummy_frags[MCA_BCOL_IBOFFLOAD_QP_BARRIER].registration->mr->rkey; - - if (NULL != ep->iboffload_module->rdma_block.ml_mem_desc) { - remote_rdma_addr[1].addr = ep->iboffload_module->rdma_block.ib_info.addr; - remote_rdma_addr[1].rkey = ep->iboffload_module->rdma_block.ib_info.rkey; - /* Little bit ugly, but easy solution. 
The data_offset */ - remote_rdma_addr[1].lkey = ep->iboffload_module->rdma_block.bdesc.data_offset; - } else { - /* since it is no data lets send 0, so remote side will knox that no real - data was send */ - remote_rdma_addr[1].addr = 0; - remote_rdma_addr[1].rkey = 0; - remote_rdma_addr[1].lkey = 0; - } - - IBOFFLOAD_VERBOSE(10, ("endpoint - %p, sending barrier rdma: addr - %p, rkey - %d.\n", - ep, remote_rdma_addr[0].addr, remote_rdma_addr[0].rkey)); - IBOFFLOAD_VERBOSE(10, ("endpoint - %p, sending ml rdma: addr - %p, rkey - %d.\n", - ep, remote_rdma_addr[1].addr, remote_rdma_addr[1].rkey)); - - rc = opal_convertor_copy_and_prepare_for_send( - ompi_mpi_local_convertor, - &opal_datatype_uint1, - sizeof(mca_bcol_iboffload_rdma_info_t) * MAX_REMOTE_RDMA_INFO, - &remote_rdma_addr, 0, - &coll_request->send_convertor); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - goto out_of_resources; - } - - send_fragment = mca_bcol_iboffload_get_send_frag( - coll_request, ep->index, coll_request->qp_index, - sizeof(mca_bcol_iboffload_rdma_info_t) * MAX_REMOTE_RDMA_INFO, - 0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT); - if (OPAL_UNLIKELY(NULL == send_fragment)) { - IBOFFLOAD_ERROR(("Failing for getting and packing send frag.\n")); - goto out_of_resources; - } - - send_task = mca_bcol_iboffload_get_send_task(ep->iboffload_module, - ep->index, coll_request->qp_index, send_fragment, - coll_fragment, INLINE); - if (OPAL_UNLIKELY(NULL == send_task)) { - IBOFFLOAD_ERROR(("Failing for getting send task.\n")); - goto out_of_resources; - } - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, send_task); - - /* post wait */ - preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag( - ep->iboffload_module, ep->index, coll_request->qp_index); - if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { - IBOFFLOAD_ERROR(("Exchaging: " - "Failing for getting prepost recv frag.\n")); - goto out_of_resources; - } - - wait_task = mca_bcol_iboffload_get_wait_task(ep->iboffload_module, - ep->index, 1, preposted_recv_frag, coll_request->qp_index, NULL); - - if (OPAL_UNLIKELY(NULL == wait_task)) { - IBOFFLOAD_VERBOSE(10, ("Exchanging: " - "Failing for getting wait task.\n")); - goto out_of_resources; - } - - MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); - MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, wait_task); - - /* The last element must end with ZERO */ - wait_task->element.next = NULL; - - /* number of sends that need to be completed asynchronously */ - coll_fragment->n_sends = 1; - SENDWR(send_task)->send_flags |= IBV_SEND_SIGNALED; - - /* finish initializing full message descriptor */ - coll_request->n_fragments = 1; - coll_request->n_frags_sent = 1; - - coll_request->n_frag_mpi_complete = 0; - coll_request->n_frag_net_complete = 0; - coll_request->user_handle_freed = false; - - wait_task->element.flags |= MQE_WR_FLAG_SIGNAL; - coll_fragment->signal_task_wr_id = - (uint64_t) (uintptr_t) wait_task->element.wr_id; - - wait_task->element.wr_id = (uint64_t) (uintptr_t) coll_fragment; - - /* post the mwr */ - rc = mca_bcol_iboffload_post_mqe_tasks(coll_request->module, coll_fragment->to_post); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); - /* Note: need to clean up */ - return rc; - } - - coll_request->user_handle_freed = true; - /* complete the exchange - progress releases full request descriptors */ - while 
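[Editor's note: the exchange above ships two fixed {addr, rkey, lkey} records, reusing the second record's lkey field to carry data_offset, with all-zero fields meaning "no ML block yet". A sketch of that convention; the struct name is hypothetical, mirroring mca_bcol_iboffload_rdma_info_t.]

    #include <stdint.h>
    #include <string.h>

    typedef struct xchg_rdma_rec {
        uint64_t addr;   /* 0 => no real data behind this record */
        uint32_t rkey;
        uint32_t lkey;   /* record [1] only: smuggles the data offset */
    } xchg_rdma_rec_t;

    static void fill_ml_record(xchg_rdma_rec_t *rec, uint64_t addr,
                               uint32_t rkey, uint32_t data_offset)
    {
        if (0 != addr) {
            rec->addr = addr;
            rec->rkey = rkey;
            rec->lkey = data_offset;       /* deliberately overloaded, as above */
        } else {
            memset(rec, 0, sizeof(*rec));  /* tells the peer nothing was sent */
        }
    }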
(!BCOL_IS_COMPLETED(coll_request)) { - mca_bcol_iboffload_component_progress(); - } - - IBOFFLOAD_VERBOSE(10, ("RDMA addr exchange with comm rank: %d was finished.\n", - ep->iboffload_module->ibnet->super.group_list[ep->index])); - - return OMPI_SUCCESS; - -out_of_resources: - /* Release all resources */ - IBOFFLOAD_VERBOSE(10, ("RDMA addr exchange, adding collfrag to collfrag_pending.\n")); - return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, ep->iboffload_module); -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.c b/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.c deleted file mode 100644 index bfc5e4fbbf..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include - -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_device.h" -#include "bcol_iboffload_qp_info.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_endpoint.h" - -static int mca_bcol_iboffload_dummy_frag_qp_prepost( - mca_bcol_iboffload_endpoint_t *endpoint, - int qp_index, int num_to_prepost) -{ - struct ibv_recv_wr *recv_wr, *recv_bad; - int ret, num_preposted = 0, start_wr_index; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; - - IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d", - (void *) endpoint, num_to_prepost)); - - if (OPAL_UNLIKELY(0 == num_to_prepost)) { - IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate")); - return OMPI_SUCCESS; - } - - /* make sure that we do not overrun number of rd_wqe */ - if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) { - IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d", - num_to_prepost, endpoint->qps[qp_index].rd_wqe)); - - num_to_prepost = endpoint->qps[qp_index].rd_wqe; - } - - OPAL_THREAD_LOCK(&recv_wrs->lock); - - /* calculate start index in array - * of pre-allocated work requests */ - start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost; - recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, " - "start index of WRs - %d, rd_wqe - %d", - (void *) endpoint, qp_index, num_to_prepost, - start_wr_index, endpoint->qps[qp_index].rd_wqe)); - - while (num_preposted < num_to_prepost) { - /* prepost the special barrier frag to recv queue */ - struct ibv_sge *dummy_sg_entry = - &endpoint->iboffload_module->device->dummy_frags[qp_index].sg_entry; - - recv_wr[num_preposted].sg_list = dummy_sg_entry; - ++num_preposted; - } - - if (OPAL_LIKELY(num_preposted > 0)) { - /* Set the tail */ - recv_wr[num_preposted - 1].next = NULL; - - /* post the list of recvs */ - ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); - if (OPAL_UNLIKELY(0 != ret)) { - IBOFFLOAD_ERROR(("ibv_post_recv failed, error: %s [%d], " - "qp_index - %d.\n", strerror(errno), ret, qp_index)); - - return OMPI_ERROR; - } - - /* recover last recv_wr if needed */ - if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) { - recv_wr[num_preposted - 1].next = 
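[Editor's note: both prepost paths in this file share the same window arithmetic -- clamp the request to the currently free receive WQEs, then enter the pre-linked WR array at an index chosen so its existing ->next chain runs out exactly at the array's tail. A sketch with made-up numbers: rd_num = 256 and rd_wqe = 40 turn a request for 64 into 40 posts starting at index 216.]

    static int clamp_and_start(int rd_num, int rd_wqe, int *num_to_prepost)
    {
        if (*num_to_prepost > rd_wqe) {
            *num_to_prepost = rd_wqe;    /* never overrun the free WQEs */
        }
        return rd_num - *num_to_prepost; /* start_wr_index into the chain */
    }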
&recv_wr[num_preposted]; - } - - /* decresing numbers of free recv wqe */ - endpoint->qps[qp_index].rd_wqe -= num_preposted; - } - - OPAL_THREAD_UNLOCK(&recv_wrs->lock); - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d, qp_index - %d", - (void *) endpoint, num_to_prepost, num_preposted, qp_index)); - - return OMPI_SUCCESS; -} - -/* - * Receive prepost: - * return values: - * 0 - no prepost was done - * -1 - fatal error during prepost - * other value - number preposted elements - */ -static int mca_bcol_iboffload_frag_reg_qp_prepost( - mca_bcol_iboffload_endpoint_t *endpoint, - int qp_index, int num_to_prepost) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_frag_t *frag; - - struct ibv_recv_wr *recv_wr, *recv_bad; - int i, ret, num_preposted = 0, start_wr_index; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device; - - opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags); - mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs; - - IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d", - (void *) endpoint, num_to_prepost)); - - if (OPAL_UNLIKELY(0 == num_to_prepost)) { - IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate")); - return OMPI_SUCCESS; - } - - /* make sure that we do not overrun number of rd_wqe */ - if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) { - IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d", - num_to_prepost, endpoint->qps[qp_index].rd_wqe)); - - num_to_prepost = endpoint->qps[qp_index].rd_wqe; - } - - OPAL_THREAD_LOCK(&recv_wrs->lock); - - /* calculate start index in array - * of pre-allocated work requests */ - start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost; - recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index]; - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, " - "start index of WRs - %d, rd_wqe - %d", - (void *) endpoint, qp_index, num_to_prepost, - start_wr_index, endpoint->qps[qp_index].rd_wqe)); - - while (num_preposted < num_to_prepost) { - /* put the item on list of preposted */ - OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item); - if (OPAL_UNLIKELY(NULL == item)) { - break; - } - - frag = (mca_bcol_iboffload_frag_t *) item; - opal_list_append(preposted, (opal_list_item_t *) item); - - recv_wr[num_preposted].sg_list = &frag->sg_entry; - /* TODO Pasha - fix it later */ /* Vasily: Is it right place to take a size value ???? 
*/ - frag->sg_entry.length = cm->qp_infos[qp_index].size; - ++num_preposted; - } - - if (OPAL_LIKELY(num_preposted > 0)) { - /* Set the tail */ - recv_wr[num_preposted - 1].next = NULL; - - /* post the list of recvs */ - ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad); - if (OPAL_UNLIKELY(0 != ret)) { - IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], " - "qp_index - %d.\n", - ibv_get_device_name(device->dev.ib_dev), - strerror(errno), ret, qp_index)); - - /* Return allocated frags */ - for (i = 0; i < num_preposted; i++) { - OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], - (ompi_free_list_item_t *) - opal_list_remove_last(preposted)); - } - - return OMPI_ERROR; - } - - /* recover last recv_wr if needed */ - if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) { - recv_wr[num_preposted - 1].next = &recv_wr[num_preposted]; - } - - /* decresing numbers of free recv wqe */ - endpoint->qps[qp_index].rd_wqe -= num_preposted; - } - - OPAL_THREAD_UNLOCK(&recv_wrs->lock); - - IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d", - (void *) endpoint, num_to_prepost, num_preposted)); - - return OMPI_SUCCESS; -} - - -static void mca_bcol_iboffload_fillin_qp_attr(int qp_index, - mca_bcol_iboffload_endpoint_t *ep, - ompi_common_ofacm_base_qp_config_t *qp_config) -{ - uint32_t max_sge, *init_attr_mask = - &qp_config->init_attr_mask[qp_index]; - - struct ibv_qp_attr *attr = &qp_config->attr[qp_index]; - struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index]; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - /* Set special init attributes mask */ - *init_attr_mask = IBV_M_QP_EXT_CLASS_1 | - IBV_M_QP_EXT_CLASS_2 | - IBV_M_QP_EXT_IGNORE_RQ_OVERFLOW; - - /* Set init attributes */ - init_attr->qp_type = IBV_QPT_RC; - -/* Vasily: ?????? - init_attr->cap.max_inline_data = - max_inline_size(qp, iboffload_module->device); -*/ - /* Pasha: we can not leave max_inline empty ! - Todo: copy max_inline_size() from ofacm to - common area. - */ - init_attr->cap.max_inline_data = (int32_t) cm->max_inline_data; - - /* We allocate SG list for some algorithms (Bruck's alltoall) */ - max_sge = ep->iboffload_module->group_size / 2 + - ep->iboffload_module->group_size % 2; - - /* max send sge should be less than device maximums */ - if (max_sge > (uint32_t) - ep->iboffload_module->device->ib_dev_attr.max_sge) { - max_sge = (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge; - } - - init_attr->cap.max_send_sge = max_sge; - init_attr->cap.max_recv_sge = max_sge; -/* Vasily: the value will be changed later */ -/* TODO Pasha: this is real crap */ - init_attr->cap.max_recv_wr = (uint32_t) cm->cq_size; - init_attr->cap.max_send_wr = (uint32_t) cm->cq_size; - - /* Set attributes */ - - /* attr->pkey_index = 0; */ /* Vasily: ????? */ - - attr->port_num = ep->iboffload_module->port; -/* Vasily: the value will be changed later */ - attr->path_mtu = (uint32_t)cm->mtu; - - attr->max_dest_rd_atomic = cm->max_rdma_dst_ops; - attr->min_rnr_timer = (uint32_t)cm->min_rnr_timer; - - attr->ah_attr.is_global = 0; - attr->ah_attr.sl = (uint32_t)cm->service_level; -/* Vasily: from struct mca_bcol_iboffload_port_t ????? 
*/ -/* - attr->ah_attr.src_path_bits = iboffload_module->src_path_bits; -*/ - attr->ah_attr.port_num = ep->iboffload_module->port; - /* JMS to be filled in later dynamically */ - attr->ah_attr.static_rate = 0; - /* RTS params */ - attr->timeout = (uint32_t)cm->timeout; - attr->retry_cnt = (uint32_t)cm->retry_count; - attr->rnr_retry = (uint32_t)cm->rnr_retry; - attr->max_rd_atomic = (uint32_t)cm->max_rdma_dst_ops; - - /* Init for local mca_bcol_iboffload_endpoint_qp_t qps structure - * that caches the qp information on endpoint */ - OBJ_CONSTRUCT(&ep->qps[qp_index].preposted_frags, opal_list_t); - - /* Pasha: Need to add function that will */ - ep->qps[qp_index].ib_inline_max = cm->max_inline_data; - /* TODO Pasha - this is crap too... we do not have info for sevice qps. Fix it later */ - - ep->qps[qp_index].sd_wqe = cm->qp_infos[qp_index].rd_num; - ep->qps[qp_index].rd_wqe = cm->qp_infos[qp_index].rd_num; - - IBOFFLOAD_VERBOSE(10, ("ep - %p, qp index - %d, num of rd_wqe - %d.", - ep, qp_index, ep->qps[qp_index].rd_wqe)); -} - -static int mca_bcol_iboffload_alloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device) -{ - int length; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - ompi_free_list_t *frags_free = &device->frags_free[qp_index]; - - OBJ_CONSTRUCT(frags_free, ompi_free_list_t); - length = cm->qp_infos[qp_index].size; - - IBOFFLOAD_VERBOSE(10, ("free list len %d\n", length)); - if (OMPI_SUCCESS != ompi_free_list_init_ex_new(frags_free, - sizeof(mca_bcol_iboffload_frag_t), MCA_IBOFFLOAD_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_iboffload_frag_t), - length, cm->buffer_alignment, - cm->free_list_num, - cm->free_list_max, - cm->free_list_inc, - device->mpool, - mca_bcol_iboffload_frag_init, - (void *) &cm->qp_infos[qp_index].qp_index)) { - IBOFFLOAD_ERROR(("Failed to allocate frags_free")); - return OMPI_ERROR; - } - - return OMPI_SUCCESS; -} -static int mca_bcol_iboffload_dealloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device) -{ - OBJ_DESTRUCT(&device->frags_free[qp_index]); - - return OMPI_SUCCESS; -} - -static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_get_dummy_frag( - mca_bcol_iboffload_endpoint_t *ep, int qp_index) -{ - return &ep->iboffload_module->device->dummy_frags[qp_index]; -} - -static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_endpoint_get_preposted_frag( - mca_bcol_iboffload_endpoint_t *ep, int qp_index) -{ - return (mca_bcol_iboffload_frag_t *) - opal_list_remove_first(&ep->qps[qp_index].preposted_frags); -} - -static void mca_bcol_iboffload_regular_qp_attr(int qp_index, - mca_bcol_iboffload_endpoint_t *ep, - ompi_common_ofacm_base_qp_config_t *qp_config) -{ - struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index]; - - mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config); - - init_attr->send_cq = ep->iboffload_module->device->ib_cq; - init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SMALL_MESSAGES]; -} - -static void mca_bcol_iboffload_large_buff_qp_attr(int qp_index, - mca_bcol_iboffload_endpoint_t *ep, - ompi_common_ofacm_base_qp_config_t *qp_config) -{ - struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index]; - - mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config); - - init_attr->send_cq = ep->iboffload_module->device->ib_cq; - init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_LARGE_MESSAGES]; -} - -static void mca_bcol_iboffload_sync_qp_attr(int qp_index, - mca_bcol_iboffload_endpoint_t *ep, - ompi_common_ofacm_base_qp_config_t *qp_config) -{ - struct ibv_qp_init_attr 
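[Editor's note: mca_bcol_iboffload_fillin_qp_attr() above sizes the scatter/gather list for Bruck's alltoall as ceil(group_size / 2) and then clamps it to the HCA's max_sge. The same logic, isolated:]

    #include <stdint.h>

    static uint32_t alltoall_sge_count(int group_size, int device_max_sge)
    {
        uint32_t want = (uint32_t) (group_size / 2 + group_size % 2); /* ceil(n/2) */
        return (want > (uint32_t) device_max_sge)
                   ? (uint32_t) device_max_sge : want;
    }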
*init_attr = &qp_config->init_attr[qp_index]; - - mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config); - - init_attr->send_cq = ep->iboffload_module->device->ib_cq; - init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SYNC]; -} - -static int mca_bcol_iboffload_setup_barrier_qp(mca_bcol_iboffload_qp_info_t* qp_info) -{ - qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr; - qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost; - - qp_info->alloc_resource = NULL; - qp_info->dealloc_resource = NULL; - - qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag; - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_setup_regular_qp(mca_bcol_iboffload_qp_info_t* qp_info) -{ - qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr; - qp_info->prepost_recv = mca_bcol_iboffload_frag_reg_qp_prepost; - - qp_info->alloc_resource = mca_bcol_iboffload_alloc_reg_qp_resource; - qp_info->dealloc_resource = mca_bcol_iboffload_dealloc_reg_qp_resource; - - qp_info->get_preposted_recv = mca_bcol_iboffload_endpoint_get_preposted_frag; - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_setup_large_buff_qp(mca_bcol_iboffload_qp_info_t* qp_info) -{ - qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr; - - qp_info->prepost_recv = NULL; /* We use "manual" ML frag preposting for this QP */ - qp_info->alloc_resource = NULL; - qp_info->dealloc_resource = NULL; - qp_info->get_preposted_recv = NULL; - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_setup_credit_qp(mca_bcol_iboffload_qp_info_t* qp_info) -{ - qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr; - qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost; - - qp_info->alloc_resource = NULL; - qp_info->dealloc_resource = NULL; - - qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag; - - return OMPI_SUCCESS; -} - -static int mca_bcol_iboffload_setup_sync_qp(mca_bcol_iboffload_qp_info_t* qp_info) -{ - qp_info->config_qp = mca_bcol_iboffload_sync_qp_attr; - qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost; - - qp_info->alloc_resource = NULL; - qp_info->dealloc_resource = NULL; - - qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag; - - return OMPI_SUCCESS; -} - -mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST] = { - mca_bcol_iboffload_setup_barrier_qp, /* MCA_BCOL_IBOFFLOAD_QP_BARRIER */ - mca_bcol_iboffload_setup_regular_qp, /* MCA_BCOL_IBOFFLOAD_QP_REGULAR */ - mca_bcol_iboffload_setup_sync_qp, /* MCA_BCOL_IBOFFLOAD_QP_SYNC */ - mca_bcol_iboffload_setup_credit_qp, /* MCA_BCOL_IBOFFLOAD_QP_CREDIT */ - mca_bcol_iboffload_setup_large_buff_qp, /* MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF */ - /* MCA_BCOL_IBOFFLOAD_QP_LAST */ -}; diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.h b/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.h deleted file mode 100644 index e904e10888..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_qp_info.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* - * In order to add a new QP you need to do next steps: - * - * 1) Add new index to enum with list of the all QPs, - * MCA_BCOL_IBOFFLOAD_QP_NEW_QP e.g. - * - * 2) In the setup_qps_fn array init MCA_BCOL_IBOFFLOAD_QP_NEW_QP - * index with your init func for this QP. 
- * - * 3) In the init func you added, initialize the following func pointers: - * a) config_qp - in this func you need to fill in the ibv_qp_init_attr - * structure that will be used for this QP's creation. - * - * b) prepost_recv - you have to specify this pointer if you want - * preposting to your new QP to be executed automatically. - * - * c) alloc_resource - will be called during device activation; - * if you need any device resource (a list of frags, for example) - * for your new QP, here is the right place to allocate it. - * - * d) dealloc_resource - if any resource was allocated dynamically - * by the alloc_resource func, destruct it in this func. - * - * e) get_preposted_recv - the function returns a preposted receive for a 'wait task'. - * - * f) If you don't need any of these funcs, initialize the appropriate pointer with NULL. - */ - -#ifndef MCA_BCOL_IBOFFLOAD_QP_INFO_H -#define MCA_BCOL_IBOFFLOAD_QP_INFO_H - -#include "ompi_config.h" - -BEGIN_C_DECLS - -/* forward declarations */ -struct mca_bcol_iboffload_device_t; -struct mca_bcol_iboffload_collreq_t; -struct mca_bcol_iboffload_qp_info_t; -struct mca_bcol_iboffload_endpoint_t; - -/* The list of all required QPs */ -enum { - MCA_BCOL_IBOFFLOAD_QP_BARRIER, - MCA_BCOL_IBOFFLOAD_QP_REGULAR, - MCA_BCOL_IBOFFLOAD_QP_SYNC, - MCA_BCOL_IBOFFLOAD_QP_CREDIT, - MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, - MCA_BCOL_IBOFFLOAD_QP_LAST -}; - -typedef enum { - MCA_BCOL_IBOFFLOAD_PP_QP, - MCA_BCOL_IBOFFLOAD_SRQ_QP, - MCA_BCOL_IBOFFLOAD_XRC_QP -} mca_bcol_iboffload_qp_type_t; - -struct mca_bcol_iboffload_pp_qp_info_t { - int32_t rd_win; - int32_t rd_rsv; -}; typedef struct mca_bcol_iboffload_pp_qp_info_t mca_bcol_iboffload_pp_qp_info_t; - -struct mca_bcol_iboffload_srq_qp_info_t { - int32_t sd_max; -}; typedef struct mca_bcol_iboffload_srq_qp_info_t mca_bcol_iboffload_srq_qp_info_t; - -typedef int (*mca_bcol_iboffload_setup_qps_fn_t) (struct mca_bcol_iboffload_qp_info_t*); -typedef int (*mca_bcol_iboffload_prepost_qps_fn_t) - (struct mca_bcol_iboffload_endpoint_t *endpoint, - int qp_index, int num_to_prepost); - -typedef void (*mca_bcol_iboffload_config_qps_fn_t) - (int qp_index, - struct mca_bcol_iboffload_endpoint_t *ep, - ompi_common_ofacm_base_qp_config_t *qp_config); - -typedef int (*mca_bcol_iboffload_alloc_qps_resource_fn_t) - (int qp_index, - struct mca_bcol_iboffload_device_t *device); - -typedef int (*mca_bcol_iboffload_dealloc_qps_resource_fn_t) - (int qp_index, - struct mca_bcol_iboffload_device_t *device); - -typedef struct mca_bcol_iboffload_frag_t* (*mca_bcol_iboffload_get_preposted_recv_fn_t) - (struct mca_bcol_iboffload_endpoint_t *ep, int qp_index); - -struct mca_bcol_iboffload_qp_info_t { - size_t size; - - int32_t rd_num; - int32_t rd_low; - int32_t rd_pp_win; /* prepost window = rd_num - rd_low */ - int qp_index; - - mca_bcol_iboffload_qp_type_t type; - - mca_bcol_iboffload_config_qps_fn_t config_qp; - mca_bcol_iboffload_prepost_qps_fn_t prepost_recv; - - mca_bcol_iboffload_alloc_qps_resource_fn_t alloc_resource; - mca_bcol_iboffload_dealloc_qps_resource_fn_t dealloc_resource; - - mca_bcol_iboffload_get_preposted_recv_fn_t get_preposted_recv; - - union { - mca_bcol_iboffload_pp_qp_info_t pp_qp; - mca_bcol_iboffload_srq_qp_info_t srq_qp; - } u; -}; typedef struct mca_bcol_iboffload_qp_info_t mca_bcol_iboffload_qp_info_t; - -extern mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST]; - -END_C_DECLS - -#endif /* MCA_BCOL_IBOFFLOAD_QP_INFO_H */ - diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_task.c
b/ompi/mca/bcol/iboffload/bcol_iboffload_task.c deleted file mode 100644 index 6fcb62391a..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_task.c +++ /dev/null @@ -1,81 +0,0 @@ - /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_task.h" - -static void task_constructor(mca_bcol_iboffload_task_t *task) -{ - task->frag = NULL; - task->collfrag = NULL; - task->endpoint = NULL; - task->next_task = NULL; - - task->sg_entries = NULL; - task->sg_entries_num = 0; - - task->task_list = NULL; - - memset(&task->wr, 0, sizeof(task->wr)); - - memset(&task->element, 0, sizeof(struct mqe_task)); - memset(&task->task_mqe_qp_entry, 0, sizeof(struct mqe_qp_entry)); -} - -static void task_destructor(mca_bcol_iboffload_task_t *task) -{ - if (NULL != task->sg_entries) { - free(task->sg_entries); - } -} - -OBJ_CLASS_INSTANCE( - mca_bcol_iboffload_task_t, - ompi_free_list_item_t, - task_constructor, - task_destructor); - -void -mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx) -{ - mca_bcol_iboffload_task_t *calc_task = - (mca_bcol_iboffload_task_t *) item; - - calc_task->task_list = (ompi_free_list_t *) ctx; - - calc_task->sg_entries_num = 2; - calc_task->sg_entries = (struct ibv_sge *) malloc (2 * sizeof(struct ibv_sge)); -} - -void -mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx) -{ - mca_bcol_iboffload_task_t *iovec_task = - (mca_bcol_iboffload_task_t *) item; - - mca_bcol_iboffload_module_t *iboffload_module = - (mca_bcol_iboffload_module_t *) ctx; - - int nitems, group_size = iboffload_module->group_size; - - nitems = group_size / 2 + group_size % 2; - if (nitems > iboffload_module->device->ib_dev_attr.max_sge) { - nitems = iboffload_module->device->ib_dev_attr.max_sge; - } - - iovec_task->sg_entries_num = nitems; - iovec_task->task_list = &iboffload_module->iovec_tasks_free; - - iovec_task->sg_entries = (struct ibv_sge *) - malloc(nitems * sizeof(struct ibv_sge)); -} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_task.h b/ompi/mca/bcol/iboffload/bcol_iboffload_task.h deleted file mode 100644 index 99bbe8eb1a..0000000000 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_task.h +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_IBOFFLOAD_TASK_H -#define MCA_BCOL_IBOFFLOAD_TASK_H - -#include "ompi_config.h" - -#include -#include -#include - -#include "bcol_iboffload.h" -#include "bcol_iboffload_frag.h" -#include "bcol_iboffload_collreq.h" -#include "bcol_iboffload_endpoint.h" -#include "bcol_iboffload_collfrag.h" - -#define SENDWR(task) ((task)->element.post.send_wr) - -BEGIN_C_DECLS - -/* the mca_bcol_ibv_mwr_task_t name was replaced with mca_bcol_iboffload_task_t */ -struct mca_bcol_iboffload_task_t { - ompi_free_list_item_t super; - - /* pointer to the memory descriptor associated with the task */ - mca_bcol_iboffload_frag_t *frag; - - /* pointer to the bcol descriptor; - * we need it for send tasks only because we complete them in an asynchronous manner - */ - mca_bcol_iboffload_collfrag_t *collfrag; - - /* task to be posted */ - struct mqe_task element; - - /* array of ibv_sge structs - in the CALC case, - * for example, it will have two entries. - */ - struct ibv_sge *sg_entries; - - /* sg_entries array length */ - int sg_entries_num; - - /* Each task is a member of some free list; - if the pointer is NULL => we assume the task - is a member of the common task list (tasks_free) */ - ompi_free_list_t *task_list; - - /* Pointer to the next task */ - struct mca_bcol_iboffload_task_t *next_task; - - /* pasha - this is a crappy workaround for the driver interface; - * the send_wr and recv_wr should be part of mqe_task, not pointers! - */ - union { - struct ibv_m_send_wr send_wr; - struct ibv_recv_wr recv_wr; - } wr; - - /* If we decide to post a task to a different qp */ - struct mqe_qp_entry task_mqe_qp_entry; - - /* Pointer to the endpoint for this task */ - mca_bcol_iboffload_endpoint_t *endpoint; -}; -typedef struct mca_bcol_iboffload_task_t mca_bcol_iboffload_task_t; -OBJ_CLASS_DECLARATION(mca_bcol_iboffload_task_t); - - -/* calc_tasks_free free list init function */ -void -mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx); - -/* iovec_tasks_free free list init function */ -void -mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx); - -static inline __opal_attribute_always_inline__ void - mca_bcol_iboffload_return_frag_tolist( - mca_bcol_iboffload_frag_t *frag, - ompi_free_list_t *list) -{ - if (NULL != frag) { - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != frag->type); - - if (MCA_BCOL_IBOFFLOAD_DUMMY_OWNER != frag->type && - 0 == frag->ref_counter) { - if (MCA_BCOL_IBOFFLOAD_BCOL_OWNER == frag->type) { - OMPI_FREE_LIST_RETURN_MT((&(list[frag->qp_index])), - (ompi_free_list_item_t*) frag); - } else if (MCA_BCOL_IBOFFLOAD_ML_OWNER == frag->type) { - OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)), - (ompi_free_list_item_t*) frag); - } - } - } -} - -static inline __opal_attribute_always_inline__ void - mca_bcol_iboffload_return_recv_frags_toendpoint( - mca_bcol_iboffload_frag_t *frags, - mca_bcol_iboffload_endpoint_t *ep, - int qp_index) -{ - mca_bcol_iboffload_frag_t *recv_frag = frags; - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - - while (NULL != recv_frag) { - assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != recv_frag->type); - if (MCA_BCOL_IBOFFLOAD_ML_OWNER != recv_frag->type) { - opal_list_prepend(&ep->qps[qp_index].preposted_frags, - (opal_list_item_t *) recv_frag); - } else { - OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)), - (ompi_free_list_item_t*) recv_frag); - } - - recv_frag =
recv_frag->next; - } -} - -/* Wait task allocation and initialization */ -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* - mca_bcol_iboffload_get_wait_task(mca_bcol_iboffload_module_t *iboffload, - uint32_t source, int num_waits, - mca_bcol_iboffload_frag_t *frags, - int qp_index, struct ibv_qp *qp) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_task_t *task; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source]; - - /* get a task item from the free list */ - OMPI_FREE_LIST_GET_MT(&cm->tasks_free, item); - if (OPAL_UNLIKELY(NULL == item)) { - mca_bcol_iboffload_return_recv_frags_toendpoint(frags, endpoint, qp_index); - return NULL; - } - - task = (mca_bcol_iboffload_task_t *) item; - /* set pointer to corresponding recv fragment */ - IBOFFLOAD_SET_FRAGS_ON_TASK(frags, task); - - task->next_task = NULL; - task->endpoint = endpoint; - - /* set opcode */ - task->element.opcode = MQE_WR_CQE_WAIT; - task->element.flags = 0; /* Any flag may go here; the driver ignores it anyway */ - /* set task id */ - task->element.wr_id = (uint64_t) (uintptr_t) task; - /* set CQ */ - task->element.wait.cq = endpoint->qp_config.init_attr[qp_index].recv_cq; - - /* set the number of completions to wait for */ - task->element.wait.count = num_waits; - /* set pointer to QP */ - - if (NULL == qp) { /* NULL means use MQ's QP */ - task->element.wait.mqe_qp = NULL; - } else { /* Post wait to the SQ of this QP */ - task->task_mqe_qp_entry.next = NULL; - task->task_mqe_qp_entry.qp = qp; - - task->element.wait.mqe_qp = &task->task_mqe_qp_entry; - } - - IBOFFLOAD_VERBOSE(10, ("Allocating task %p, cq: %p, num waits: %d, qp_index - %d, " - "destination %d for comm rank: %d.\n", - (void *) task, (void *) task->element.wait.cq, - task->element.wait.count, qp_index, source, - endpoint->iboffload_module->ibnet->super.group_list[endpoint->index])); - return task; -} - -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* -mca_bcol_iboffload_prepare_send_task( - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_endpoint_t *endpoint, - int qp_index, ompi_free_list_t *task_list, - mca_bcol_iboffload_collfrag_t *collfrag) -{ - ompi_free_list_item_t *item; - mca_bcol_iboffload_task_t *task; - - IBOFFLOAD_VERBOSE(10, ("Destination rank - %d, QP index - %d, " - "for comm rank - %d\n", endpoint->index, qp_index, - endpoint->iboffload_module->ibnet->super.group_list[endpoint->index])); - - /* get item from free list */ - OMPI_FREE_LIST_GET_MT(task_list, item); - if (OPAL_UNLIKELY(NULL == item)) { - return NULL; - } - - task = (mca_bcol_iboffload_task_t*) item; - task->endpoint = endpoint; - - ++(collfrag->n_sends); - task->collfrag = collfrag; - - task->next_task = NULL; - task->element.wr_id = (uint64_t) (uintptr_t) task; - - task->element.post.qp = endpoint->qps[qp_index].qp->lcl_qp; - - task->element.opcode = MQE_WR_SEND; - - /* define send work request */ - SENDWR(task) = &(task->wr.send_wr); - - SENDWR(task)->next = NULL; - - SENDWR(task)->wr_id = (uint64_t) (uintptr_t) collfrag; - IBOFFLOAD_VERBOSE(10, ("coll_frag - %p.\n", collfrag)); - - /* Always send IMM on sends!
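 - *
 - * [Editor's aside] A hedged sketch of how these helpers are meant to
 - * compose into a "wait, then forward" chain (variable names and the
 - * one-completion wait are illustrative; error handling omitted):
 - *
 - *   wait = mca_bcol_iboffload_get_wait_task(iboffload, src, 1, frags,
 - *                                           qp_index, NULL);
 - *   send = mca_bcol_iboffload_get_send_task(iboffload, dst, qp_index,
 - *                                           frag, collfrag, true);
 - *   wait->next_task = send;  (the receive completion gates the send)
 - *
 - * The resulting list of mqe_task elements is then presumably posted to the
 - * device's management queue (MQ), which is what lets the HCA progress the
 - * collective without per-step host involvement.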
*/ - task->element.flags = MQE_WR_FLAG_IMM_EXE; - - /* Always signal completion */ - SENDWR(task)->send_flags = IBV_SEND_SIGNALED; - - return task; -} - -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* -mca_bcol_iboffload_get_send_task( - mca_bcol_iboffload_module_t *iboffload, - uint32_t destination, int qp_index, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_collfrag_t *collfrag, - bool enable_inline) -{ - mca_bcol_iboffload_task_t *task; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination]; - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n", - qp_index)); - - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index, - &cm->tasks_free, - collfrag); - - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - /* no support for multiple frags */ - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - - /* We cannot post a 0-byte send, but we can do a zero-byte RDMA write with immediate */ - if (0 == frag->sg_entry.length) { - SENDWR(task)->imm_data = 0; - SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - - SENDWR(task)->wr.rdma.rkey = endpoint->remote_zero_rdma_addr.rkey; - SENDWR(task)->wr.rdma.remote_addr = endpoint->remote_zero_rdma_addr.addr; - } else { - SENDWR(task)->opcode = IBV_WR_SEND; - } - - /* single sge */ - SENDWR(task)->num_sge = 1; - SENDWR(task)->sg_list = &(frag->sg_entry); - - /* Use an inline send when possible */ - if (enable_inline && - frag->sg_entry.length < cm->max_inline_data) { - IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length)); - SENDWR(task)->send_flags |= IBV_SEND_INLINE; - } - - return task; -} - -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* -mca_bcol_iboffload_get_send_vec_task( - mca_bcol_iboffload_module_t *iboffload, - uint32_t destination, int qp_index, - size_t nitems, - struct iovec *buff_iovec, - uint32_t lkey, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_collfrag_t *collfrag, - bool enable_inline) -{ - mca_bcol_iboffload_task_t *task; - int i; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination]; - - IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n", - qp_index)); - - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index, - &iboffload->iovec_tasks_free, - collfrag); - - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - /* no support for multiple frags */ - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - - /* We cannot post a 0-byte send, but we can do a zero-byte RDMA write with immediate */ - SENDWR(task)->opcode = IBV_WR_SEND; - - assert (task->sg_entries != NULL); - - for (i = 0; (size_t) i < nitems; ++i){ - task->sg_entries[i].length = buff_iovec[i].iov_len; - task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base; - task->sg_entries[i].lkey = lkey; - } - - /* multiple sge */ - SENDWR(task)->num_sge = nitems; - SENDWR(task)->sg_list = (task->sg_entries); - - /* Use an inline send when possible */ - if (enable_inline && - frag->sg_entry.length < cm->max_inline_data) { - IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length)); - SENDWR(task)->send_flags |= IBV_SEND_INLINE; - } - - return
task; -} -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* - mca_bcol_iboffload_get_rdma_vec_task( - uint32_t destination, size_t offset, size_t nitems, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_module_t *iboffload, - struct iovec *buff_iovec, uint32_t lkey, - mca_bcol_iboffload_collfrag_t *collfrag) -{ - int i; - mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req; - - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_endpoint_t *endpoint = - iboffload->endpoints[destination]; - - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, - coll_request->qp_index, - &iboffload->iovec_tasks_free, - collfrag); - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - /* no support for multiple frags */ - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - - SENDWR(task)->imm_data = 0; - SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey; - - SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t) - ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset); - - for (i = 0; (size_t) i < nitems; ++i){ - task->sg_entries[i].length = buff_iovec[i].iov_len; - task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base; - task->sg_entries[i].lkey = lkey; - } - - /* single sge */ - SENDWR(task)->num_sge = nitems; - SENDWR(task)->sg_list = (task->sg_entries); - - IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset)); - return task; -} - -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* - mca_bcol_iboffload_get_rdma_task( - uint32_t destination, size_t offset, - mca_bcol_iboffload_frag_t *frag, - mca_bcol_iboffload_module_t *iboffload, - mca_bcol_iboffload_collfrag_t *collfrag) -{ - mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req; - - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_endpoint_t *endpoint = - iboffload->endpoints[destination]; - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, - coll_request->qp_index, - &cm->tasks_free, collfrag); - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - /* no support for multiple frags */ - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - - SENDWR(task)->imm_data = 0; - SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey; - /* Pasha: I really not happy with the way we calculate remote addresses. 
- why don't we use rbuf + offset? */ - SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t) - ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset); - /* single sge */ - SENDWR(task)->num_sge = 1; - SENDWR(task)->sg_list = &(frag->sg_entry); - - IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset)); - return task; -} - -/* Pasha: hacked-up version of the calc operation */ - static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* -mca_bcol_iboffload_get_calc_task(mca_bcol_iboffload_module_t *iboffload, - uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag, - struct ibv_sge *l_operand, struct ibv_sge *r_operand, - mca_bcol_iboffload_collreq_t *coll_request, - bool enable_inline) -/* Some specifications for this function: - * 1) We assume that the lengths of the two operands (ibv_sge structs) are the same. - * 2) We may use the results (ibv_sge structs) of previous - * calc operations => the frag pointer may be NULL. - */ -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_endpoint_t *endpoint = - iboffload->endpoints[destination]; - - mca_bcol_iboffload_collfrag_t *collfrag = - (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index, - &cm->calc_tasks_free, collfrag); - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - if (NULL != frag) { - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - } else { - task->frag = NULL; - } - - task->sg_entries[0] = *l_operand; - task->sg_entries[1] = *r_operand; - - SENDWR(task)->num_sge = 2; - SENDWR(task)->sg_list = task->sg_entries; - - SENDWR(task)->opcode = MCA_BCOL_IBOFFLOAD_SEND_CALC; -#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA - SENDWR(task)->wr.calc_send.data_type = coll_request->actual_ib_dtype; - SENDWR(task)->wr.calc_send.calc_op = coll_request->actual_ib_op; -#else - SENDWR(task)->wr.calc.data_type = coll_request->actual_ib_dtype; - SENDWR(task)->wr.calc.calc_op = coll_request->actual_ib_op; -#endif - - return task; -} - -static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t* - mca_bcol_iboffload_get_rdma_calc_task(mca_bcol_iboffload_module_t *iboffload, - uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag, - struct ibv_sge *l_operand, struct ibv_sge *r_operand, - mca_bcol_iboffload_collreq_t *coll_request, - size_t offset) -/* Some specifications for this function: - * 1) We assume that the lengths of the two operands (ibv_sge structs) are the same. - * 2) We may use the results (ibv_sge structs) of previous - * calc operations => the frag pointer may be NULL.
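 - *
 - * [Editor's aside] For illustration, a hedged sketch of the two-operand
 - * setup these calc helpers expect (buffer names, length, and keys are
 - * hypothetical; both SGEs must describe the same length, per point 1):
 - *
 - *   struct ibv_sge l_operand = { .addr   = (uint64_t) (uintptr_t) local_buf,
 - *                                .length = len, .lkey = lkey };
 - *   struct ibv_sge r_operand = { .addr   = (uint64_t) (uintptr_t) partner_buf,
 - *                                .length = len, .lkey = lkey };
 - *   task = mca_bcol_iboffload_get_rdma_calc_task(iboffload, dst, qp_index,
 - *                                                frag, &l_operand, &r_operand,
 - *                                                coll_request, offset);
 - *
 - * Note that, unlike mca_bcol_iboffload_get_calc_task above, the RDMA variant
 - * below ends up posting only l_operand as a single SGE; r_operand appears to
 - * be accepted only for interface symmetry.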
- */ -{ - mca_bcol_iboffload_task_t *task; - mca_bcol_iboffload_endpoint_t *endpoint = - iboffload->endpoints[destination]; - - mca_bcol_iboffload_collfrag_t *collfrag = - (mca_bcol_iboffload_collfrag_t *) - opal_list_get_last(&coll_request->work_requests); - - mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; - task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index, - &cm->calc_tasks_free, collfrag); - if (OPAL_UNLIKELY(NULL == task)) { - mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free); - return NULL; - } - - if (NULL != frag) { - IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task); - } else { - task->frag = NULL; - } - - task->sg_entries[0] = *l_operand; - - /* Hack - we don't really use it. - task->sg_entries[1] = *r_operand; - */ - /* We use only a single entry - SENDWR(task)->num_sge = 2; - */ - SENDWR(task)->num_sge = 1; - SENDWR(task)->sg_list = task->sg_entries; - -#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA - SENDWR(task)->opcode = IBV_M_WR_CALC_RDMA_WRITE_WITH_IMM; - SENDWR(task)->wr.calc_rdma.data_type = coll_request->actual_ib_dtype; - SENDWR(task)->wr.calc_rdma.calc_op = coll_request->actual_ib_op; - SENDWR(task)->wr.calc_rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey; - SENDWR(task)->wr.calc_rdma.remote_addr = (uint64_t) (uintptr_t) - ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset); -#else - IBOFFLOAD_ERROR(("Fatal error: RDMA CALC was called, but the driver does not support this operation")); - return NULL; -#endif - - return task; -} - -static inline __opal_attribute_always_inline__ - int release_frags_on_task(mca_bcol_iboffload_task_t *task, - ompi_free_list_t *list) -{ - int rc, qp_index; - - mca_bcol_iboffload_frag_t *temp_frag = task->frag; - mca_bcol_iboffload_endpoint_t *endpoint = task->endpoint; - - mca_bcol_iboffload_component_t *cm = - &mca_bcol_iboffload_component; - - IBOFFLOAD_VERBOSE(10, ("\nCalling release_frags_on_task")); - - while (NULL != temp_frag) { - qp_index = temp_frag->qp_index; - - --(temp_frag->ref_counter); - - /* Return credits */ - if (MQE_WR_CQE_WAIT == task->element.opcode) { - ++(endpoint->qps[qp_index].rd_wqe); - - IBOFFLOAD_VERBOSE(10, ("Return rd_wqe %d pp_win %d", - endpoint->qps[qp_index].rd_wqe, - cm->qp_infos[qp_index].rd_pp_win)); - - /* Call for recv prepost */ - if (endpoint->qps[qp_index].rd_wqe >= - cm->qp_infos[qp_index].rd_pp_win) { - IBOFFLOAD_VERBOSE(10, ("Prepost to endpoint->index - %d, qp_index - %d", endpoint->index, qp_index)); - rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index, - endpoint->qps[qp_index].rd_wqe); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - IBOFFLOAD_ERROR(("QP %d: failed to prepost.\n", qp_index)); - return OMPI_ERROR; - } - /* What happens if we cannot prepost? */ - } - } else if (MQE_WR_SEND == task->element.opcode) { - ++(endpoint->qps[qp_index].sd_wqe); - - assert(endpoint->qps[qp_index].sd_wqe <= cm->qp_infos[qp_index].rd_num); - - IBOFFLOAD_VERBOSE(10, ("Return sd_wqe %d, qp_index - %d, endpoint - %p", - endpoint->qps[qp_index].sd_wqe, qp_index, endpoint)); - } else { - /* We should never reach this case */ - IBOFFLOAD_ERROR(("Unsupported operation")); - - return OMPI_ERROR; - } - - mca_bcol_iboffload_return_frag_tolist(temp_frag, list); - temp_frag = temp_frag->next; - } - - return OMPI_SUCCESS; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/iboffload/configure.m4 b/ompi/mca/bcol/iboffload/configure.m4 deleted file mode 100644 index 510e0117e0..0000000000 ---
a/ompi/mca/bcol/iboffload/configure.m4 +++ /dev/null @@ -1,40 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_ompi_bcol_iboffload_CONFIG([should_build]) -# ------------------------------------------ -# AC_DEFUN([MCA_ompi_bcol_iboffload_POST_CONFIG], [ -# ]) - - -# MCA_ompi_bcol_iboffload_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_ompi_bcol_iboffload_CONFIG],[ - AC_CONFIG_FILES([ompi/mca/bcol/iboffload/Makefile]) - bcol_ofa_happy="no" - bcol_mlnx_ofed_happy="no" - - OPAL_CHECK_OPENFABRICS([bcol_iboffload], [bcol_ofa_happy="yes"]) - OPAL_CHECK_MLNX_OPENFABRICS([bcol_iboffload], [bcol_mlnx_ofed_happy="yes"]) - - AS_IF([test "$bcol_ofa_happy" = "yes" && test "$bcol_mlnx_ofed_happy" = "yes"], - [$1], - [$2]) - - # substitute in the things needed to build iboffload - AC_SUBST([bcol_iboffload_CFLAGS]) - AC_SUBST([bcol_iboffload_CPPFLAGS]) - AC_SUBST([bcol_iboffload_LDFLAGS]) - AC_SUBST([bcol_iboffload_LIBS]) -])dnl diff --git a/ompi/mca/bcol/iboffload/owner.txt b/ompi/mca/bcol/iboffload/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/bcol/iboffload/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/bcol/ptpcoll/Makefile.am b/ompi/mca/bcol/ptpcoll/Makefile.am deleted file mode 100644 index a0bd0cb83e..0000000000 --- a/ompi/mca/bcol/ptpcoll/Makefile.am +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2013 Mellanox Technologies. All rights reserved. -# Copyright (c) 2013 Los Alamos National Security, LLC. All rights -# reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - bcol_ptpcoll.h \ - bcol_ptpcoll_utils.h \ - bcol_ptpcoll_utils.c \ - bcol_ptpcoll_mca.h \ - bcol_ptpcoll_mca.c \ - bcol_ptpcoll_barrier.c \ - bcol_ptpcoll_bcast.c \ - bcol_ptpcoll_bcast.h \ - bcol_ptpcoll_component.c \ - bcol_ptpcoll_fanin.c \ - bcol_ptpcoll_fanout.c \ - bcol_ptpcoll_module.c \ - bcol_ptpcoll_allreduce.h \ - bcol_ptpcoll_allreduce.c \ - bcol_ptpcoll_reduce.h \ - bcol_ptpcoll_reduce.c \ - bcol_ptpcoll_allgather.c - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_bcol_ptpcoll_DSO -component_install += mca_bcol_ptpcoll.la -else -component_noinst += libmca_bcol_ptpcoll.la -endif - -# See ompi/mca/btl/sm/Makefile.am for an explanation of -# libmca_common_sm.la. 
- -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_bcol_ptpcoll_la_SOURCES = $(sources) -mca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version -mca_bcol_ptpcoll_la_LIBADD = - -noinst_LTLIBRARIES = $(component_noinst) -libmca_bcol_ptpcoll_la_SOURCES =$(sources) -libmca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h deleted file mode 100644 index a72197c78a..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h +++ /dev/null @@ -1,474 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_PTPCOLL_EXPORT_H -#define MCA_BCOL_PTPCOLL_EXPORT_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/patterns/net/netpatterns.h" - -BEGIN_C_DECLS - -#ifdef HAVE_SCHED_YIELD -# include -# define SPIN sched_yield() -#else /* no switch available */ -# define SPIN -#endif - -/** - * Structure to hold the basic ptpcoll component. First it holds the - * base coll component, and then holds a bunch of - * ptpcoll-component-specific stuff (e.g., current MCA param - * values). - */ -struct mca_bcol_ptpcoll_component_t { - /** Base coll component */ - mca_bcol_base_component_2_0_0_t super; - /** Verbosity level, used only in debug enabled builds */ - int verbose; - /** The radix of the K-nomial tree, initialized by an MCA parameter */ - int k_nomial_radix; - /** The radix of the narray tree, initialized by an MCA parameter */ - int narray_radix; - /** The radix used for narray scatter and knomial gather for - large message bcast **/ - int narray_knomial_radix; - /** Number of times to poll for specific tag/src */ - int num_to_probe; - /* - * bcast small messages algorithm - * 1 - Knomial bcast - * 2 - Narray bcast - */ - int bcast_small_messages_known_root_alg; - /* - * bcast large messages algorithm - * 1 - binomial scatter-gather - * 2 - Narray scatter, knomial gather - */ - int bcast_large_messages_known_root_alg; - /* - * barrier algorithm - * 1 - recursive doubling - * 2 - recursive K-ing - */ - int barrier_alg; - - int use_brucks_smsg_alltoall_rdma; -}; - -struct mca_bcol_ptpcoll_collreq_t { - opal_free_list_item_t super; - - int tag; - int num_reqs; - int exchange; - - int need_toserv_extra; - int extra_partner_rank; - - ompi_request_t **requests; -}; -typedef struct mca_bcol_ptpcoll_collreq_t mca_bcol_ptpcoll_collreq_t; -OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_collreq_t); - -/** - * Convenience typedef - */ -typedef struct mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component_t; - -/* Bcast small messages, - known root algorithm */ -enum { - PTPCOLL_KNOMIAL = 1, - PTPCOLL_NARRAY -}; - -/* Bcast large messages, - known root algorithm */ -enum { - PTPCOLL_BINOMIAL_SG = 1, /* Binomial scatter-gather */ - PTPCOLL_NARRAY_KNOMIAL_SG /* Narray-Knomial scatter-gather */ -}; - -/* - * Implemented function index list - */ - -/* barrier */ -enum{ - FANIN_FAN_OUT_BARRIER_FN, - RECURSIVE_DOUBLING_BARRIER_FN, - N_BARRIER_FNS -}; - -/*
reduce */ -enum{ - FANIN_REDUCE_FN, - REDUCE_SCATTER_GATHER_FN, - N_REDUCE_FNS -}; -enum{ - SHORT_DATA_FN_REDUCE, - LONG_DATA_FN_REDUCE, - N_REDUCE_FNS_USED -}; - -/* all-reduce */ -enum{ - FANIN_FANOUT_ALLREDUCE_FN, - REDUCE_SCATTER_ALLGATHER_FN, - N_ALLREDUCE_FNS -}; -enum{ - SHORT_DATA_FN_ALLREDUCE, - LONG_DATA_FN_ALLREDUCE, - N_ALLREDUCE_FNS_USED -}; - - -/* - * N-order tree node description - */ -struct tree_node_t { - /* my rank within the group */ - int my_rank; - /* my node type - root, leaf, or interior */ - int my_node_type; - /* number of nodes in the tree */ - int tree_size; - /* number of parents (0/1) */ - int n_parents; - /* number of children */ - int n_children; - /* children ranks within the group */ - int *children_ranks; -}; -typedef struct tree_node_t tree_node_t; - -struct pair_exchange_node_t { - - /* number of nodes this node will exchange data with */ - int n_exchanges; - - /* ranks of nodes involved in data exchange */ - int *rank_exchanges; - - /* number of extra sources of data - outside largest power of 2 in - * this group */ - int n_extra_sources; - - /* rank of the extra source */ - int rank_extra_source; - - /* number of tags needed per stripe */ - int n_tags; - - /* log 2 of largest full power of 2 for this node set */ - int log_2; - - /* largest power of 2 that fits in this group */ - int n_largest_pow_2; - - /* node type */ - int node_type; - -}; -typedef struct pair_exchange_node_t pair_exchange_node_t; - -/* - * Barrier request objects - */ - -/* enum for the phase the nb barrier is in */ -enum{ - NB_BARRIER_INACTIVE, - NB_BARRIER_FAN_IN, - NB_BARRIER_FAN_OUT, - /* done and not started are the same for all practical - * purposes, as the init function always sets this flag - */ - NB_BARRIER_DONE -}; - -typedef enum { - PTPCOLL_NOT_STARTED = 1, - PTPCOLL_WAITING_FOR_DATA = 1 << 1, - PTPCOLL_SCATTER_STARTED = 1 << 2, - PTPCOLL_GATHER_STARTED = 1 << 3, - PTPCOLL_EXTRA_SEND_STARTED = 1 << 4, - PTPCOLL_ROOT_SEND_STARTED = 1 << 5 -} ptpcoll_op_status; - -struct mca_bcol_ptpcoll_ml_buffer_desc_t { - void *data_addr; /* buffer address */ - uint64_t bank_index; /* my bank */ - uint64_t buffer_index; /* my buff index */ - int active_requests; /* keep number of active requests */ - ompi_request_t **requests; /* caching pointers to requests */ - int data_src; /* used for bcast to cache internal data */ - int radix_mask; /* used for bcast to cache internal data */ - int radix_mask_pow; /* used for bcast to cache internal data */ - int iteration; /* buffer iteration in knomial, binomial, etc.
algorithms */ - int tag; /* tag number that is attached to this operation */ - int status; /* operation status */ - /* Fixme: Probably we can get rid of these fields by redesigning - * the reduce implementation - */ - int reduction_status; /* used for reduction to cache internal - reduction status */ - bool reduce_init_called; -}; -typedef struct mca_bcol_ptpcoll_ml_buffer_desc_t mca_bcol_ptpcoll_ml_buffer_desc_t; - -/* - * Information that we need to keep in order to access and - * track local ML memory that is used as source and destination - * for collective operations - */ -struct mca_bcol_ptpcoll_local_mlmem_desc_t { - /* Bank index to release */ - uint32_t bank_index_for_release; - /* number of memory banks */ - uint32_t num_banks; - /* number of buffers per bank */ - uint32_t num_buffers_per_bank; - /* size of a payload buffer */ - uint32_t size_buffer; - /* pointer to buffer descriptors initialized */ - mca_bcol_ptpcoll_ml_buffer_desc_t *ml_buf_desc; -}; -typedef struct mca_bcol_ptpcoll_local_mlmem_desc_t mca_bcol_ptpcoll_local_mlmem_desc_t; - -typedef enum { - PTPCOLL_PROXY = 1, - PTPCOLL_IN_GROUP = 1 << 1, - PTPCOLL_EXTRA = 1 << 2, - PTPCOLL_KN_PROXY = 1 << 3, - PTPCOLL_KN_IN_GROUP = 1 << 4, - PTPCOLL_KN_EXTRA = 1 << 5 -} node_type_pow2; - -struct mca_bcol_ptpcoll_module_t { - /* base structure */ - mca_bcol_base_module_t super; - - /* size */ - int group_size; - - /* size of each memory segment */ - size_t segment_size; - - /* k_nomial radix */ - int k_nomial_radix; - /* caching power of K, for K-nomial operations */ - int pow_k; - /* caching power of K number that is smaller than or equal to the size of the group */ - int pow_knum; - /* caching power of 2, it is a special case for some algorithms */ - int pow_2; - /* caching power of 2 number that is closest to the size of the group */ - int pow_2num; - /* type of this node in the power-of-2 group */ - int pow_2type; - /* type of this node in the K-nomial tree group */ - int pow_ktype; - /* type of this node in the narray tree group */ - int narray_type; - /* size of full narray tree */ - int full_narray_tree_size; - /* number of leaves on the last level */ - int full_narray_tree_num_leafs; - - /* Nary tree info */ - netpatterns_tree_node_t *narray_node; - - /* if the rank is in the group, it keeps the extra peer; - if the rank is extra, it keeps the proxy peer. - */ - int proxy_extra_index; /* pow2 algorithm */ - int *kn_proxy_extra_index; /* K-nomial algorithm */ - int kn_proxy_extra_num; /* number of extra peers, maximum k - 1 */ - - /* collective tag */ - long long collective_tag; - - /* tag mask - the pml has a limit on tag size, so need - * to wrap around - */ - uint64_t tag_mask; - - /* Caching information about local ML memory.
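 - *
 - * [Editor's aside] As the allgather routines below illustrate, a given
 - * collective pulls its cached state straight off this descriptor, e.g.
 - * (buffer_index arrives via bcol_function_args_t):
 - *
 - *   mca_bcol_ptpcoll_ml_buffer_desc_t *buf =
 - *       &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index];
 - *   ompi_request_t **requests = buf->requests;
 - *   int *active_requests      = &buf->active_requests;
 - *   int *iteration            = &buf->iteration;  (resume point for progress calls)
 - *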
- * Since ptpcoll does not support RDMA operations over pml, - * we don't need to keep any information about remote buffers - */ - mca_bcol_ptpcoll_local_mlmem_desc_t ml_mem; - - - /* Narray-Knomial scatther gather */ - - /* list of extra indexes */ - int *narray_knomial_proxy_extra_index; - /* number of extra peers , maximum k - 1*/ - int narray_knomial_proxy_num; - /* Narray-Knomial node information array */ - netpatterns_narray_knomial_tree_node_t *narray_knomial_node; - /* Knomial exchange tree */ - netpatterns_k_exchange_node_t knomial_exchange_tree; - /* knomial allgather tree --- Do not disable, we need both - different algorithms define recursive k - ing differently - */ - netpatterns_k_exchange_node_t knomial_allgather_tree; - - /* Knomial allgather offsets */ - int **allgather_offsets; - - /* Free lists of outstanding collective operations */ - opal_free_list_t collreqs_free; - - int log_group_size; - struct iovec *alltoall_iovec; -}; - -typedef struct mca_bcol_ptpcoll_module_t mca_bcol_ptpcoll_module_t; -OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_module_t); - - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC extern mca_bcol_ptpcoll_component_t -mca_bcol_ptpcoll_component; - - -/* - * coll module functions - */ - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_bcol_ptpcoll_init_query(bool enable_progress_threads, - bool enable_mpi_threads); - -/* query to see if the module is available for use on the given - * communicator, and if so, what it's priority is. - */ -mca_bcol_base_module_t ** -mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules); - -/* interface function to setup recursive k-ing tree */ -int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super); - -/* barrier routines */ -int bcol_ptpcoll_barrier_recurs_dbl(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_barrier_recurs_knomial(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super); -int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super); -void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment, - struct mca_bcol_base_module_t *bcol_module); -int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment, - struct mca_bcol_base_module_t *bcol_module); -int bcol_ptpcoll_deregister_memory( void * in_ptr, - struct mca_bcol_base_module_t *bcol_module); -int bcol_ptpcoll_free_memory(void *ptr, - struct mca_bcol_base_module_t *bcol_module); -int bcol_ptpcoll_fanin( bcol_function_args_t *input_args, - struct mca_bcol_base_module_t *module); -int bcol_ptpcoll_fanout( bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - - -/* allgather routine */ -int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -/* allgather progress */ -int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -/* allgather register */ -int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super); - -static inline __opal_attribute_always_inline__ - int mca_bcol_ptpcoll_test_for_match(ompi_request_t **request , int *rc) -{ - int matched = 0; - int i; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - *rc = OMPI_SUCCESS; - - for (i = 0; i < cm->num_to_probe && - 0 == 
matched && OMPI_SUCCESS == *rc ; i++) { - *rc = ompi_request_test(request, &matched, MPI_STATUS_IGNORE); - } - - return matched; -} - -static inline __opal_attribute_always_inline__ - int mca_bcol_ptpcoll_test_all_for_match(int *n_requests, ompi_request_t **requests , int *rc) -{ - int matched = 0; - int i; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - *rc = OMPI_SUCCESS; - - assert(*n_requests >= 0); - - if (0 == *n_requests) { - return 1; - } - - for (i = 0; i < cm->num_to_probe && - 0 == matched && OMPI_SUCCESS == *rc; i++) { - *rc = ompi_request_test_all - (*n_requests, requests, &matched, MPI_STATUS_IGNORE); - } - - if (matched) { - *n_requests = 0; - } - - return matched; -} - -/* Some negative tags already used by OMPI, making sure that we take safe offset */ -#define PTPCOLL_TAG_OFFSET 100 -#define PTPCOLL_TAG_FACTOR 2 - -static inline int lognum(int n){ - int count = 1, lognum = 0; - - while (count < n) { - count = count << 1; - lognum++; - } - return lognum; -} - -END_C_DECLS - -#endif /* MCA_BCOL_PTPCOLL_EXPORT_H */ diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c deleted file mode 100644 index eeed28e9fe..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c +++ /dev/null @@ -1,605 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_ptpcoll_allreduce.h" -/* - * Recursive K-ing allgather - */ - -/* - * - * Recurssive k-ing algorithm - * Example k=3 n=9 - * - * - * Number of Exchange steps = log (basek) n - * Number of steps in exchange step = k (radix) - * - */ - -int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variables */ - - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_size = ptpcoll_module->group_size; - int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */ - - int tag; - int i, j; - int knt; - int comm_src, comm_dst, src, dst; - int recv_offset, recv_len; - int send_offset, send_len; - - uint32_t buffer_index = input_args->buffer_index; - int pow_k, tree_order; - int rc = OMPI_SUCCESS; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int completed = 0; /* initialized */ - void *data_buffer = (void*)( - (unsigned char *) input_args->sbuf + - (size_t) input_args->sbuf_offset); - int pack_len = input_args->count * input_args->dtype->super.size; - -#if 0 - fprintf(stderr,"entering p2p allgather pack_len %d. 
exchange node: %p\n",pack_len, exchange_node); -#endif - /* initialize the iteration counter */ - int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - *iteration = 0; - - /* reset active request counter */ - *active_requests = 0; - - /* keep tag within the limit supported by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - /* k-nomial parameters */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - - - /* let's begin the collective, starting with extra ranks and their - * respective proxies - */ - if( EXTRA_NODE == exchange_node->node_type ) { - - /* then I will send to my proxy rank*/ - dst = exchange_node->rank_extra_sources_array[0]; - /* find rank in the communicator */ - comm_dst = group_list[dst]; - /* now I need to calculate my own offset */ - knt = 0; - for (i = 0 ; i < my_group_index; i++){ - knt += list_connected[i]; - } - - /* send the data to my proxy */ - rc = MCA_PML_CALL(isend((void *) ( (unsigned char *) data_buffer + - knt*pack_len), - pack_len * list_connected[my_group_index], - MPI_BYTE, - comm_dst, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10,("Failed to isend data")); - return OMPI_ERROR; - } - ++(*active_requests); - - /* now I go ahead and post the receive from my proxy */ - comm_src = comm_dst; - knt = 0; - for( i =0; i < group_size; i++){ - knt += list_connected[i]; - } - rc = MCA_PML_CALL(irecv(data_buffer, - knt * pack_len, - MPI_BYTE, - comm_src, - tag , comm, &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); - return OMPI_ERROR; - } - - ++(*active_requests); - /* poll for completion */ - /* this polls internally */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(completed){ - /* go to buffer release */ - goto FINISHED; - }else{ - /* save state and hop out - * nothing to save here - */ - return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - }else if ( 0 < exchange_node->n_extra_sources ) { - - /* I am a proxy for someone */ - src = exchange_node->rank_extra_sources_array[0]; - /* find the rank in the communicator */ - comm_src = group_list[src]; - knt = 0; - for(i = 0; i < src; i++){ - knt += list_connected[i]; - } - /* post the receive */ - rc = MCA_PML_CALL(irecv((void *) ( (unsigned char *) data_buffer - + knt*pack_len), - pack_len * list_connected[src], - MPI_BYTE, - comm_src, - tag , comm, &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); - return OMPI_ERROR; - } - - ++(*active_requests); - /* poll for completion */ - /* this routine polls internally */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to -1 indicating we need to - * finish this part first - */ - *iteration = -1; - return ((OMPI_SUCCESS != rc )? 
OMPI_ERROR : BCOL_FN_STARTED); - } - - } - - /* we start the recursive k - ing phase */ - /* fprintf(stderr,"tree order %d pow_k %d \n",tree_order,pow_k);*/ - for( i = 0; i < pow_k; i++) { - for(j = 0; j < (tree_order - 1); j++) { - - /* send phase */ - dst = exchange_node->rank_exchanges[i][j]; - if( dst < 0 ){ - continue; - } - comm_dst = group_list[dst]; - send_offset = exchange_node->payload_info[i][j].s_offset * pack_len; - send_len = exchange_node->payload_info[i][j].s_len * pack_len; - /* debug print */ - /* fprintf(stderr,"sending %d bytes to rank %d at offset %d\n",send_len, */ - /* comm_dst,send_offset); */ - rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer + - send_offset), - send_len, - MPI_BYTE, - comm_dst, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10,("Failed to isend data")); - return OMPI_ERROR; - } - ++(*active_requests); - - /* sends are posted */ - } - - /* Now post the recv's */ - for( j = 0; j < (tree_order - 1); j++ ) { - - /* recv phase */ - src = exchange_node->rank_exchanges[i][j]; - if( src < 0 ) { - continue; - } - comm_src = group_list[src]; - recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len; - recv_len = exchange_node->payload_info[i][j].r_len * pack_len; - /* debug print */ - /* fprintf(stderr,"recving %d bytes to rank %d at offset %d\n",recv_len, */ - /* comm_src,recv_offset); */ - /* post the receive */ - rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer + - recv_offset), - recv_len, - MPI_BYTE, - comm_src, - tag, comm, &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - /* finished all send/recv's now poll for completion before - * continuing to next iteration - */ - completed = 0; - /* polling internally on 2*(k - 1) requests */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - - if(!completed){ - /* save state and hop out - * only the iteration needs to be tracked - */ - *iteration = i; /* need to pick up here */ - - return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - } - - /* finish off the last piece, send the data back to the extra */ - if( 0 < exchange_node->n_extra_sources ) { - dst = exchange_node->rank_extra_sources_array[0]; - comm_dst = group_list[dst]; - knt = 0; - for( i = 0; i < group_size; i++){ - knt += list_connected[i]; - } - /* debug print */ - /* - fprintf(stderr,"sending %d bytes to extra %d \n",pack_len*knt,comm_dst); - */ - rc = MCA_PML_CALL(isend(data_buffer, - pack_len * knt, - MPI_BYTE, - comm_dst, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10,("Failed to isend data")); - return OMPI_ERROR; - } - ++(*active_requests); - - /* probe for send completion */ - completed = 0; - /* polling internally */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to pow_k +1 indicating we need to - * finish progressing the last part - */ - *iteration = pow_k + 1; - - return (OMPI_SUCCESS != rc ? 
OMPI_ERROR : BCOL_FN_STARTED); - } - } - -FINISHED: - /* recycle buffer if need be */ - return BCOL_FN_COMPLETE; -} - -/* allgather progress function */ - -int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - - - /* local variables */ - - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree; - int group_size = ptpcoll_module->group_size; - int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */ - - - int tag; - int i, j; - int knt; - int comm_src, comm_dst, src, dst; - int recv_offset, recv_len; - int send_offset, send_len; - uint32_t buffer_index = input_args->buffer_index; - - int pow_k, tree_order; - int rc = OMPI_SUCCESS; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int completed = 0; /* initialized */ - void *data_buffer = (void*)( - (unsigned char *) input_args->sbuf + - (size_t) input_args->sbuf_offset); - int pack_len = input_args->count * input_args->dtype->super.size; - /* initialize the counter */ - int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - - -#if 0 - fprintf(stderr,"%d: entering p2p allgather progress AR: %d iter: %d\n",my_group_index,*active_requests, - *iteration); -#endif - /* keep tag within the limit supported by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - /* k-nomial tree parameters */ - tree_order = exchange_node->tree_order; - pow_k = exchange_node->log_tree_order; - - /* let's begin the collective, starting with extra ranks and their - * respective proxies - */ - if( EXTRA_NODE == exchange_node->node_type ) { - - /* debug print */ - /*fprintf(stderr,"666 \n");*/ - /* simply poll for completion */ - completed = 0; - /* polling internally */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(completed){ - /* go to buffer release */ - goto FINISHED; - }else{ - /* save state and hop out - * nothing to save here - */ - return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - }else if ( 0 < exchange_node->n_extra_sources && (-1 == *iteration)) { - - /* I am a proxy for someone */ - /* Simply poll for completion */ - completed = 0; - /* polling internally */ - assert( 1 == *active_requests); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to -1 indicating we need to - * finish this part first - */ - (*iteration) = -1; - return ((OMPI_SUCCESS != rc) ? 
OMPI_ERROR : BCOL_FN_STARTED); - } - /* I may now proceed to the recursive k - ing phase */ - *iteration = 0; - } - - - /* the ordering here between the extra rank and progress active requests - * is critical - */ - /* extra rank */ - if( (pow_k + 1) == *iteration ){ - /* finish off the last one */ - goto PROGRESS_EXTRA; - } - - /* active requests must be completed before continuing on to - * recursive k -ing step - * CAREFUL HERE, IT THIS REALLY WHAT YOU WANT?? - */ - if( 0 < (*active_requests) ) { - /* then we have something to progress from last step */ - /* debug print */ - /* - fprintf(stderr,"%d: entering progress AR: %d iter: %d\n",my_group_index,*active_requests, - *iteration); - */ - completed = 0; - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * state hasn't changed - */ - - return ((MPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - ++(*iteration); - } - - - - /* we start the recursive k - ing phase */ - for( i = *iteration; i < pow_k; i++) { - /* nothing changes here */ - for(j = 0; j < (tree_order - 1); j++) { - - /* send phase */ - dst = exchange_node->rank_exchanges[i][j]; - if( dst < 0 ){ - continue; - } - comm_dst = group_list[dst]; - send_offset = exchange_node->payload_info[i][j].s_offset * pack_len; - send_len = exchange_node->payload_info[i][j].s_len * pack_len; - rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer + - send_offset), - send_len, - MPI_BYTE, - comm_dst, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10,("Failed to isend data")); - return OMPI_ERROR; - } - ++(*active_requests); - - /* sends are posted */ - } - - /* Now post the recv's */ - for( j = 0; j < (tree_order - 1); j++ ) { - - /* recv phase */ - src = exchange_node->rank_exchanges[i][j]; - if( src < 0 ) { - continue; - } - comm_src = group_list[src]; - recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len; - recv_len = exchange_node->payload_info[i][j].r_len * pack_len; - /* post the receive */ - rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer + - recv_offset), - recv_len, - MPI_BYTE, - comm_src, - tag, comm, &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - /* finished all send/recv's now poll for completion before - * continuing to next iteration - */ - completed = 0; - /* make this non-blocking */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to -1 indicating we need to - * finish this part first - */ - *iteration = i; /* need to pick up here */ - - return ((OMPI_SUCCESS != rc) ? 
OMPI_ERROR : BCOL_FN_STARTED); - } - } - - /* finish off the last piece, send the data back to the extra */ - if( 0 < exchange_node->n_extra_sources ) { - dst = exchange_node->rank_extra_sources_array[0]; - comm_dst = group_list[dst]; - knt = 0; - for( i = 0; i < group_size; i++){ - knt += list_connected[i]; - } - rc = MCA_PML_CALL(isend(data_buffer, - pack_len * knt, - MPI_BYTE, - comm_dst, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10,("Failed to isend data")); - return OMPI_ERROR; - } - ++(*active_requests); - - /* probe for send completion */ - completed = 0; - /* make this non-blocking */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to pow_k +1 indicating we need to - * finish progressing the last part - */ - *iteration = pow_k + 1; - - return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - } - /* folks need to skip this unless they really are the proxy - * reentering with the intent of progressing the final send - */ - goto FINISHED; - -PROGRESS_EXTRA: - - /* probe for send completion */ - completed = 0; - /* make this non-blocking */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if(!completed){ - /* save state and hop out - * We really do need to block here so set - * the iteration to pow_k +1 indicating we need to - * finish progressing the last part - */ - - return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); - } - -FINISHED: - /* recycle buffer if need be */ - return BCOL_FN_COMPLETE; -} - -/* - * Register allreduce functions to the BCOL function table, - * so they can be selected - */ -int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_ALLGATHER; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_k_nomial_allgather_init, - bcol_ptpcoll_k_nomial_allgather_progress); - - - comm_attribs.data_src = DATA_SRC_KNOWN; - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_k_nomial_allgather_init, - bcol_ptpcoll_k_nomial_allgather_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c deleted file mode 100644 index 14a4f76958..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c +++ /dev/null @@ -1,1032 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_ptpcoll_allreduce.h" - -/* - * Recursive K-ing allreduce - */ -static inline int bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (mca_bcol_ptpcoll_module_t *ptpcoll_module, netpatterns_k_exchange_node_t *k_node, - void *data_buffer, size_t data_size, ompi_request_t **requests, int *active_requests, - int tag) -{ - ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int peer_comm_rank, k, offset, rc; - - if (EXCHANGE_NODE == k_node->node_type) { - /* the send data resides in the first part of the buffer */ - for (k = 0, offset = data_size ; k < k_node->n_extra_sources ; ++k, offset += data_size) { - peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]]; - - PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %d tag %d", - peer_comm_rank, data_buffer, data_size, tag)); - rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset), - data_size, MPI_BYTE, peer_comm_rank, tag, comm, - &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - } else { - peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]]; - - PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %d tag %d", - peer_comm_rank, data_buffer, data_size, tag)); - - rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank, - tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - return OMPI_SUCCESS; -} - -static inline void bcol_ptpcoll_allreduce_narray_reduce (void *data_buffer, struct ompi_datatype_t *data_type, int count, struct ompi_op_t *op, int sources) -{ - size_t data_size = mca_bcol_base_get_buff_length(data_type, count); - - for (int k = 0, offset = data_size ; k < sources ; ++k, offset += data_size) { - ompi_op_reduce(op, (char *) data_buffer + offset, data_buffer, count, data_type); - } -} - -static int bcol_ptpcoll_allreduce_narraying_progress (bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - void *data_buffer = (void *) ( (unsigned char *) input_args->sbuf + - (size_t) input_args->sbuf_offset); - struct ompi_datatype_t *data_type = input_args->dtype; - uint32_t buffer_index = input_args->buffer_index; - struct ompi_op_t *op = input_args->op; - int count = input_args->count; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int k, rc, peer, group_peer; - int offset = 0; - ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k_radix = k_node->tree_order; - - size_t data_size = mca_bcol_base_get_buff_length(data_type, count); - int *iteration = - 
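The bcol_ptpcoll_allreduce_narray_reduce helper above folds the received contributions into the first segment of the buffer: the local data sits at offset 0 and each of the k-1 peer blocks at the following data_size strides. A plain-C analogue, assuming int data and a sum operation purely for illustration (the real call dispatches through ompi_op_reduce on arbitrary op/dtype):

    #include <stddef.h>

    static void narray_reduce_int_sum(int *buffer, int count, int sources)
    {
        for (int k = 0; k < sources; ++k) {
            const int *contrib = buffer + (size_t)(k + 1) * count;
            for (int i = 0; i < count; ++i) {
                buffer[i] += contrib[i];  /* fold segment k+1 into segment 0 */
            }
        }
    }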
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - - /* if we are just staring the collective and there are extra sources then schedule the - * extra node exchange. otherwise check if the exchange is complete. */ - if (-1 == *iteration) { - if (0 < k_node->n_extra_sources) { - if (!(*active_requests)) { - rc = bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (ptpcoll_module, k_node, data_buffer, data_size, - requests, active_requests, tag); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } - } - - /* check for extra node exchange completion */ - if (!mca_bcol_ptpcoll_test_all_for_match (active_requests, requests, &rc)) { - return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; - } - - if (EXCHANGE_NODE == k_node->node_type) { - bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_node->n_extra_sources); - } - } - - /* start recursive k-ing */ - *iteration = 0; - } - - if (*iteration < k_node->n_exchanges) { - if (*active_requests) { - if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { - return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; - } - - ++(*iteration); - bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1); - } - } - - for ( ; *iteration < k_node->n_exchanges ; ++(*iteration)) { - for (k = 0; k < k_radix - 1; k++) { - group_peer = k_node->rank_exchanges[*iteration][k]; - - peer = group_list[group_peer]; - - PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %d tag %d", - peer, data_buffer, data_size, tag)); - rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - for (k = 0, offset = data_size ; k < k_radix - 1 ; ++k, offset += data_size) { - group_peer = k_node->rank_exchanges[*iteration][k]; - peer = group_list[group_peer]; - - PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %d tag %d", - peer, data_buffer, data_size, tag)); - rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset ), - data_size, MPI_BYTE, peer, tag, comm, - &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { - return (OMPI_SUCCESS == rc) ? 
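The control flow here is the re-entrant pattern all of these progress functions share: the per-buffer descriptor persists iteration and active_requests across calls, so the function can return BCOL_FN_STARTED whenever requests are still pending and resume at exactly the same round on the next progress call. A stripped-down sketch with hypothetical names:

    typedef struct {
        int iteration;        /* -1 = extra-node phase, 0..n-1 = k-ing rounds */
        int active_requests;  /* outstanding isend/irecv count for this buffer */
    } progress_state_t;

    enum { FN_STARTED, FN_COMPLETE };

    /* one progress call: finish the pending round if any, then post the next */
    static int progress_once(progress_state_t *s, int n_exchanges,
                             int (*test_all)(int *, void *), void *requests)
    {
        if (s->active_requests > 0 && !test_all(&s->active_requests, requests)) {
            return FN_STARTED;           /* save state and hop out */
        }
        if (s->iteration >= n_exchanges) {
            return FN_COMPLETE;
        }
        /* ... post the isend/irecv pairs for round s->iteration here ... */
        ++s->iteration;
        return FN_STARTED;
    }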
BCOL_FN_STARTED : rc; - } - - bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1); - } - - /* ensure extra nodes get the result */ - if (0 < k_node->n_extra_sources) { - if (!(*active_requests)) { - int peer_comm_rank; - - if (EXTRA_NODE == k_node->node_type) { - peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]]; - - PTPCOLL_VERBOSE(10, ("EXTRA_NODE: Recv data from %d, addr %p len %d tag %d", - peer_comm_rank, data_buffer, data_size, tag)); - rc = MCA_PML_CALL(irecv(data_buffer, data_size, MPI_BYTE, peer_comm_rank, - tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } else { - for (k = 0; k < k_node->n_extra_sources; k++) { - peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]]; - - PTPCOLL_VERBOSE(10, ("EXCHANGE_NODE: Send data to %d, addr %p len %d tag %d", - peer_comm_rank, data_buffer, data_size, tag)); - rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank, - tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - } - } - - if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { - return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; - } - } - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args){ - - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - int count = input_args->count; - struct ompi_datatype_t *dtype = input_args->dtype; - size_t buffer_size; - int tag; - - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; - - /* start with extra node exchange if needed */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration = -1; - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests = 0; - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status = PTPCOLL_NOT_STARTED; - - /* - * ML bufer is segmented into k segments and each of the k segment is used - * for reductions - */ - /* This has to be based on ml buffer size. Need to take into account the space used - * by the headers of other bcol modules. */ - buffer_size = ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX; - assert(buffer_size >= count * dtype->super.size * - ptpcoll_module->k_nomial_radix); - (void)buffer_size; // silence compiler warning - (void)dtype; - (void)count; - - return bcol_ptpcoll_allreduce_narraying_progress (input_args, const_args); -} - -static inline int compute_seg_index(int peer, int kpow_num, int tree_order) { - - int peer_base, peer_position, peer_base_rank, peer_index; - - peer_base = peer / (kpow_num * tree_order); - peer_base_rank = peer_base * kpow_num * tree_order ; - peer_position = peer_base_rank == 0 ? 
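The assertion in bcol_ptpcoll_allreduce_narraying_init above encodes the algorithm's memory requirement: the ML buffer, minus the BCOL_HEADER_MAX reservation, must hold the local segment plus the k-1 peer segments that land beside it. As a standalone check (hypothetical helper restating the assert):

    #include <stddef.h>

    static int narray_buffer_fits(size_t ml_buffer_size, size_t header_max,
                                  size_t dtype_size, int count, int k_radix)
    {
        if (ml_buffer_size < header_max) {
            return 0;
        }
        return (ml_buffer_size - header_max) >=
               (size_t)count * dtype_size * (size_t)k_radix;
    }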
peer : peer % (peer_base_rank); - peer_index = peer_position / kpow_num ; - - return peer_index; -} - -int compute_knomial_allgather_offsets(int group_index, int count, struct - ompi_datatype_t *dtype,int k_radix,int n_exchanges, - int **offsets){ - - int modulo_group_size; - size_t seg_count, seg_size, seg_index, seg_offset; - size_t block_offset, block_count; - int exchange_step; - ptrdiff_t lb, extent; - - if (0 >= n_exchanges) { - PTPCOLL_VERBOSE(10,("Nothing to initialize ")); - return 0; - } - modulo_group_size = 1; - seg_count = count / k_radix; - ompi_datatype_get_extent(dtype, &lb, &extent); - seg_size = seg_count * extent; - - seg_index = group_index % k_radix; - seg_offset = seg_index * seg_size; - - offsets[0][BLOCK_OFFSET] = block_offset = 0; - offsets[0][BLOCK_COUNT] = block_count = count; - offsets[0][LOCAL_REDUCE_SEG_OFFSET] = seg_offset; - offsets[0][SEG_SIZE] = seg_size; - - - for(exchange_step = 1; exchange_step < n_exchanges; exchange_step++) { - - /* Previous step's segment is this exchange step's block */ - block_count = seg_count; - block_offset = seg_offset; - - /* Divide the segment into k parts */ - seg_count = seg_count / k_radix; - seg_size = seg_count * extent; - - /* Among different segments in block, which segment should I reduce ? */ - /* For allgather phase, I will not send out this segment to peers */ - modulo_group_size *= k_radix; - seg_index = compute_seg_index(group_index, modulo_group_size, k_radix); - seg_offset = seg_index * seg_size; - - - offsets[exchange_step][BLOCK_OFFSET] = block_offset; - offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET] = seg_offset; - offsets[exchange_step][BLOCK_COUNT] = block_count; - offsets[exchange_step][SEG_SIZE] = seg_size; - - /* Change to absolute offset */ - seg_offset = block_offset + seg_offset; - - } - - return 0; -} - -static inline int compute_send_segment_size(int block_offset, - int send_offset, - int segment_size, - int padded_offset) { - int send_size = -1; - /* segment to be sent starts here */ - int segment_offset = block_offset + send_offset ; - send_size = (segment_offset + segment_size) >= padded_offset ? - segment_size - (segment_offset + segment_size - padded_offset) : segment_size; - return send_size; -} - -static inline int compute_recv_segment_size(int block_offset, - int recv_offset, - int segment_size, - int padded_offset) { - int recv_size = -1; - /* segment to be sent starts here */ - int segment_offset = block_offset + recv_offset ; - recv_size = (segment_offset + segment_size) >= padded_offset ? 
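compute_knomial_allgather_offsets above picks, at every exchange step, which sub-segment of the current block this rank keeps reducing: step 0 uses group_index % k_radix directly, and later steps use compute_seg_index. For example, with group_index = 5, k_radix = 3 and n_exchanges = 2, the rank owns segment 5 % 3 = 2 in step 0 and segment compute_seg_index(5, 3, 3) = 1 in step 1. A standalone restatement of that index rule for checking:

    static int seg_index(int peer, int kpow_num, int tree_order)
    {
        /* strip the (kpow_num * tree_order)-sized base block this peer
         * belongs to, then see which of the k sub-segments the remainder
         * falls into */
        int base_rank = (peer / (kpow_num * tree_order)) * kpow_num * tree_order;
        int position  = base_rank ? peer % base_rank : peer;
        return position / kpow_num;
    }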
- segment_size - (segment_offset + segment_size - padded_offset) : segment_size; - - return recv_size; -} - -/* - * - * K-nomial Reduce Scatter - * Example k=3 n=9 - * - * | ABCDEFGH |0| - * - * Number of Exchange steps = log (basek) n - * Number of steps in exchange step = k (radix) - * - * block_size = Size of data that is reduce in exchange step - * segment_size = Size of data that is send or received by rank in radix step - * - * block_size = segment_size * k - * - * my_block_start_addr = Address of the segment in the block where I reference my - * offsets - * - * This is version 1 : Experimenting with decoupling offset calcuations - */ -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte){ - int blocks_in_step = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - mca_bcol_ptpcoll_component_t *cm = - &mca_bcol_ptpcoll_component; - void *my_block_start_addr = NULL, *my_block_addr = NULL; - int i, k, group_peer, peer ; - int k_radix = k_node->tree_order; - int rc = OMPI_SUCCESS; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int completed; - void *my_recv_start_addr, *my_recv_addr; - size_t block_offset, reduce_seg_offset, send_offset, recv_offset; - int seg_size, block_size; - int block_count, seg_count; - ptrdiff_t lb, extent; - ompi_datatype_get_extent(dtype, &lb, &extent); - - my_recv_start_addr = rbuf; - my_block_start_addr = sbuf; - block_count = count; - block_size = count * extent; - - - for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) { - - block_offset = ptpcoll_module->allgather_offsets[i][BLOCK_OFFSET]; - reduce_seg_offset = ptpcoll_module->allgather_offsets[i][LOCAL_REDUCE_SEG_OFFSET]; - block_count = ptpcoll_module->allgather_offsets[i][BLOCK_COUNT]; - seg_size = ptpcoll_module->allgather_offsets[i][SEG_SIZE]; - block_size = block_count * extent; - - PTPCOLL_VERBOSE(10,("Block offset %d, reduce_seg_offset %d, block_count %d seg_size %d", - block_offset, reduce_seg_offset, block_count, seg_size)); - - seg_count = block_count / k_radix; - my_block_addr = (void*)((char*)my_block_start_addr + block_offset); - my_recv_addr = (void*)((char*)my_recv_start_addr + block_offset); - - for (k = 0; k < k_radix - 1; k++) { - size_t soffset; - int snd_size = 0; - - group_peer = k_node->rank_exchanges[i][k]; - peer = group_list[group_peer]; - - send_offset = reduce_seg_offset + (seg_size * (k + 1)); - - if ((int)send_offset + seg_size > block_size) { - send_offset = send_offset % block_size; - } - - PTPCOLL_VERBOSE(10, ("Send data to %d,send offset %d len %d", - peer, send_offset, seg_size)); - - soffset = send_offset; - snd_size = - compute_send_segment_size((int)block_offset,(int)soffset,(int)seg_size,padded_start_byte); - - if (snd_size > 0) { - rc = MCA_PML_CALL(isend((void *)((unsigned char 
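The compute_send_segment_size / compute_recv_segment_size helpers above exist because the element count is padded up to a multiple of k_radix^n_exchanges: segments that fall partly or wholly past padded_start_byte are trimmed, and the callers skip any segment whose clipped size is not positive. A worked illustration of the same rule: with 96 real bytes (padded_offset = 96), a 32-byte segment starting at byte 80 is clipped to 16, and one starting at or beyond byte 96 is skipped entirely.

    static int clip_segment(int segment_offset, int segment_size, int padded_offset)
    {
        if (segment_offset + segment_size <= padded_offset) {
            return segment_size;               /* fully inside the real data */
        }
        return padded_offset - segment_offset; /* <= 0 means: skip this segment */
    }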
*)my_block_addr - + soffset), - snd_size, MPI_BYTE, - peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer)); - return OMPI_ERROR; - } - ++(*active_requests); - } - - } - - /* - * Receive the segments to tmp addr and then do a reduction - */ - for (k = 0; k < k_radix - 1; k++) { - int recv_size=0; - - group_peer = k_node->rank_exchanges[i][k]; - peer = group_list[group_peer]; - - recv_offset = reduce_seg_offset + (seg_size * (k+1)); - - if ((int)recv_offset + seg_size > block_size) { - recv_offset = recv_offset % block_size; - } - - PTPCOLL_VERBOSE(10, ("Receive data to receive buffer at offset %d\n", - recv_offset)); - recv_size = compute_recv_segment_size((int)block_offset, - (int)reduce_seg_offset, (int)seg_size, - padded_start_byte); - - if (recv_size > 0 ) { - rc = MCA_PML_CALL(irecv((void *)((unsigned char *) - my_recv_addr + recv_offset), - recv_size, MPI_BYTE, - peer, tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); - return OMPI_ERROR; - } - ++(*active_requests); - } - - } - - completed = 0; - while(!completed){ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - } - - /* Do a reduction on received buffers */ - { - void *src_data_buffer = NULL, *dst_data_buffer = NULL; - int reduce_data_count = 0; - - src_data_buffer = my_block_addr; - dst_data_buffer = my_recv_addr; - - for (k = 0; k < k_radix - 1; k++) { - recv_offset = reduce_seg_offset + (seg_size * (k+1)); - - if ((int)recv_offset + seg_size > block_size) { - recv_offset = recv_offset % block_size; - } - - reduce_data_count = (int)(block_offset + reduce_seg_offset) + seg_size >= padded_start_byte ? 
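The reduction block just below uses ompi_3buff_op_reduce rather than the in-place ompi_op_reduce: the three-buffer form reads two source buffers and writes a third, which lets the loop chain partial results (src_data_buffer becomes the previous iteration's destination) without clobbering the freshly received segment. A plain-C analogue, assuming int data and a sum operation for illustration:

    /* out[i] = a[i] + b[i]; the real call dispatches on op and dtype */
    static void reduce3_int_sum(const int *a, const int *b, int *out, int count)
    {
        for (int i = 0; i < count; ++i) {
            out[i] = a[i] + b[i];
        }
    }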
- (seg_size - (((int)(block_offset + reduce_seg_offset) + seg_size) - padded_start_byte))/(int)dtype->super.size - : (int)seg_count; - - if (reduce_data_count > 0) { - ompi_3buff_op_reduce(op, - (void*)((unsigned char*)my_recv_addr + recv_offset), - (void*)((unsigned char*)src_data_buffer + - reduce_seg_offset), - (void*)((unsigned char*)dst_data_buffer + - reduce_seg_offset), - reduce_data_count,dtype); - } - - src_data_buffer = dst_data_buffer; - - } - } - - /* After first iteration we have data (to work with) in recv buffer */ - my_block_start_addr = rbuf; - - } - - return rc; -} - - -int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, - void *sbuf,void *rbuf, int count, struct - ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte){ - - size_t block_offset = 0, send_offset = 0, recv_offset = 0; - int seg_size=0, block_size=0; - int i,k,completed; - void *my_block_start_addr = rbuf, *my_block_addr; - size_t block_count = count; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k_radix = k_node->tree_order; - int peer, group_peer; - int rc = OMPI_SUCCESS; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int exchange_step; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - ptrdiff_t lb, extent; - ompi_datatype_get_extent(dtype, &lb, &extent); - - - for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - i < k_node->n_exchanges; i++) { - - exchange_step = k_node->n_exchanges - 1 - i; - - block_offset = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_OFFSET]; - send_offset = ptpcoll_module->allgather_offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET]; - block_count = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_COUNT]; - seg_size = ptpcoll_module->allgather_offsets[exchange_step][SEG_SIZE]; - block_size = block_count * extent; - - - PTPCOLL_VERBOSE(10, ("Send offset %d block_offset %d seg_size %\n", - send_offset, block_offset, seg_size)); - - my_block_addr = (void*)((unsigned char*)my_block_start_addr + block_offset); - - for (k = 0; k < k_radix - 1; k++) { - size_t soffset=0; int snd_size = 0; - group_peer = k_node->rank_exchanges[exchange_step][k]; - peer = group_list[group_peer]; - - soffset = send_offset; - snd_size = compute_send_segment_size((int)block_offset, - (int)soffset, - (int)seg_size, - padded_start_byte); - if (snd_size > 0) { - rc = MCA_PML_CALL(isend((void *)((unsigned char *)my_block_addr - + soffset), - snd_size, MPI_BYTE, - peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer)); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - PTPCOLL_VERBOSE(10, ("Send data to receive buffer at offset %d to %d\n", - send_offset, peer)); - } - - for (k = 0; k < k_radix - 1; k++) { - int recv_size=0; - - group_peer = k_node->rank_exchanges[exchange_step][k]; - peer = group_list[group_peer]; - - recv_offset = send_offset + (k + 1) * seg_size; - - if ((int)recv_offset + seg_size > block_size){ - recv_offset = recv_offset % block_size; - } - - PTPCOLL_VERBOSE(10, ("Receive data to 
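bcol_ptpcoll_allreduce_knomial_allgather above replays the reduce-scatter schedule in reverse: allgather round i reads the offsets saved for exchange_step = n_exchanges - 1 - i, and the block a rank owns grows by a factor of k_radix each round until it covers the whole padded vector. A small sketch of that growth, under the assumption that the padded count divides evenly (which compute_padding_count guarantees):

    /* what a rank contributes at a given allgather round:
     * seg -> k*seg -> k^2*seg -> ... -> final_count */
    static int allgather_block_count(int final_count, int k_radix,
                                     int n_exchanges, int round)
    {
        int block = final_count;
        /* shrink down to what the rank still owns entering this round */
        for (int i = 0; i < n_exchanges - round; ++i) {
            block /= k_radix;
        }
        return block;
    }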
receive buffer at offset %d from %d\n", - recv_offset, peer)); - - - recv_size = compute_recv_segment_size((int)block_offset, - (int)recv_offset, - (int)seg_size, - padded_start_byte); - if (recv_size > 0) { - rc = MCA_PML_CALL(irecv((void *)((unsigned char *) - my_block_addr + recv_offset), - recv_size, MPI_BYTE, - peer, tag, comm, &requests[*active_requests])); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); - return OMPI_ERROR; - } - ++(*active_requests); - } - - } - - completed = 0; - while(!completed){ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - } - - block_count = block_count * k_radix; - block_size = block_count * extent; - - } - - return rc; - -} - -static inline int compute_padding_count(int count, int k_radix, int n_exchanges){ - bool fpadding = false; - size_t dsize; - int i, pad_count=0, kpow; - - /* is padding required */ - dsize = count; - kpow = 1; - for ( i=0; i < n_exchanges; i++) { - if (dsize % k_radix) { - fpadding = true; - } - dsize /= k_radix; - kpow *= k_radix; - } - - if (fpadding) { - pad_count = count % kpow; - pad_count = kpow - pad_count; - } - - return pad_count; -} - - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args){ - - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - struct ompi_op_t *op = input_args->op; - int tag; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - void *src_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - - void *recv_buffer = (void *) ( - (unsigned char *)input_args->rbuf + - (size_t)input_args->rbuf_offset); - - int count = input_args->count; - struct ompi_datatype_t *dtype = input_args->dtype; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - ptrdiff_t lb, extent; - - /* Get the knomial tree */ - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k_radix = k_node->tree_order; - int n_exchanges = k_node->n_exchanges; - int padded_start_byte; - int padding_count = compute_padding_count(count, k_radix, n_exchanges); - - ompi_datatype_get_extent(dtype, &lb, &extent); - padded_start_byte = count * extent; - - - /* Init for making the functions Re-entrant */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; - *active_requests = 0; - *iteration = -1; - *status = PTPCOLL_NOT_STARTED; - *iteration = 0; - - compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges, - ptpcoll_module->allgather_offsets); - - /* Perform a recursive k'ing reduce scatter */ - bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module, buffer_index, - src_buffer, recv_buffer, op, count + padding_count, dtype, - my_group_index,padded_start_byte); - - - /* Perform a recursive k'ing allgather */ - bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module, - buffer_index, - src_buffer, 
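compute_padding_count above pads the element count up to a multiple of k_radix^n_exchanges so that every exchange step can split its block into k equal segments. For example, count = 10 with k_radix = 3 and n_exchanges = 2 gives kpow = 9 and a pad of 8, i.e. 18 elements in total. A condensed equivalent (the divisibility check per round collapses to a single modulo against kpow):

    static int padding_count(int count, int k_radix, int n_exchanges)
    {
        int kpow = 1;
        for (int i = 0; i < n_exchanges; ++i) {
            kpow *= k_radix;              /* k_radix^n_exchanges */
        }
        return (count % kpow) ? kpow - count % kpow : 0;
    }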
recv_buffer, count + padding_count, dtype, - my_group_index, padded_start_byte); - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, - void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype){ - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k, peer ; - int rc = OMPI_SUCCESS; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int block_count, block_size; - char *tmprecv_buffer = NULL, *data_src_buffer, *data_dst_buffer; - ptrdiff_t lb, extent; - ompi_datatype_get_extent(dtype, &lb, &extent); - - block_count = count; - block_size = count * extent; - - - if (0 < block_size) { - tmprecv_buffer = (void*)malloc(block_size); - } - - data_src_buffer = sbuf; - data_dst_buffer = rbuf; - - if (EXCHANGE_NODE == k_node->node_type) { - for (k = 0; k < k_node->n_extra_sources; k++){ - - peer = ptpcoll_module->super.sbgp_partner_module->group_list[ - k_node->rank_extra_sources_array[k]]; - - rc = MCA_PML_CALL(recv((void *)((unsigned char *)tmprecv_buffer), - block_size, MPI_BYTE, - peer, tag, comm, MPI_STATUS_IGNORE)); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); - rc = OMPI_ERROR; - goto clean; - } - - ompi_3buff_op_reduce(op, (void*)((unsigned char*)data_src_buffer), - (void*)((unsigned char*)tmprecv_buffer), - (void*)((unsigned char*)data_dst_buffer), - block_count,dtype); - data_src_buffer = data_dst_buffer; - } - } else { - peer = ptpcoll_module->super.sbgp_partner_module->group_list[ - k_node->rank_extra_sources_array[0]]; - - rc = MCA_PML_CALL(send((void *)((unsigned char *)sbuf), - block_size, MPI_BYTE, - peer, tag, MCA_PML_BASE_SEND_STANDARD, comm)); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - rc = OMPI_ERROR; - goto clean; - } - } - -clean: - if (tmprecv_buffer) { - free(tmprecv_buffer); - } - return rc; -} - -int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, - void *sbuf, - void *rbuf, - const int count, struct ompi_datatype_t *dtype){ - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k, peer ; - int rc = OMPI_SUCCESS; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int block_size, completed; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ptrdiff_t lb, extent; - ompi_datatype_get_extent(dtype, &lb, &extent); - - - block_size = count * extent; - - if (EXTRA_NODE == k_node->node_type) { - peer = ptpcoll_module->super.sbgp_partner_module->group_list[ - k_node->rank_extra_sources_array[0]]; - - rc = MCA_PML_CALL(irecv((void *)((unsigned char *)rbuf), - block_size, MPI_BYTE, - peer, tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } else { - for (k = 0; k < k_node->n_extra_sources; k++) { - peer = ptpcoll_module->super.sbgp_partner_module->group_list[ - k_node->rank_extra_sources_array[k]]; - - rc = MCA_PML_CALL(isend((void *)((unsigned char 
*)rbuf), - block_size, MPI_BYTE, - peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - } - - completed = 0; - - while(!completed){ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - } - - return rc; -} - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args){ - - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - struct ompi_op_t *op = input_args->op; - int tag; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - void *src_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - - void *recv_buffer = (void *) ( - (unsigned char *)input_args->rbuf + - (size_t)input_args->rbuf_offset); - - int count = input_args->count; - struct ompi_datatype_t *dtype = input_args->dtype; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - ptrdiff_t lb, extent; - /* Get the knomial tree */ - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int k_radix = k_node->tree_order; - int n_exchanges = k_node->n_exchanges; - int padded_start_byte; - int padding_count = compute_padding_count(count, k_radix, n_exchanges); - void *tmpsrc_buffer = NULL; - - ompi_datatype_get_extent(dtype, &lb, &extent); - padded_start_byte = count * extent; - - /* Init for making the functions Re-entrant */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; - *active_requests = 0; - *iteration = -1; - *status = PTPCOLL_NOT_STARTED; - *iteration = 0; - - compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges, - ptpcoll_module->allgather_offsets); - - if (EXCHANGE_NODE == k_node->node_type) { - bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module, - buffer_index, - src_buffer, recv_buffer, op, count, dtype); - tmpsrc_buffer = src_buffer; - if ( k_node->n_extra_sources > 0){ - tmpsrc_buffer = recv_buffer; - } - bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module, buffer_index, - tmpsrc_buffer, recv_buffer, op, count + padding_count, dtype, - my_group_index,padded_start_byte); - bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module, - buffer_index, - src_buffer, recv_buffer, count + padding_count, dtype, - my_group_index, padded_start_byte); - bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module, - buffer_index, - src_buffer, recv_buffer, count, dtype); - - } - else if (EXTRA_NODE == k_node->node_type) { - bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module, - buffer_index, - src_buffer, recv_buffer, op, count, dtype); - bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module, - buffer_index, - src_buffer, recv_buffer, count, dtype); - } - - return BCOL_FN_COMPLETE; -} - - - -/* - * Register allreduce 
functions to the BCOL function table, - * so they can be selected - */ -int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) super; - - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_ALLREDUCE; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - - /* not an accurate attribute, none of these algorithms - * are non-blocking - */ - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_allreduce_narraying_init, - bcol_ptpcoll_allreduce_narraying_progress); - - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - if (ptpcoll_module->pow_knum == ptpcoll_module->group_size) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init, - NULL); - - } else { - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init, - NULL); - - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h deleted file mode 100644 index 144e256761..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
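Selection note on the registration above: bcol_ptpcoll_allreduce_init installs the plain scatter-reduce-allgather path only when pow_knum equals the group size, i.e. when the group is an exact power of the radix; otherwise it installs the "extra" variant, which first folds the leftover ranks into proxies. The predicate amounts to:

    static int is_power_of_radix(int group_size, int k_radix)
    {
        int p = 1;
        while (p < group_size) {
            p *= k_radix;
        }
        return p == group_size;  /* pow_knum == group_size in module terms */
    }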
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_PTPCOLL_ALLREDUCE_H -#define MCA_BCOL_PTPCOLL_ALLREDUCE_H - -#include "ompi_config.h" -#include "ompi/op/op.h" -#include "ompi/datatype/ompi_datatype.h" -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" - -enum { - BLOCK_OFFSET = 0, - LOCAL_REDUCE_SEG_OFFSET, - BLOCK_COUNT, - SEG_SIZE, - NOFFSETS -}; - -BEGIN_C_DECLS -int bcol_ptpcoll_allreduce_narraying(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, void *data_buffer, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, const int - buffer_size, const int relative_group_index); - - -int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte); - -int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, - void *sbuf,void *rbuf, int count, struct - ompi_datatype_t *dtype, - const int relative_group_index, - const int padded_start_byte); - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - - -int compute_knomial_allgather_offsets(int group_index, int count, struct - ompi_datatype_t *dtype,int k_radix,int n_exchanges, - int **offsets); - - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, - void *sbuf, - void *rbuf, - struct ompi_op_t *op, - const int count, struct ompi_datatype_t *dtype); - -int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, - void *sbuf, - void *rbuf, - const int count, struct ompi_datatype_t *dtype); - -int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super); - -#if 0 -int knomial_reduce_scatter_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix, - int n_exchanges, int nth_exchange, size_t *recv_offset, size_t - *block_offset, size_t *block_count, size_t *block_size, size_t - *seg_size); - -int allgather_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix, - int n_exchanges, int nth_exchange, size_t *send_offset, size_t - *block_offset, size_t *block_count, size_t *block_size, size_t - *seg_size); -#endif - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_barrier.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_barrier.c deleted file mode 100644 index 6ad04db6c6..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_barrier.c +++ /dev/null @@ -1,933 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
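The BLOCK_OFFSET .. SEG_SIZE enum above indexes the per-exchange rows of the module's allgather_offsets table that compute_knomial_allgather_offsets fills in; the table is consumed as an n_exchanges-by-NOFFSETS array of ints (int **offsets). A minimal allocation sketch, with a hypothetical helper name:

    #include <stdlib.h>

    #define NOFFSETS 4 /* BLOCK_OFFSET, LOCAL_REDUCE_SEG_OFFSET, BLOCK_COUNT, SEG_SIZE */

    static int **alloc_allgather_offsets(int n_exchanges)
    {
        int **offsets = calloc((size_t)n_exchanges, sizeof(*offsets));
        if (NULL == offsets) {
            return NULL;
        }
        for (int i = 0; i < n_exchanges; ++i) {
            offsets[i] = calloc(NOFFSETS, sizeof(**offsets));
            if (NULL == offsets[i]) {
                while (i-- > 0) free(offsets[i]);
                free(offsets);
                return NULL;
            }
        }
        return offsets;
    }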
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" - -/* - * Fanin routines - no user data - */ - -/********************************************* New Barrier *********************************************/ -/*******************************************************************************************************/ -/*******************************************************************************************************/ - -/*************************************** K-nominal ***************************************/ -/*****************************************************************************************/ -static int bcol_ptpcoll_barrier_recurs_knomial_new( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - uint64_t sequence_number; - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - - netpatterns_k_exchange_node_t *my_exchange_node = - &ptpcoll_module->knomial_exchange_tree; - - int rc, k, pair_comm_rank, exchange, completed, - tree_order = my_exchange_node->tree_order, tag, - n_extra_sources = my_exchange_node->n_extra_sources, - n_exchange = my_exchange_node->n_exchanges, num_reqs; - - ompi_communicator_t *comm = - ptpcoll_module->super.sbgp_partner_module->group_comm; - - int *extra_sources_array = NULL, - **rank_exchanges = my_exchange_node->rank_exchanges; - - ompi_request_t **requests; - opal_free_list_item_t *item; - - mca_bcol_ptpcoll_collreq_t *collreq; - - item = opal_free_list_wait (&ptpcoll_module->collreqs_free); - if (OPAL_UNLIKELY(NULL == item)) { - PTPCOLL_ERROR(("Free list waiting failed.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - collreq = (mca_bcol_ptpcoll_collreq_t *) item; - input_args->bcol_opaque_data = (void *) collreq; - - requests = collreq->requests; - - /* TAG Calculation */ - sequence_number = input_args->sequence_num; - - /* Keep tag within the limit supportd by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - - /* Mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - if (0 < n_extra_sources) { /* EXCHANGE_NODE case */ - collreq->need_toserv_extra = 1; - extra_sources_array = my_exchange_node->rank_extra_sources_array; - - /* I will participate in the exchange (of the algorithm) - - * wait for signal from extra process */ - for (k = 0; k < n_extra_sources; ++k) { - pair_comm_rank = - ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]]; - - rc = MCA_PML_CALL(irecv( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - comm, &(requests[k]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - } - - num_reqs = n_extra_sources; - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = num_reqs; - collreq->exchange = 0; - - return BCOL_FN_STARTED; - } - } else { - collreq->need_toserv_extra = 0; - } - - /* loop over exchange send/recv pairs */ - for (exchange = 0; exchange < n_exchange; ++exchange) { - for (k = 0; k < tree_order - 1; ++k) { - /* rank of exchange partner within the group */ - pair_comm_rank = - 
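Every message in these barrier algorithms is a zero-count MPI_INT send or receive: no payload moves, and the rendezvous on (source, tag) alone provides the synchronization. Expressed against plain MPI for illustration (the component itself posts these through the PML, which also accepts its negative internal tags, whereas user-level MPI requires non-negative tags):

    #include <mpi.h>

    /* zero-byte synchronization message: the matching is the whole point */
    static void barrier_signal(MPI_Comm comm, int peer, int tag)
    {
        MPI_Send(NULL, 0, MPI_INT, peer, tag, comm);
    }

    static void barrier_wait_signal(MPI_Comm comm, int peer, int tag)
    {
        MPI_Recv(NULL, 0, MPI_INT, peer, tag, comm, MPI_STATUS_IGNORE);
    }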
ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]]; - - assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1)); - - /* send to partner - we will wait for completion, as send - * completion is at the MPI level, and will not - * incur network level completion costs - */ - rc = MCA_PML_CALL(isend( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, - comm, &(requests[k * 2 + 1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k, - pair_comm_rank, rank_exchanges[exchange][k])); - - /* recive from partner */ - rc = MCA_PML_CALL(irecv( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - comm, &(requests[k * 2]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k, - pair_comm_rank, rank_exchanges[exchange][k])); - } - - num_reqs = 2 * (tree_order - 1); - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = num_reqs; - collreq->exchange = exchange + 1; - - return BCOL_FN_STARTED; - } - } - - /* If non power of 2, may need to send message to "extra" proc */ - if (0 < n_extra_sources) { /* EXCHANGE_NODE case */ - for (k = 0; k < n_extra_sources; ++k) { - pair_comm_rank = - ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]]; - - rc = MCA_PML_CALL(isend( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, - comm, &(requests[k]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - } - - num_reqs = n_extra_sources; - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = num_reqs; - - collreq->exchange = n_exchange; - collreq->need_toserv_extra = 0; - - return BCOL_FN_STARTED; - } - } - - opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq); - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_barrier_recurs_knomial_new_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - - netpatterns_k_exchange_node_t *my_exchange_node = - &ptpcoll_module->knomial_exchange_tree; - - int rc, k, tag, pair_comm_rank, exchange, - tree_order = my_exchange_node->tree_order, num_reqs, - n_exchange = my_exchange_node->n_exchanges, completed, - n_extra_sources = my_exchange_node->n_extra_sources; - - ompi_communicator_t *comm = - ptpcoll_module->super.sbgp_partner_module->group_comm; - - int *extra_sources_array, - **rank_exchanges = my_exchange_node->rank_exchanges; - - mca_bcol_ptpcoll_collreq_t *collreq = - (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data; - - ompi_request_t **requests = collreq->requests; - - num_reqs = collreq->num_reqs; - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - 
PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - return BCOL_FN_STARTED; - } - - /* Continue loop over exchange send/recv pairs */ - tag = collreq->tag; - - for (exchange = collreq->exchange; exchange < n_exchange; ++exchange) { - for (k = 0; k < tree_order - 1; ++k) { - /* rank of exchange partner within the group */ - pair_comm_rank = - ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]]; - - assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1)); - - /* send to partner - we will wait for completion, as send - * completion is at the MPI level, and will not - * incur network level completion costs - */ - rc = MCA_PML_CALL(isend( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, - comm, &(requests[k * 2 + 1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k, - pair_comm_rank, rank_exchanges[exchange][k])); - - /* recive from partner */ - rc = MCA_PML_CALL(irecv( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - comm, &(requests[k * 2]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k, - pair_comm_rank, rank_exchanges[exchange][k])); - } - - num_reqs = 2 * (tree_order - 1); - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->num_reqs = num_reqs; - collreq->exchange = exchange + 1; - - return BCOL_FN_STARTED; - } - } - - /* If non power of 2, may need to send message to "extra" proc */ - if (collreq->need_toserv_extra) { /* EXCHANGE_NODE case */ - extra_sources_array = my_exchange_node->rank_extra_sources_array; - - for (k = 0; k < n_extra_sources; ++k) { - pair_comm_rank = - ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]]; - - rc = MCA_PML_CALL(isend( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, - comm, &(requests[k]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - } - - num_reqs = n_extra_sources; - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->num_reqs = num_reqs; - collreq->exchange = n_exchange; - collreq->need_toserv_extra = 0; - - return BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -/****************************************** Extra node Barrier ******************************************/ - -static int bcol_ptpcoll_barrier_recurs_knomial_extra_new( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - uint64_t sequence_number; - int rc, tag, pair_comm_rank, - completed, num_reqs = 2; - - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - - netpatterns_k_exchange_node_t *my_exchange_node = - &ptpcoll_module->knomial_exchange_tree; - - ompi_communicator_t *comm = - ptpcoll_module->super.sbgp_partner_module->group_comm; - - int *extra_sources_array = my_exchange_node->rank_extra_sources_array; - - ompi_request_t **requests; - opal_free_list_item_t *item; - - 
mca_bcol_ptpcoll_collreq_t *collreq; - - item = opal_free_list_wait (&ptpcoll_module->collreqs_free); - if (OPAL_UNLIKELY(NULL == item)) { - PTPCOLL_ERROR(("Free list waiting failed.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - collreq = (mca_bcol_ptpcoll_collreq_t *) item; - input_args->bcol_opaque_data = (void *) collreq; - - requests = collreq->requests; - - /* TAG Calculation */ - sequence_number = input_args->sequence_num; - - /* Keep tag within the limit supportd by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - - /* Mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - pair_comm_rank = - ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[0]]; - - rc = MCA_PML_CALL(isend( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, - comm, &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - rc = MCA_PML_CALL(irecv( - NULL, 0, MPI_INT, - pair_comm_rank, tag, - comm, &(requests[1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - return BCOL_FN_STARTED; - } - - opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq); - return BCOL_FN_COMPLETE; -} - -/*************************************** Recursive-Doubling ***************************************/ -/**************************************************************************************************/ - -static int bcol_ptpcoll_barrier_recurs_dbl_new( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - uint64_t sequence_number; - mca_bcol_ptpcoll_module_t *ptp_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - - ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm; - - int rc, my_extra_partner_comm_rank = 0, exchange, completed, - pair_comm_rank, pair_rank, delta, tag, num_reqs = 0, - my_rank = ptp_module->super.sbgp_partner_module->my_index, - n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2; - - ompi_request_t **requests; - opal_free_list_item_t *item; - - mca_bcol_ptpcoll_collreq_t *collreq; - - item = opal_free_list_wait (&ptp_module->collreqs_free); - if (OPAL_UNLIKELY(NULL == item)) { - PTPCOLL_ERROR(("Free list waiting failed.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - collreq = (mca_bcol_ptpcoll_collreq_t *) item; - input_args->bcol_opaque_data = (void *) collreq; - - assert(PTPCOLL_EXTRA != ptp_module->pow_2type); - - requests = collreq->requests; - - /* TAG Calculation */ - sequence_number = input_args->sequence_num; - - /* keep tag within the limit supportd by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask); - - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - if (PTPCOLL_PROXY == ptp_module->pow_2type) { - /* I will participate in the exchange - wait for signal from extra - ** process */ - /* - * recv from extra rank - my_extra_partner_comm_rank - * can use blocking recv, as no other communications - * need to take place. 
- */ - my_extra_partner_comm_rank = - ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index]; - - collreq->need_toserv_extra = 1; - collreq->extra_partner_rank = my_extra_partner_comm_rank; - - rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT, - my_extra_partner_comm_rank, tag, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for irecv failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = 1; - collreq->exchange = 0; - - return BCOL_FN_STARTED; - } - } else { - collreq->need_toserv_extra = 0; - } - - /* Loop over exchange send/recv pairs */ - delta = 1; - for (exchange = 0; exchange < n_exchange; ++exchange) { - - /* rank of exchange partner within the group */ - pair_rank = my_rank ^ delta; - - /* rank within the communicator */ - pair_comm_rank = - ptp_module->super.sbgp_partner_module->group_list[pair_rank]; - - /* send to partner - we will wait for completion, as send - * completion is at the MPI level, and will not - * incur network level completion costs - */ - rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - ++num_reqs; - - /* recive from partner */ - rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT, - pair_comm_rank, tag, comm, - &(requests[1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - ++num_reqs; - - PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d", - exchange, pair_rank, pair_comm_rank)); - - /* test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = num_reqs; - - collreq->exchange = exchange + 1; - assert(collreq->exchange >= 0); - - return BCOL_FN_STARTED; - } - - delta <<= 1; /* delta *= 2 */ - } - - if (PTPCOLL_PROXY == ptp_module->pow_2type) { - /* send - let the extra rank know that we are done */ - rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT, - my_extra_partner_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for isend failed.")); - return rc; - } - - if (!completed) { - collreq->tag = tag; - collreq->num_reqs = 1; - - collreq->need_toserv_extra = 0; - collreq->exchange = n_exchange; - - return BCOL_FN_STARTED; - } - } - - opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq); - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_barrier_recurs_dbl_new_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - mca_bcol_ptpcoll_module_t *ptp_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - - ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm; - - int rc, exchange, pair_comm_rank, tag, - pair_rank, delta, num_reqs, completed, - my_rank = 
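The pairing rule in the exchange loop here is classic recursive doubling: at round e the partner is my_rank XOR 2^e, so after n_levels_pow2 rounds every rank has transitively synchronized with every other. For example, rank 5 (binary 101) meets ranks 4, 7 and 1 in rounds 0, 1 and 2.

    static int rd_partner(int my_rank, int round)
    {
        return my_rank ^ (1 << round);  /* flip bit `round` of the rank id */
    }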
ptp_module->super.sbgp_partner_module->my_index, - n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2; - - ompi_request_t **requests; - mca_bcol_ptpcoll_collreq_t *collreq = - (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data; - - num_reqs = collreq->num_reqs; - requests = collreq->requests; - - /* test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - return BCOL_FN_STARTED; - } - - assert(PTPCOLL_EXTRA != ptp_module->pow_2type); - - /* Continue loop over exchange send/recv pairs */ - num_reqs = 0; - tag = collreq->tag; - - exchange = collreq->exchange; - assert(exchange >= 0); - - delta = 1 << exchange; - for (; exchange < n_exchange; ++exchange) { - - /* rank of exchange partner within the group */ - pair_rank = my_rank ^ delta; - - /* rank within the communicator */ - pair_comm_rank = - ptp_module->super.sbgp_partner_module->group_list[pair_rank]; - - /* send to partner - we will wait for completion, as send - * completion is at the MPI level, and will not - * incur network level completion costs - */ - rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT, - pair_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - ++num_reqs; - - /* recive from partner */ - rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT, - pair_comm_rank, tag, comm, - &(requests[1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - ++num_reqs; - - PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d", - exchange, pair_rank, pair_comm_rank)); - - /* test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - collreq->num_reqs = num_reqs; - collreq->exchange = exchange + 1; - assert(collreq->exchange >= 0); - - return BCOL_FN_STARTED; - } - - delta <<= 1; /* delta *= 2 */ - } - - /* if non power of 2, may need to send message to "extra" proc */ - if (collreq->need_toserv_extra) { - /* send - let the extra rank know that we are done */ - rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT, - collreq->extra_partner_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("ISend failed.")); - return rc; - } - - completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for isend failed.")); - return rc; - } - - if (!completed) { - collreq->num_reqs = 1; - collreq->need_toserv_extra = 0; - collreq->exchange = n_exchange; - - return BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -/****************************************** Extra node Barrier ******************************************/ - -static int bcol_ptpcoll_barrier_recurs_dbl_extra_new( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - uint64_t sequence_number; - int rc, completed, num_reqs = 2, - tag, my_extra_partner_comm_rank; - - ompi_request_t **requests; - opal_free_list_item_t *item; - - mca_bcol_ptpcoll_collreq_t *collreq; - - mca_bcol_ptpcoll_module_t *ptp_module = - (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - 
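The exchange loop above is textbook recursive doubling: `n_exchange` rounds over a power-of-two group, with the round-`s` partner at `my_rank ^ (1 << s)`. The deleted code runs it non-blocking so it can park between rounds (saving the round index in `collreq->exchange` for the progress function); stripped of that machinery it reduces to this blocking sketch, where `group_to_comm[]` stands in for the `group_list[]` rank translation and the function name is mine:

```c
#include <mpi.h>

/* Blocking sketch of the recursive-doubling barrier exchange. */
static void recursive_doubling_barrier(int my_rank, int n_exchange,
                                       const int *group_to_comm,
                                       int tag, MPI_Comm comm)
{
    int delta = 1;

    for (int exchange = 0; exchange < n_exchange; exchange++, delta <<= 1) {
        int partner = group_to_comm[my_rank ^ delta];

        /* zero-byte exchange with this round's partner */
        MPI_Sendrecv(NULL, 0, MPI_INT, partner, tag,
                     NULL, 0, MPI_INT, partner, tag,
                     comm, MPI_STATUS_IGNORE);
    }
}
```

Resuming is cheap because the round state is just two integers: the loop restarts at `collreq->exchange` with `delta = 1 << exchange`, which is exactly what the progress function below does.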
ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm; - - item = opal_free_list_wait (&ptp_module->collreqs_free); - if (OPAL_UNLIKELY(NULL == item)) { - PTPCOLL_ERROR(("Free list waiting failed.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - collreq = (mca_bcol_ptpcoll_collreq_t *) item; - input_args->bcol_opaque_data = (void *) collreq; - - requests = collreq->requests; - - /* TAG Calculation */ - sequence_number = input_args->sequence_num; - - /* Keep tag within the limit supportd by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask); - - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - /* I will not participate in the exchange - so just "register" as here, - * signal the extra rank that I am here */ - - my_extra_partner_comm_rank = - ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index]; - - rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT, - my_extra_partner_comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[0]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Send failed.")); - return rc; - } - - /* Recv signal that the rest are done - my_extra_partner_comm_rank */ - rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT, - my_extra_partner_comm_rank, tag, comm, - &(requests[1]))); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("IRecv failed.")); - return rc; - } - - /* Test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - return BCOL_FN_STARTED; - } - - opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq); - return BCOL_FN_COMPLETE; -} - -/* We have the same progress func for both cases (R-D and K-Nominal) */ -static int bcol_ptpcoll_barrier_extra_node_progress( - bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - ompi_request_t **requests; - int rc, completed, num_reqs = 2; - - mca_bcol_ptpcoll_collreq_t *collreq = - (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data; - - requests = collreq->requests; - - /* test for completion */ - completed = - mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - PTPCOLL_ERROR(("Test for all failed.")); - return rc; - } - - if (!completed) { - return BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -static int mca_bcol_ptpcoll_barrier_setup(mca_bcol_base_module_t *super, int bcoll_type) -{ - netpatterns_k_exchange_node_t *my_exchange_node; - mca_bcol_ptpcoll_module_t * ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) super; - - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = bcoll_type; - - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - comm_attribs.data_src = DATA_SRC_KNOWN; - - switch(mca_bcol_ptpcoll_component.barrier_alg) { - case 1: - if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_barrier_recurs_dbl_extra_new, - 
bcol_ptpcoll_barrier_extra_node_progress); - break; - } - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_barrier_recurs_dbl_new, - bcol_ptpcoll_barrier_recurs_dbl_new_progress); - break; - case 2: - my_exchange_node = &ptpcoll_module->knomial_exchange_tree; - if (my_exchange_node->n_extra_sources > 0 && - EXTRA_NODE == my_exchange_node->node_type) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_barrier_recurs_knomial_extra_new, - bcol_ptpcoll_barrier_extra_node_progress); - break; - } - - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_barrier_recurs_knomial_new, - bcol_ptpcoll_barrier_recurs_knomial_new_progress); - break; - default: - PTPCOLL_ERROR(("Invalid barrier_alg value.")); - } - - return OMPI_SUCCESS; -} - -int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super) -{ - return mca_bcol_ptpcoll_barrier_setup(super, BCOL_SYNC); -} - -int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super) -{ - return mca_bcol_ptpcoll_barrier_setup(super, BCOL_BARRIER); -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.c deleted file mode 100644 index f2b039e3ac..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.c +++ /dev/null @@ -1,2321 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_ptpcoll_bcast.h" -#include "bcol_ptpcoll_utils.h" - -#define K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, \ - my_group_index, group_list, \ - data_buffer, count, tag, comm, send_requests, num_pending_sends) \ -do { \ - int rc = OMPI_SUCCESS; \ - int dst; \ - int comm_dst; \ - *num_pending_sends = 0; \ - \ - while(MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER_CHECK_LEVEL(step_info)) { \ - /* For each level of tree, do sends */ \ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER(my_group_index, \ - radix, step_info, dst); \ - comm_dst = group_list[dst]; \ - \ - /* Non-blocking send .... */ \ - PTPCOLL_VERBOSE(9, ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \ - dst, comm_dst, count, tag, \ - data_buffer)); \ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, \ - comm_dst, tag, \ - MCA_PML_BASE_SEND_STANDARD, comm, \ - &(send_requests[*num_pending_sends]))); \ - PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends])); \ - if( OMPI_SUCCESS != rc ) { \ - PTPCOLL_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - ++(*num_pending_sends); \ - } \ -} while(0) - -#define NARRAY_BCAST_NB(narray_node, process_shift, group_size, \ - data_buffer, count, tag, comm, send_requests, \ - num_pending_sends) \ -do { \ - int n, rc = OMPI_SUCCESS; \ - int dst; \ - int comm_dst; \ - \ - /* Send out data to all relevant children */ \ - for (n = 0; n < narray_node->n_children; n++) { \ - \ - dst = narray_node->children_ranks[n] + process_shift; \ - if (dst >= group_size) { \ - dst -= group_size; \ - } \ - comm_dst = group_list[dst]; \ - \ - /* Non-blocking send ....
*/ \ - PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \ - dst, comm_dst, count, tag, \ - data_buffer)); \ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, \ - comm_dst, tag, \ - MCA_PML_BASE_SEND_STANDARD, comm, \ - &(send_requests[*num_pending_sends]))); \ - if( OMPI_SUCCESS != rc ) { \ - PTPCOLL_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - ++(*num_pending_sends); \ - } \ -} while(0) - - -int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int completed = 0; - int rc; - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - uint32_t buffer_index = input_args->buffer_index; - - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - /* DONE */ - if(completed) { - PTPCOLL_VERBOSE(10, ("bcast root is done")); - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -/* K-nomial tree ( with any root ) algorithm */ -int bcol_ptpcoll_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - int tag; - int rc; - int matched = 0; /* not matched */ - int comm_root = 0; /* no root */ - int i; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int radix = ptpcoll_module->k_nomial_radix; - int root_radix_mask = ptpcoll_module->pow_knum; - int peer = -1; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - int extra_root = -1; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_status_public_t status; - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - netpatterns_knomial_step_info_t step_info = {0, 0, 0}; - - PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* reset requests */ - *active_requests = 0; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - "radix: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, sequence_number, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer, - radix)); - - if (input_args->root_flag) { - 
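The `K_NOMIAL_ROOT_BCAST_NB_NOTEST` and `NARRAY_BCAST_NB` macros above hide the tree arithmetic inside the `MCA_COMMON_NETPATTERNS_*` helpers. The fan-out rule itself is compact: a rank climbs to the level at which it receives, then sends to `radix - 1` peers at every level below it. A self-contained sketch (my naming, and assuming the group size is an exact power of the radix, which is what `pow_knum` guarantees here):

```c
#include <stdio.h>

/* Enumerate the peers `relative_rank` sends to in a k-nomial broadcast
 * over `size` ranks rooted at relative rank 0 (size a power of radix). */
static void knomial_children(int relative_rank, int radix, int size)
{
    int radix_mask = 1;

    /* climb to this rank's receive level: the first level at which
     * relative_rank is not a multiple of radix * radix_mask */
    while (radix_mask < size && 0 == relative_rank % (radix * radix_mask)) {
        radix_mask *= radix;
    }

    /* fan out: radix - 1 children at each level below the receive level */
    while ((radix_mask /= radix) >= 1) {
        for (int j = 1; j < radix; j++) {
            int dst = relative_rank + j * radix_mask;
            if (dst < size) {
                printf("%d -> %d\n", relative_rank, dst);
            }
        }
    }
}
```

For `size = 9` and `radix = 3`, the root emits to 3, 6, 1, 2 and rank 3 forwards to 4 and 5: the `(k - 1) * log base k N` sends that the root comment below describes.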
PTPCOLL_VERBOSE(10, ("I'm root of the data")); - /* - * I'm root of the operation - * send data to (k - 1) * log base k N neighbors - */ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, - ptpcoll_module->pow_knum, my_group_index); - K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, - my_group_index, group_list, - data_buffer, count, tag, comm, send_requests, - active_requests); - - goto ANY_ROOT_KNOMIAL_EXTRA; - } - - /* - * I'm not root, and I don't know to calculate root, so just - * wait for data from ANY_SOURCE, once you get it, proceed like a root - */ - - for (i = 0; i < cm->num_to_probe; i++) { - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, ptpcoll_module->pow_knum, my_group_index); - while(MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER_CHECK_LEVEL(step_info)) { - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER(my_group_index, radix, step_info, peer); - PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d rank %d", - tag, group_list[peer])); - MCA_PML_CALL(iprobe(group_list[peer], tag, - comm, &matched, &status)); - if (matched) { - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_UPDATE_LEVEL_FOR_BCAST(step_info, radix); - break; - } - } - - /* Check of the */ - if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) { - for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) { - PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d rank %d", - tag, group_list[peer])); - MCA_PML_CALL(iprobe(group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag, - comm, &matched, &status)); - if (matched) { - step_info.k_level = root_radix_mask; - extra_root = group_list[ptpcoll_module->kn_proxy_extra_index[i]]; - goto ANY_ROOT_KNOMIAL_BCAST; - } - } - } - } - - /* the function always returns OMPI_SUCCESS, so we don't check return code */ - if (0 == matched) { - PTPCOLL_VERBOSE(10, ("IPROBE was not matched")); - /* No data was received, return no match error */ - return BCOL_FN_NOT_STARTED; - } - - /* set the source of data */ - comm_root = status.MPI_SOURCE; - - PTPCOLL_VERBOSE(10, ("A. step info %d %d %d", step_info.k_level, step_info.k_step, step_info.k_tmp_peer)); - - /* Bcast the data */ - PTPCOLL_VERBOSE(10, ("Starting data bcast")); - -ANY_ROOT_KNOMIAL_BCAST: - /* Post receive that will fetch the data */ - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p", - comm_root, count, tag, data_buffer)); - - rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, comm_root, tag, comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - PTPCOLL_VERBOSE(10, ("Bcast, Data was received")); - - /* Sending forward the data over K-nomial tree */ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, step_info.k_level, my_group_index); - - PTPCOLL_VERBOSE(10, ("B. 
step info %d %d %d", step_info.k_level, step_info.k_step, step_info.k_tmp_peer)); - K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, - my_group_index, group_list, - data_buffer, count, tag, comm, send_requests, - active_requests); - -ANY_ROOT_KNOMIAL_EXTRA: - /* Proxy node but NOT virtual root */ - if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) { - for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) { - if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root) - continue; - - PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i])); - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(send_requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - } - - if (*active_requests > 0) { - matched = - mca_bcol_ptpcoll_test_all_for_match - (active_requests, send_requests, &rc); - } - - /* If it is last call, we have to recycle memory */ - if(matched) { - PTPCOLL_VERBOSE(10, ("bcast root is done")); - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -static int bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int i; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - ompi_status_public_t status; - - PTPCOLL_VERBOSE(3, ("Knomial Anyroot, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - ,buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer - )); - - /* we have a power 2 group */ - if (input_args->root_flag) { - - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data, v root %d", ptpcoll_module->kn_proxy_extra_index[0])); - /* send the all data to your proxy peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - 
group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - /* we have to store the iteration number somewhere */ - PTPCOLL_VERBOSE(10, ("Extra was started")); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } else { - for (i = 0; i < cm->num_to_probe && - 0 == completed; i++) { - MCA_PML_CALL(iprobe(group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1, - comm, &completed, &status)); - } - if (0 == completed) { - /* No data was received */ - return BCOL_FN_NOT_STARTED; - } - - /* the data is ready */ - rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1, - comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - } - - PTPCOLL_VERBOSE(10, ("Extra was done")); - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int rc; - int completed = 0; /* not completed */ - int i; - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; - uint32_t buffer_index = input_args->buffer_index; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_status_public_t status; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - /* keep tag within the limit support by the pml */ - int tag = -((PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask)); - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress extra, was called, tag %d\n", tag)); - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - } else { - for (i = 0; i < cm->num_to_probe && - 0 == completed; i++) { - MCA_PML_CALL(iprobe(group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1, - comm, &completed, &status)); - } - if (0 == completed) { - return BCOL_FN_STARTED; - } - /* the data is ready */ - - rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1, - comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - } - - /* Done */ - return BCOL_FN_COMPLETE; \ -} - -/* Know root means that we know exactly the source of data and we do not have to check multiple - * sources - */ - -#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \ - do { \ - int relative_rank = (my_group_index >= group_root) ? my_group_index - group_root : \ - my_group_index - group_root + group_size; \ - \ - radix_mask = 1; \ - while (radix_mask < group_size) { \ - if (relative_rank % (radix * radix_mask)) { \ - data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \ - if (data_src >= group_size) data_src -= group_size; \ - break; \ - } \ - radix_mask *= radix; \ - } \ - } while (0) - - -int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc = OMPI_SUCCESS; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int radix = ptpcoll_module->k_nomial_radix; - int radix_mask; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - int group_root_index = 0; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - ompi_request_t **recv_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int completed = 0; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - PTPCOLL_VERBOSE(3, ("BCAST Know root, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_known_root_progress, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - "radix: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, sequence_number, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer, - radix)); - - if (input_args->root_flag) { - /* Check for completion */ - assert(*active_requests > 0); - PTPCOLL_VERBOSE(10, ("Requests %d", *active_requests)); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - } else { - /* No data was received. 
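`K_NOMIAL_DATA_SRC` above is the exact inverse of the fan-out sketched earlier: rotate the group so the root sits at relative rank 0, climb until the relative rank stops being a multiple of `radix * radix_mask`, and truncate to that level's multiple to find the parent. The same logic as a plain function (my naming):

```c
/* Parent (data source) of my_index in a k-nomial broadcast over `size`
 * ranks rooted at `root`; mirrors the K_NOMIAL_DATA_SRC macro above. */
static int knomial_data_src(int my_index, int root, int radix, int size)
{
    int relative = (my_index >= root) ? my_index - root
                                      : my_index - root + size;
    int radix_mask = 1;

    while (radix_mask < size) {
        if (relative % (radix * radix_mask)) {
            int src = relative / (radix * radix_mask) * (radix * radix_mask)
                      + root;
            return (src >= size) ? src - size : src;
        }
        radix_mask *= radix;
    }
    return root;   /* my_index is the root itself */
}
```

For example, with `size = 9`, `radix = 3`, `root = 4`, rank 8 (relative 4) resolves to source 7 (relative 3), which in turn receives from the root.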
Waiting for data */ - if (0 == (*active_requests)) { - int extra_root = -1; - netpatterns_knomial_step_info_t step_info; - /* We can not block. So run couple of test for data arrival */ - if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched (active request %d)", - *active_requests)); - /* No data was received, return no match error */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - - radix_mask = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask; - group_root_index = input_args->root_route->rank; - - PTPCOLL_VERBOSE(10, ("Test was matched - radix %d", radix_mask)); - /* Bcast the data */ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, - radix_mask, my_group_index); - K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, - my_group_index, group_list, - data_buffer, count, tag, comm, send_requests, - active_requests); - - if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) { - int i; - if (radix_mask == ptpcoll_module->pow_knum) { - extra_root = group_root_index; - } - for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) { - if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root) - continue; - PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i])); - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(send_requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - } - if (*active_requests > 0) { - completed = mca_bcol_ptpcoll_test_all_for_match - (active_requests, send_requests, &rc); - } else { - completed = 1; - } - } else { - /* Data was received and sent out, check for completion */ - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Test was not matched (active request %d)", - *active_requests)); - return OMPI_ERROR; - } - } - } - /* DONE */ - if(completed) { - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int comm_root; - int data_src = -1; - int group_root_index; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int radix = ptpcoll_module->k_nomial_radix; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - ompi_request_t **recv_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int matched = 0; - int k_level, logk_level; - int extra_root = -1; - netpatterns_knomial_step_info_t step_info; - - PTPCOLL_VERBOSE(3, ("BCAST Know root, index_this_type %d, num_of_this_type %d", - 
const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* reset active request counter */ - (*active_requests) = 0; - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_known_root, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - "radix: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer, - radix)); - - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm root of the data")); - /* - * I'm root of the operation - * send data to (k - 1) * log base k N neighbors - */ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, - ptpcoll_module->pow_knum, my_group_index); - K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, - my_group_index, group_list, - data_buffer, count, tag, comm, send_requests, - active_requests); - goto KNOWN_ROOT_KNOMIAL_BCAST_EXTRA; - } - - /* I'm not root */ - group_root_index = input_args->root_route->rank; - - /* If Proxy node, check if extra node is root */ - PTPCOLL_VERBOSE(10, ("Check if I virtual root, groop root %d group_size_pow %d type %d\n", - group_root_index, ptpcoll_module->pow_knum , ptpcoll_module->pow_ktype)); - if (group_root_index >= ptpcoll_module->pow_knum) { - /* Chech if the rank is virtual root */ - int virtual_root = (group_root_index - - ptpcoll_module->pow_knum) / (radix - 1); - - if (my_group_index == virtual_root) { - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, - ptpcoll_module->pow_knum, my_group_index); - k_level = ptpcoll_module->pow_knum; - comm_root = group_list[group_root_index]; - extra_root = group_root_index; - PTPCOLL_VERBOSE(10, ("Im virtual root klevel %d, comm_root %d vroot %d\n", - k_level, comm_root, virtual_root)); - goto KNOWN_ROOT_KNOMIAL_BCAST; - } else { - /* set virtual root as real root of the group */ - group_root_index = virtual_root; - PTPCOLL_VERBOSE(10, ("My virtual root vroot %d\n", group_root_index)); - } - } - - data_src = netpatterns_get_knomial_data_source( - my_group_index, group_root_index, radix, ptpcoll_module->pow_knum, - &k_level, &logk_level); - - comm_root = group_list[data_src]; - -KNOWN_ROOT_KNOMIAL_BCAST: - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p", - comm_root, data_src, count, tag, data_buffer)); - - rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, comm_root, tag, comm, recv_request)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - /* We can not block. So run couple of test for data arrival */ - if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - /* cache the radix mask for future progress */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = k_level; - /* No data was received, return no match error */ - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - /* Bcast the data */ - MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info, - k_level, my_group_index); - - K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, - my_group_index, group_list, - data_buffer, count, tag, comm, send_requests, - active_requests); - -KNOWN_ROOT_KNOMIAL_BCAST_EXTRA: - /* Proxy node but NOT virtual root */ - if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) { - int i; - for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) { - if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root) - continue; - - PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i])); - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(send_requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - } - - if (*active_requests > 0) { - matched = - mca_bcol_ptpcoll_test_all_for_match - (active_requests, send_requests, &rc); - } else { - matched = 1; - } - - /* If it is last call, we have to recycle memory */ - if(matched) { - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int i; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - ompi_status_public_t status; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - "radix: %d" , - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer, - 2 - )); - - /* we have a power 2 group */ - if (input_args->root_flag) { - - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); - /* send the all data to your proxy peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - 
group_list[ptpcoll_module->proxy_extra_index], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } else { - for (i = 0; i < cm->num_to_probe && - 0 == completed; i++) { - MCA_PML_CALL(iprobe(group_list[ptpcoll_module->proxy_extra_index], tag - 1, - comm, &completed, &status)); - } - if (0 == completed) { - /* No data was received */ - return BCOL_FN_NOT_STARTED; - } - - /* the data is ready */ - rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], tag - 1, - comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - } - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int rc; - int completed = 0; /* not completed */ - int i; - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; - uint32_t buffer_index = input_args->buffer_index; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_status_public_t status; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - /* keep tag within the limit support by the pml */ - int tag = -((PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask)); - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress extra, was called, tag %d\n", tag)); - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - } else { - for (i = 0; i < cm->num_to_probe && - 0 == completed; i++) { - MCA_PML_CALL(iprobe(group_list[ptpcoll_module->proxy_extra_index], tag - 1, - comm, &completed, &status)); - } - if (0 == completed) { - return BCOL_FN_STARTED; - } - /* the data is ready */ - - rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], tag - 1, - comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - } - - /* Done */ - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int rc; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / - ptpcoll_module->pow_2num; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int *status = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status; - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_2: %d %d " - "buff: %p " - "radix: %d" - "block_size: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, 0, - input_args->root_flag, - ptpcoll_module->pow_2, ptpcoll_module->pow_2num, - data_buffer, - 2, - base_block_size)); - - switch(*status) { - case PTPCOLL_GATHER_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - ++(*iteration); /* start from next iteration */ - PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration)); - break; - case PTPCOLL_EXTRA_SEND_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - return BCOL_FN_COMPLETE; - default: - PTPCOLL_VERBOSE(10, ("Unknown status %d", *status)); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("Stating PR_GATHER")); - /* Gather, continue the recoursive doubling iterations */ - rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer, - count, base_block_size); - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - PTPCOLL_VERBOSE(10, ("PR_GATHER done")); - - /* it the process is proxy , it has to send full - message to remote peer */ - if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && - ! 
CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( - ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->proxy_extra_index, comm, - active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - /* return */ - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *radix_mask_pow = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / - ptpcoll_module->pow_2num; - int root_pow2 = ptpcoll_module->pow_2 - 1; - int *status = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status; - - PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - /* set initial status */ - *status = PTPCOLL_NOT_STARTED; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_2: %d %d " - "buff: %p " - "radix: %d" - "block_size: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, sequence_number, - input_args->root_flag, - ptpcoll_module->pow_2, ptpcoll_module->pow_2num, - data_buffer, - 2, - base_block_size)); - - /* we have a power 2 group */ - if (input_args->root_flag) { - - PTPCOLL_VERBOSE(10, ("I'm root of the data")); - /* for proxy we have little bit more work to do */ - if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) { - /* send the all data to your extra peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], - tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - /* - * I'm root of the operation - * send data to (k - 1) * log base k N neighbors - */ - *radix_mask_pow = ptpcoll_module->pow_2; - - K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(root_pow2, - 
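The `GATHER` label below marks the second half of the scatter/allgather scheme: the binomial scatter leaves each of the `2^pow_2` ranks holding one `base_block_size` block, and `pow_2` recursive-doubling rounds then recombine them, doubling the exchanged run each round. A blocking sketch under simplifying assumptions (exact power-of-two group, `count` an exact multiple of the base block; the deleted code is non-blocking, resumes via `*iteration`, and clips the final block to `count`; names are mine):

```c
#include <mpi.h>
#include <stddef.h>

/* Allgather half of the scatter/allgather broadcast: at round s each
 * rank swaps its accumulated run of 2^s blocks with partner rank ^ 2^s,
 * so everyone holds all 2^pow_2 blocks after pow_2 rounds. */
static void binomial_allgather(unsigned char *buf, size_t base,
                               int my_rank, int pow_2,
                               const int *group_to_comm,
                               int tag, MPI_Comm comm)
{
    for (int s = 0; s < pow_2; s++) {
        int partner  = my_rank ^ (1 << s);
        int my_blk   = my_rank & ~((1 << s) - 1);      /* my run start   */
        int peer_blk = partner & ~((1 << s) - 1);      /* partner's run  */
        int nbytes   = (int)(base * ((size_t) 1 << s)); /* doubles/round */

        MPI_Sendrecv(buf + base * (size_t) my_blk,  nbytes, MPI_BYTE,
                     group_to_comm[partner], tag,
                     buf + base * (size_t) peer_blk, nbytes, MPI_BYTE,
                     group_to_comm[partner], tag,
                     comm, MPI_STATUS_IGNORE);
    }
}
```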
my_group_index, group_size, group_list, - data_buffer, base_block_size, count, tag, comm, requests, - active_requests); - - goto GATHER; - } - - /* <-- non root flow --> */ - rc = bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(ptpcoll_module, buffer_index, - data_buffer, count, base_block_size); - if (BCOL_FN_COMPLETE != rc) { - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - -GATHER: - *iteration = 0; - *status = PTPCOLL_GATHER_STARTED; - rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, - data_buffer, count, base_block_size); - - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - - ++(*iteration); /* I need it for progress */ - - /* proxy case */ - if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && - ! CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->proxy_extra_index, comm, - active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int rc; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / - ptpcoll_module->pow_2num; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_known_progress, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_2: %d %d " - "buff: %p " - "radix: %d" - "block_size: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, 0, - input_args->root_flag, - ptpcoll_module->pow_2, ptpcoll_module->pow_2num, - data_buffer, - 2, - base_block_size)); - - switch(*status) { - case PTPCOLL_WAITING_FOR_DATA: - PTPCOLL_VERBOSE(10, ("Probe for the data")); - rc = bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(ptpcoll_module, buffer_index, - data_buffer, count, base_block_size); - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - *iteration = 0; - *status = PTPCOLL_GATHER_STARTED; - break; - case PTPCOLL_GATHER_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - ++(*iteration); /* start from next iteration */ - PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration)); - break; - case PTPCOLL_EXTRA_SEND_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - return BCOL_FN_COMPLETE; - default: - PTPCOLL_VERBOSE(10, ("Unknown status %d", *status)); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("Stating PR_GATHER")); - /* Gather, continue the recoursive doubling iterations */ - rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer, - count, base_block_size); - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - PTPCOLL_VERBOSE(10, ("PR_GATHER done")); - - /* it the process is proxy , it has to send full - message to remote peer */ - if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && - ! CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( - ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->proxy_extra_index, comm, - active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - - /* return */ - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_src, comm_root; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int pow2_distance; - void *curr_data_buffer; - int recv_count; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *radix_mask_pow = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / - ptpcoll_module->pow_2num; - int root_pow2 = ptpcoll_module->pow_2 - 1; - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - - PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - 
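The non-root receive a few lines below sizes itself from binomial-tree geometry: `base_block_size` is the ceiling of `count / pow_2num`, and a rank at distance `pow2_distance` from the (virtual) root receives the `2^pow2_distance` consecutive base blocks that cover its own subtree, placed at the subtree's left boundary rank. The same arithmetic as a small helper (hypothetical name):

```c
#include <stddef.h>

/* Receive window for a rank at distance d below the root in the
 * binomial scatter: 2^d base blocks, starting at the subtree's left
 * boundary (my_index with the low d bits cleared). */
static void known_root_recv_window(int my_index, int pow2_distance,
                                   size_t base_block_size,
                                   size_t *offset, size_t *len)
{
    /* same mask as the original's (~(int)0) << pow2_distance */
    int left_boundary = my_index & -(1 << pow2_distance);

    *offset = base_block_size * (size_t) left_boundary;
    *len    = base_block_size * ((size_t) 1 << pow2_distance);
}
```

The negative `pow2_distance` branch is the virtual-root case: that rank receives the full `count` into the start of the buffer and then proceeds exactly as the root would.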
*iteration = -1; - /* set initial status */ - *status = PTPCOLL_NOT_STARTED; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_known, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_2: %d %d " - "buff: %p " - "radix: %d" - "block_size: %d", - buffer_index, tag, - ptpcoll_module->tag_mask, sequence_number, - input_args->root_flag, - ptpcoll_module->pow_2, ptpcoll_module->pow_2num, - data_buffer, - 2, - base_block_size)); - - /* we have a power 2 group */ - if (input_args->root_flag) { - - PTPCOLL_VERBOSE(10, ("I'm root of the data")); - /* for proxy we have little bit more work to do */ - if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) { - /* send the all data to your extra peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - *active_requests = 1; - } - /* - * I'm root of the operation - * send data to (k - 1) * log base k N neighbors - */ - K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(root_pow2, - my_group_index, group_size, group_list, - data_buffer, base_block_size, count, tag, comm, requests, - active_requests); - - /* EXIT OR GO TO Gather */ - *iteration = 0; - *radix_mask_pow = ptpcoll_module->pow_2; - goto GATHER; - } - - /* <-- non root flow --> */ - /* prapare and post recv operation */ - group_src = bcol_ptpcoll_binomial_root_to_src(input_args->root_route->rank, - my_group_index, ptpcoll_module->pow_2num, - ptpcoll_module->group_size, &pow2_distance); - - assert(group_src >= 0); - - if (0 > pow2_distance) { - /* the rank is virtual root for this group, receive the data - and scatter gather as root */ - PTPCOLL_VERBOSE(10, ("Virtual root %d , set mask to %d", my_group_index, ptpcoll_module->pow_2)); - *radix_mask_pow = ptpcoll_module->pow_2; - curr_data_buffer = data_buffer; - recv_count = count; - } else { - int my_left_boundary_rank; - recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */ - my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance ); - curr_data_buffer = (void *)((unsigned char *)data_buffer + - (size_t) base_block_size * my_left_boundary_rank); - *radix_mask_pow = pow2_distance; - } - - comm_root = group_list[group_src]; - - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p", - comm_root, group_src, count, tag, data_buffer)); - - rc = MCA_PML_CALL(irecv(curr_data_buffer, recv_count, MPI_BYTE, comm_root, - tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - - *status = PTPCOLL_WAITING_FOR_DATA; - rc = bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(ptpcoll_module, - buffer_index, data_buffer, count, base_block_size); - - if (BCOL_FN_COMPLETE != rc) { - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - - /* recv operation is done */ - - *iteration = 0; - -GATHER: - - *status = PTPCOLL_GATHER_STARTED; - rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, - data_buffer, count, base_block_size); - - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. 
Return %d", rc)); - return rc; - } - - ++(*iteration); /* I need it for progress */ - - /* proxy case */ - if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && - ! CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( - ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->proxy_extra_index, comm, - active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(3, ("BCAST known root, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n" - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "pow_k: %d %d " - "buff: %p " - "radix: %d" , - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - ptpcoll_module->pow_k, ptpcoll_module->pow_knum, - data_buffer, - 2 - )); - - /* we have a power 2 group */ - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); - /* send the all data to your proxy peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } else { - rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->proxy_extra_index], - tag - 1, comm, &requests[*active_requests])); - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int rc; - int completed = 0; /* not completed */ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_known_root_extra_progress extra, was called\n")); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress( - bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int rc; - int completed = 0; /* not completed */ - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - uint32_t buffer_index = input_args->buffer_index; - - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - int relative_group_index, - group_root_index = 0; - int group_size = ptpcoll_module->full_narray_tree_size; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "root: %d " - "buff: %p " - "radix: %d" - , buffer_index, tag, - ptpcoll_module->tag_mask, - input_args->root_flag, - data_buffer, - ptpcoll_module->narray_knomial_proxy_num - )); - - if (input_args->root_flag || - /* virtual root case */ - (input_args->root_route->rank >= group_size && - my_group_index == (input_args->root_route->rank - group_size) / - mca_bcol_ptpcoll_component.narray_knomial_radix)) { - relative_group_index = 0; - group_root_index = my_group_index; - } else { - if (input_args->root_route->rank >= group_size) { - group_root_index = (input_args->root_route->rank - group_size) / - mca_bcol_ptpcoll_component.narray_knomial_radix; - } else { - group_root_index = input_args->root_route->rank; - } - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index += group_size; - } - } - - switch(*status) { - case PTPCOLL_WAITING_FOR_DATA: - PTPCOLL_VERBOSE(10, ("Probe for the data")); - rc = bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(ptpcoll_module, - buffer_index, data_buffer, count, group_root_index, - relative_group_index); - - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. 
Return %d", rc)); - return rc; - } - *iteration = 0; - *status = PTPCOLL_GATHER_STARTED; - break; - case PTPCOLL_ROOT_SEND_STARTED: - case PTPCOLL_GATHER_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - ++(*iteration); /* start from next iteration */ - PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration)); - break; - case PTPCOLL_EXTRA_SEND_STARTED: - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - return BCOL_FN_COMPLETE; - default: - PTPCOLL_VERBOSE(10, ("Unknown status %d", *status)); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("Stating PR_GATHER")); - /* Gather, continue the recoursive doubling iterations */ - rc = bcol_ptpcoll_bcast_narray_knomial_gather(ptpcoll_module, - buffer_index, data_buffer, count, - relative_group_index); - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - PTPCOLL_VERBOSE(10, ("PR_GATHER done")); - - /* it the process is proxy , it has to send full - message to remote peer */ - if ((PTPCOLL_PROXY & ptpcoll_module->narray_type) && - !input_args->root_flag) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_send_n_extra( - ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->narray_knomial_proxy_extra_index, - ptpcoll_module->narray_knomial_proxy_num, - input_args->root_route->rank, - comm, active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - - /* return */ - return BCOL_FN_COMPLETE; -} - - -static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag, rc, i; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int data_src, offset, - comm_root; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - void *curr_data_buffer; - int recv_count; - uint64_t sequence_number = input_args->sequence_num; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - size_t base_block_size = 0; - int *status = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); - int relative_group_index, - group_root_index; - int group_size = ptpcoll_module->full_narray_tree_size; - int completed = 0; - int virtual_root; - netpatterns_narray_knomial_tree_node_t *narray_knomial_node = NULL; - netpatterns_narray_knomial_tree_node_t *narray_node = NULL; - - PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, 
num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; - /* reset radix mask, it used to keep last block size */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; - /* reset active requests */ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - /* set initial status */ - *status = PTPCOLL_NOT_STARTED; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "buff: %p " - "radix: %d" - ,buffer_index, tag, - ptpcoll_module->tag_mask, sequence_number, - input_args->root_flag, - data_buffer, - ptpcoll_module->narray_knomial_proxy_num - )); - - /* we have a power 2 group */ - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm root of the data")); - narray_knomial_node = &ptpcoll_module->narray_knomial_node[0]; - relative_group_index = 0; - group_root_index = my_group_index; - - /* for proxy we have little bit more work to do */ - if (PTPCOLL_PROXY & ptpcoll_module->narray_type) { - /* send the all data to your extra peer */ - for (i = 0; i < ptpcoll_module->narray_knomial_proxy_num; ++i) { - PTPCOLL_VERBOSE(9, ("Extra send %d, dst %d, tag %d", - i, ptpcoll_module->narray_knomial_proxy_extra_index[i], tag - 1)); - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->narray_knomial_proxy_extra_index[i]], - tag - 1, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - } - /* - * I'm root of the operation - * send data to radix_k neighbors - */ - base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module, - narray_knomial_node->level_size); - - NARRAY_SCATTER_B(narray_knomial_node, my_group_index, - group_size, data_buffer, - base_block_size, count, tag, comm, requests, - active_requests, completed); - if (0 == completed) { - *status = PTPCOLL_ROOT_SEND_STARTED; - return BCOL_FN_STARTED; - } - goto EXIT; - } - - /* <-- non root flow --> */ - group_root_index = input_args->root_route->rank; - - if (group_root_index >= group_size) { - /* calculate virtual root */ - virtual_root = - (group_root_index - group_size) / - mca_bcol_ptpcoll_component.narray_knomial_radix; - if (my_group_index == virtual_root) { - PTPCOLL_VERBOSE(10, ("I'm virtual root of the data")); - - rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, - group_list[group_root_index], - tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - ++(*active_requests); - /* act like a root */ - relative_group_index = 0; - group_root_index = my_group_index; - goto SCATTER; - } - group_root_index = virtual_root; - } - - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index += group_size; - } - - narray_node = &ptpcoll_module->narray_knomial_node[relative_group_index]; - - data_src = narray_node->parent_rank + group_root_index; - if (data_src >= group_size) { - data_src -= group_size; - } - - comm_root = 
group_list[data_src]; - - recv_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size); - offset = recv_count * narray_node->rank_on_level; - /* make sure that we do not overun memory */ - if (OPAL_UNLIKELY(offset + recv_count > count)) { - recv_count = count - offset; - if (0 >= recv_count) { - goto GATHER; - } - } - - curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset); - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p len %d offset %d", - comm_root, data_src, count, tag, data_buffer, recv_count, offset)); - - rc = MCA_PML_CALL(irecv(curr_data_buffer, recv_count, MPI_BYTE, comm_root, - tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - ++(*active_requests); - -SCATTER: - *status = PTPCOLL_WAITING_FOR_DATA; - - rc = bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(ptpcoll_module, - buffer_index, data_buffer, - count, group_root_index, relative_group_index); - - if (BCOL_FN_COMPLETE != rc) { - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - -GATHER: - /* recv operation is done */ - *iteration = 0; - *status = PTPCOLL_GATHER_STARTED; - rc = bcol_ptpcoll_bcast_narray_knomial_gather(ptpcoll_module, - buffer_index, data_buffer, count, - relative_group_index); - if (BCOL_FN_COMPLETE != rc) { - assert(0 != *active_requests); - PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); - return rc; - } - - ++(*iteration); /* I need it for progress */ - - /* proxy case */ - if ((PTPCOLL_PROXY & ptpcoll_module->narray_type) && - ! input_args->root_flag) { - *status = PTPCOLL_EXTRA_SEND_STARTED; - rc = bcol_ptpcoll_send_n_extra( - ptpcoll_module, - data_buffer, count, tag - 1, - ptpcoll_module->narray_knomial_proxy_extra_index, - ptpcoll_module->narray_knomial_proxy_num, - input_args->root_route->rank, - comm, active_requests, requests); - if (BCOL_FN_COMPLETE != rc) { - return rc; - } - } - -EXIT: - return BCOL_FN_COMPLETE; -} - -/* Pasha : need to move this code to some common function */ -static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int completed = 0; /* not completed */ - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *iteration = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(3, ("BCAST known root, index_this_type %d, num_of_this_type %d", - const_args->index_of_this_type_in_collective + 1, - const_args->n_of_this_type_in_collective)); - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* reset active requests 
*/ - *active_requests = 0; - /* reset iteration counter */ - *iteration = -1; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "buff: %p " - ,buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - data_buffer - )); - - /* we have a power 2 group */ - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); - /* send the all data to your proxy peer */ - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->narray_knomial_proxy_extra_index[0]], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } else { - PTPCOLL_VERBOSE(9, ("Posting recive from %d tag %d", - ptpcoll_module->narray_knomial_proxy_extra_index[0], tag - 1)); - rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, - group_list[ptpcoll_module->narray_knomial_proxy_extra_index[0]], - tag - 1, comm, &requests[*active_requests])); - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_bcast_known_root_extra_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - int rc; - int completed = 0; /* not completed */ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; - uint32_t buffer_index = input_args->buffer_index; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_known_root_extra_progress extra, was called\n")); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - PTPCOLL_VERBOSE(10, ("Test was matched - %d", rc)); - return BCOL_FN_COMPLETE; -} - - -static int bcol_ptpcoll_bcast_narray_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag = -1; - int rc; - int group_size = ptpcoll_module->group_size; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - ompi_request_t **recv_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int matched = true; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int relative_group_index = 0; - netpatterns_tree_node_t *narray_node = NULL; - - PTPCOLL_VERBOSE(3, ("Bcast, Narray tree Progress")); - - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_known_root, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d [%d]" - "buff: %p ", - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, input_args->root_route->rank, - data_buffer)); - - if (0 == *active_requests) { - int group_root_index = input_args->root_route->rank; - /* If the collective does not have any active requests, it - means the initial data was not received from parent. - Check if some data arrived - */ - if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - /* No data was received, return no match error */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - - /* set all paremetres */ - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index +=group_size; - } - narray_node = &ptpcoll_module->narray_node[relative_group_index]; - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - /* Bcast the data */ - NARRAY_BCAST_NB(narray_node, group_root_index, group_size, - data_buffer, count, tag, comm, send_requests, active_requests); - } - - /* All data was received and sent out. 
- Check if the completion arrived */ - matched = mca_bcol_ptpcoll_test_all_for_match - (active_requests, send_requests, &rc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - /* If this is the last call, we have to recycle memory */ - if(matched) { - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -static int bcol_ptpcoll_bcast_narray(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int data_src; - int group_size = ptpcoll_module->group_size; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - uint32_t buffer_index = input_args->buffer_index; - - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **send_requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - ompi_request_t **recv_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - void *data_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - int count = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int matched = true; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_root_index; - int relative_group_index = 0; - netpatterns_tree_node_t *narray_node = NULL; - - PTPCOLL_VERBOSE(3, ("Bcast, Narray tree")); - - /* reset active request counter */ - (*active_requests) = 0; - /* keep tag within the limit supported by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "buff: %p ", - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - data_buffer)); - - - if (input_args->root_flag) { - PTPCOLL_VERBOSE(10, ("I'm root of the data")); - narray_node = &ptpcoll_module->narray_node[0]; - group_root_index = my_group_index; - /* - * I'm root of the operation - * send data to N children - */ - goto NARRAY_BCAST_START; - } - - /* I'm not root */ - group_root_index = input_args->root_route->rank; - - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index += group_size; - } - - data_src = - ptpcoll_module->narray_node[relative_group_index].parent_rank + - group_root_index; - if (data_src >= group_size) { - data_src -= group_size; - } - - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d [%d], count %d, tag %d, addr %p", - group_list[data_src], data_src, - count, tag, data_buffer)); - - - rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, - group_list[data_src], - tag, comm, recv_request)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - /* We cannot block, so run a couple of tests for data arrival */ - if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - /* No data was received, return no match error */ - return (OMPI_SUCCESS != rc) ?
rc : BCOL_FN_STARTED; - } - - narray_node = &ptpcoll_module->narray_node[relative_group_index]; - -NARRAY_BCAST_START: - /* Bcast the data */ - NARRAY_BCAST_NB(narray_node, group_root_index, group_size, - data_buffer, count, tag, comm, send_requests, active_requests); - - matched = mca_bcol_ptpcoll_test_all_for_match - (active_requests, send_requests, &rc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - /* If it is last call, we have to recycle memory */ - if(matched) { - return BCOL_FN_COMPLETE; - } else { - PTPCOLL_VERBOSE(10, ("bcast root is started")); - return BCOL_FN_STARTED; - } -} - -int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) super; - - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - comm_attribs.bcoll_type = BCOL_BCAST; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - - if(PTPCOLL_KN_EXTRA == ptpcoll_module->pow_ktype) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot, - bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress); - } else { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_k_nomial_anyroot, - bcol_ptpcoll_bcast_k_nomial_anyroot_progress); - } - - comm_attribs.data_src = DATA_SRC_KNOWN; - switch(mca_bcol_ptpcoll_component.bcast_small_messages_known_root_alg) { - case PTPCOLL_KNOMIAL: - if(PTPCOLL_KN_EXTRA == ptpcoll_module->pow_ktype) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot, - bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress); - } else { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_k_nomial_known_root, - bcol_ptpcoll_bcast_k_nomial_known_root_progress); - } - break; - case PTPCOLL_NARRAY: - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_narray, - bcol_ptpcoll_bcast_narray_progress); - break; - default: - PTPCOLL_ERROR(("Unknown algorithm index was selected %", - mca_bcol_ptpcoll_component.bcast_small_messages_known_root_alg)); - return OMPI_ERROR; - } - - comm_attribs.data_src = DATA_SRC_UNKNOWN; - inv_attribs.bcol_msg_min = 10000000; - inv_attribs.bcol_msg_max = 10485760; /* range 4 */ - - /* Anyroot large messages functions registration */ - - if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra, - bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress); - } else { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot, - bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress); - } - - /* Known-root large messages functions registration */ - - comm_attribs.data_src = DATA_SRC_KNOWN; - switch(mca_bcol_ptpcoll_component.bcast_large_messages_known_root_alg) { - case PTPCOLL_BINOMIAL_SG: - if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - 
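/* extra ranks outside the power-of-two subgroup only exchange the whole message with their proxy, so the generic extra progress routine is registered for them */ -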
bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra, - bcol_ptpcoll_bcast_known_root_extra_progress); - /* bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress); */ - } else { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root, - bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress); - } - break; - case PTPCOLL_NARRAY_KNOMIAL_SG: - if (PTPCOLL_EXTRA == ptpcoll_module->narray_type) { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra, - bcol_ptpcoll_bcast_known_root_extra_progress); - } else { - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root, - bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress); - } - break; - default: - PTPCOLL_ERROR(("Unknown algorithm index was selected %", - mca_bcol_ptpcoll_component.bcast_large_messages_known_root_alg)); - return OMPI_ERROR; - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h deleted file mode 100644 index 4e0581e350..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h +++ /dev/null @@ -1,868 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_PTPCOLL_BCAST_H -#define MCA_BCOL_PTPCOLL_BCAST_H - -#include "ompi_config.h" -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" - -BEGIN_C_DECLS - -int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super); - -int bcol_ptpcoll_bcast_k_nomial_anyroot (bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); -int 
bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - - -/* macros */ -#define K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER( \ - radix_mask_pow, \ - my_group_index, group_size, group_list, \ - data_buffer, segment_size, count, tag, \ - comm, send_requests, num_pending_sends) \ -do { \ - int rc = OMPI_SUCCESS; \ - int dst; \ - int comm_dst; \ - int send_size; \ - int send_offset; \ - int delta; \ - int dst_boundary_rank; \ - int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0; \ - \ - while(radix_mask_pow >= 0) { \ - /* For each level of tree, do sends */ \ - dst = my_group_index ^ radix_mask; \ - comm_dst = group_list[dst]; \ - \ - dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow)); \ - \ - send_offset = segment_size * dst_boundary_rank; \ - /* Pasha: make sure that we handle the corner cases */ \ - delta = count - send_offset; \ - if (delta <= 0) { \ - send_size = 0; /* we have to send something, other way it will hang */ \ - } else { \ - /* the tail case */ \ - send_size = (int) \ - (delta - (int)segment_size * radix_mask) < 0 ? delta : \ - (int)segment_size * radix_mask; \ - } \ - \ - /* Non blocking send .... */ \ - PTPCOLL_VERBOSE(9 , \ - ("Bcast p2s, Isend to %d[%d],count %d,tag %d,addr %p [%p] send_size %d,send_offset %d, radix %d %d",\ - dst, comm_dst, count, tag, \ - data_buffer, (void *)((unsigned char *)data_buffer + (size_t)send_offset), \ - send_size, \ - send_offset, \ - radix_mask, \ - radix_mask_pow \ - )); \ - rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + (size_t)send_offset), \ - send_size, MPI_BYTE, \ - comm_dst, tag, \ - MCA_PML_BASE_SEND_STANDARD, comm, \ - &(send_requests[*num_pending_sends]))); \ - PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends])); \ - if( OMPI_SUCCESS != rc ) { \ - PTPCOLL_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - ++(*num_pending_sends); \ - radix_mask >>= 1; \ - radix_mask_pow--; \ - } \ -} while(0) - -#define NARRAY_SCATTER_NB(narray_node, process_shift, group_size, \ - data_buffer, base_block_size, count, tag, comm, send_requests, \ - num_pending_sends) \ -do { \ - int n, rc = OMPI_SUCCESS; \ - int dst; \ - int comm_dst; \ - int offset; \ - int size_count = count; \ - \ - /* Send out data to all relevant childrens */ \ - for (n = 0; n < narray_node->n_children && size_count > 0; n++) { \ - \ - dst = narray_node->children_ranks[n] + process_shift; \ - if (dst >= group_size) { \ - dst -= group_size; \ - } \ - \ - comm_dst = group_list[dst]; \ - offset = n * base_block_size; \ - size_count -= base_block_size; \ - if (OPAL_UNLIKELY(size_count < 0)) { \ - count = base_block_size + size_count; \ - } else { \ - count = base_block_size; \ - } \ - \ - /* Non blocking send .... 
*/ \ - PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \ - dst, comm_dst, count, tag, \ - data_buffer)); \ - rc = MCA_PML_CALL(isend((void *)((char *)data_buffer + (size_t)offset), count, MPI_BYTE,\ - comm_dst, tag, \ - MCA_PML_BASE_SEND_STANDARD, comm, \ - &(send_requests[*num_pending_sends]))); \ - if( OMPI_SUCCESS != rc ) { \ - PTPCOLL_VERBOSE(10, ("Failed to isend data")); \ - return OMPI_ERROR; \ - } \ - ++(*num_pending_sends); \ - } \ -} while(0) - -#define NARRAY_SCATTER_B(narray_node, process_shift, group_size, \ - data_buffer, base_block_size, count, tag, comm, send_requests, \ - num_pending_sends, completed) \ -do { \ - NARRAY_SCATTER_NB(narray_node, process_shift, group_size, \ - data_buffer, base_block_size, count, tag, comm, send_requests, \ - num_pending_sends); \ - if (*num_pending_sends > 0) { \ - completed = mca_bcol_ptpcoll_test_all_for_match(num_pending_sends, send_requests, &rc); \ - if (OMPI_SUCCESS != rc) { \ - return OMPI_ERROR; \ - } \ - } else { \ - completed = 1; \ - } \ -} while (0) - -#define CHECK_IF_ROOT_OR_VROOT(module, i) \ - (module->pow_2 == module->ml_mem.ml_buf_desc[i].radix_mask_pow) - -/* inline functions */ -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( - mca_bcol_ptpcoll_module_t *ptpcoll_module, - void *data_buffer, int count, int tag, - int extra_peer, ompi_communicator_t *comm, - int *active_requests, ompi_request_t **requests) -{ - int rc = OMPI_SUCCESS; - int completed = 0; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - /* tag is -1 already */ - /* send the all data to your extra peer */ - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra to %d tag %d", - extra_peer, tag)); - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[extra_peer], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("PR Extra send was not completed")); - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_send_n_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, - void *data_buffer, int count, int tag, - int *extra_peers, int num_peers, int skip, - ompi_communicator_t *comm, - int *active_requests, ompi_request_t **requests) -{ - int rc = OMPI_SUCCESS; - int completed = 0; - int i; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - /* send the all data to your extra peer */ - for (i = 0; i < num_peers; i++) { - PTPCOLL_VERBOSE(10, ("send_n_extra to %d tag %d", - extra_peers[i], tag)); - if (extra_peers[i] == skip) { - PTPCOLL_VERBOSE(10, ("SKIP")); - continue; - } - - rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, - group_list[extra_peers[i]], tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - ++(*active_requests); - } - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("PR Extra send was not completed")); - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_binomial_gather_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, void *data_buffer, int count, int base_block_size) -{ - int rc; - int completed = 0; /* not completed */ - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int i; - int *iteration = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - void *curr_data_sbuffer = NULL, - *curr_data_rbuffer = NULL; - int radix_mask_pow = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow; - int delta; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_gather_anyroot %d %d %d", - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration, - ptpcoll_module->pow_2, - 1 << ptpcoll_module->pow_2)); - - /* we assume the iteration #iteration already was completed with probe */ - for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - i < ptpcoll_module->pow_2; i++) { - int pow2 = 1 << i; - int peer_index = my_group_index ^ pow2; - int comm_rank = group_list[peer_index]; - int slen, rlen, - send_offset, - recv_offset; - - if (i > radix_mask_pow) { - /* *active_requests = 0; */ - /* send - receive data from the peer */ - slen = rlen = pow2 * base_block_size; - send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); - recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); - curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset); - curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset); - - delta = count - recv_offset; - if (delta > 0) { - if (delta < rlen) { - /* recv the tail */ - rlen = delta; - } - PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d", - pow2, - 1 << 
ptpcoll_module->pow_2, - curr_data_rbuffer, - recv_offset, - rlen, - comm_rank)); - rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE, - comm_rank, tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - - delta = count - send_offset; - if (delta > 0) { - if (delta < slen) { - /* recv the tail */ - slen = delta; - } - PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d", - pow2, - 1 << ptpcoll_module->pow_2, - curr_data_sbuffer, - send_offset, - slen, - comm_rank)); - rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE, - comm_rank, tag, - MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - - if (*active_requests > 0) { - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - *iteration = i; - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } - } else if (i == radix_mask_pow) { - /* only receive data */ - rlen = pow2 * base_block_size; - recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i)); - curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset); - delta = count - recv_offset; - if (0 >= delta) { - /* we have nothing to send, skip the iteration */ - continue; - } - if (delta < rlen) { - /* recv the tail */ - rlen = delta; - } - /* receive data from the peer */ - PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d", - pow2, - 1 << ptpcoll_module->pow_2, - curr_data_rbuffer, - recv_offset, - rlen, - comm_rank)); - rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE, - comm_rank, tag, comm, &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - ++(*active_requests); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - *iteration = i; - PTPCOLL_VERBOSE(10, ("Recv was not completed")); - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - PTPCOLL_VERBOSE(10, ("Recv was completed")); - } else if (i < radix_mask_pow) { - /* Only send data */ - slen = pow2 * base_block_size; - send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i)); - curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset); - delta = count - send_offset; - if (0 >= delta) { - /* we have nothing to send, skip the iteration */ - continue; - } - if (delta < slen) { - slen = delta; - } - PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d", - pow2, - 1 << ptpcoll_module->pow_2, - curr_data_sbuffer, - send_offset, - slen, - comm_rank)); - rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE, - comm_rank, tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - *iteration = i; - /* we have to store the iteration number somewhere */ - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - } - } - - return BCOL_FN_COMPLETE; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, void *data_buffer, int count, int base_block_size) -{ - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int rc; - int completed = 0; /* not completed */ - int comm_root; - int i; - int *radix_mask_pow = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow); - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_status_public_t status; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int pow2_group_size = ptpcoll_module->pow_2num; - int pow2_distance; - int my_left_boundary_rank; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_root_index = 0; - void *curr_data_buffer = NULL; - int tag = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - int recv_count = 0; - int *coll_status = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status; - - assert(0 == *active_requests); - - PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot")); - for (i = 0; i < cm->num_to_probe && - 0 == completed; i++) { - MCA_PML_CALL(iprobe(MPI_ANY_SOURCE, tag, - comm, &completed, &status)); - PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d", - tag)); - } - - /* the function always returns OMPI_SUCCESS, so we don't check return code */ - if (0 == completed) { - PTPCOLL_VERBOSE(10, ("IPROBE was not matched")); - /* No data was received, return no match error */ - return BCOL_FN_NOT_STARTED; - } - - comm_root = status.MPI_SOURCE; - - - PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is %d", comm_root)); - - /* For proxy we have to check if we got something from extra node */ - if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) { - if (group_list[ptpcoll_module->proxy_extra_index] == comm_root) { - PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is extra node %d", - comm_root)); - /* scatter the data among other peer in the pow2 group */ - *radix_mask_pow = ptpcoll_module->pow_2; - - pow2_distance = ptpcoll_module->pow_2 - 1; - curr_data_buffer = data_buffer; - recv_count = count; - goto PR_SCATTHER; - } - } - - /* Find group index for communicator root of the data */ - group_root_index = get_group_index_and_distance_for_binomial - (my_group_index, comm_root, pow2_group_size, group_list, &pow2_distance); - if (OPAL_UNLIKELY(group_root_index < 0)) { - PTPCOLL_ERROR(("Fatal error, no group root index found, my id %d, pow2_g_size %d comm_root %d", - my_group_index, pow2_group_size, comm_root)); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("Group root index is %d distance is %d", - group_root_index, pow2_distance)); - - /* Use group_root_index to calculate the */ - - /* Post receive that will fetch the data */ - /* Pasha: Who is packing data ? - Should I assume that we get contiguous buffer ? - Or should I pack by myself - =================================================================================================== - === On this stage I assume that data is contiguous. 
So I use MPI_BYTE datatype and COUNT = size === - =================================================================================================== - */ - - recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */ - - my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance ); - - curr_data_buffer = (void *)((unsigned char *)data_buffer + - (size_t) base_block_size * my_left_boundary_rank); - - *radix_mask_pow = pow2_distance; - - pow2_distance--; - -PR_SCATTHER: - PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], " - "recv_count %d, tag %d, addr %p, offset %d, pow2_distace %d", - comm_root, group_root_index, recv_count, - tag, curr_data_buffer, - my_group_index * base_block_size, pow2_distance)); - - rc = MCA_PML_CALL(recv(curr_data_buffer, recv_count, MPI_BYTE, - comm_root, tag, comm, MPI_STATUS_IGNORE)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("Bcast, Data was received")); - - /* Sending forward the data over K-nomial tree */ - *coll_status = PTPCOLL_SCATTER_STARTED; - K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER( - pow2_distance, - my_group_index, group_size, group_list, - data_buffer, base_block_size, - count, tag, comm, requests, - active_requests); - - /* Since the next step (gather) does not really require - completion on scatter , we may return complete */ - return BCOL_FN_COMPLETE; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_binomial_root_to_src(int group_root, int my_rank, - int pow2_size, int group_size, int *distance) -{ - int root, relative_rank, src, - pow2_distance = 0, i; - - if (group_root < pow2_size) { - root = group_root; - } else { - /* the source of the data is extra node, - the real root it represented by some rank from - pow2 group */ - root = group_root - pow2_size; - /* shortcut for the case when my rank is root for the group */ - if (my_rank == root) { - *distance = -1; - return group_root; - } - } - - relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size : - my_rank - root; - - for (i = 1; i < pow2_size; i<<=1, pow2_distance++) { - if (relative_rank & i) { - src = my_rank ^ i; - if (src >= pow2_size) - src -= pow2_size; - - *distance = pow2_distance; - return src; - } - } - - /* error case */ - *distance = -1; - return -1; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, void *data_buffer, int count, int base_block_size) -{ - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int rc; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int tmp_radix_mask_pow = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow - 1; - int tag = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - int *status = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status; - - PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot")); - - if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests, - requests, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - PTPCOLL_VERBOSE(10, ("Bcast, Data was received")); - - /* Sending forward the data over binimial nomial tree */ - *status = PTPCOLL_SCATTER_STARTED; - K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER( - tmp_radix_mask_pow, - my_group_index, group_size, group_list, - data_buffer, base_block_size, - count, tag, comm, requests, - active_requests); - - - return BCOL_FN_COMPLETE; -} - -#define NARRAY_BLOCK_SIZE(size, module, level_size) \ - ((size + (module)->full_narray_tree_num_leafs - 1) / \ - (module)->full_narray_tree_num_leafs) * \ - ((module)->full_narray_tree_num_leafs / \ - ((0 == level_size) ? \ - mca_bcol_ptpcoll_component.narray_knomial_radix : \ - level_size)) - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module, - int buffer_index, void *data_buffer, int count, int process_shift, - int relative_group_index) -{ - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int rc; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; - int *status = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status; - int scatter_count = 0; - int offset = 0; - int base_block_size = 0; - void *curr_data_buffer = NULL; - - PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_narray_test_and_scatter_known_root")); - - if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests, - requests, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - /* Sending forward the data over binimial nomial tree */ - *status = PTPCOLL_SCATTER_STARTED; - if(0 == relative_group_index) { - scatter_count = count; - } else { - scatter_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module, - ptpcoll_module->narray_knomial_node[relative_group_index].level_size); - } - - offset = scatter_count * - ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level; - - /* make sure that we do not overun memory */ - if (OPAL_UNLIKELY(offset + scatter_count > count)) { - scatter_count = count - offset; - } - - PTPCOLL_VERBOSE(10, ("Bcast, Data was received %d %d %d", - scatter_count, - ptpcoll_module->narray_knomial_node[relative_group_index].level_size, - ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level)); - - - curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset); - - /* calculating scatter block size for next level of tree */ - base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module, - ptpcoll_module->narray_knomial_node[relative_group_index].level_size * - mca_bcol_ptpcoll_component.narray_knomial_radix); - - PTPCOLL_VERBOSE(10, ("scatter_known_rootaaa %d %d %d %d %d",scatter_count, offset, base_block_size, - ptpcoll_module->narray_knomial_node[relative_group_index].level_size /mca_bcol_ptpcoll_component.narray_knomial_radix, - ptpcoll_module->full_narray_tree_num_leafs)); - - NARRAY_SCATTER_NB((&ptpcoll_module->narray_knomial_node[relative_group_index]), - process_shift, ptpcoll_module->full_narray_tree_size, - curr_data_buffer, base_block_size, scatter_count, tag, comm, - requests, active_requests); - - /* Bummer, I tried to prevent this, special case for virtual root */ - if(0 == relative_group_index) { - if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests, - requests, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - *status = PTPCOLL_ROOT_SEND_STARTED; - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -static inline __opal_attribute_always_inline__ -int bcol_ptpcoll_bcast_narray_knomial_gather(mca_bcol_ptpcoll_module_t *ptpcoll_module, - const int buffer_index, void *data_buffer, const int count, - const int relative_group_index) -{ - int completed = 0; /* not completed */ - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int blocks_in_step = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask; - int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - int group_size = ptpcoll_module->full_narray_tree_size; - int i, k, - rc, - len, slen, rlen, - peer, group_peer; - size_t s_offset, - r_offset; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **requests = - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; - netpatterns_narray_knomial_tree_node_t *narray_node = - &ptpcoll_module->narray_knomial_node[relative_group_index]; - netpatterns_k_exchange_node_t *k_node = - &narray_node->k_node; - mca_bcol_ptpcoll_component_t *cm = - &mca_bcol_ptpcoll_component; - size_t base_block_size = - NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size); - - PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_narray_knomial_gather %d %d %d %d %d %d %d", - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration, - base_block_size, count, narray_node->level_size, - relative_group_index, k_node->n_exchanges, tag)); - - /* we assume the iteration #iteration already was completed with probe */ - for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; - i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) { - - len = base_block_size * blocks_in_step; - - for (k = 0; k < cm->narray_knomial_radix - 1; k++) { - group_peer = my_group_index + - (k_node->rank_exchanges[i][k] - narray_node->rank_on_level); - if (group_peer >= group_size) { - group_peer -= group_size; - } else if (group_peer < 0) { - group_peer += group_size; - } - peer = group_list[group_peer]; - - r_offset = (size_t)k_node->rank_exchanges[i][k] / blocks_in_step * - len; - - /* check that we do not run out of message boundary */ - if (OPAL_UNLIKELY(r_offset + len > (size_t)count)) { - rlen = count - r_offset; - if (OPAL_UNLIKELY(rlen <= 0)) { - continue; - } - } else { - rlen = len; - } - PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p offset %d len %d %d %d tag %d", - peer, data_buffer, r_offset, rlen, len, blocks_in_step, tag)); - rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + r_offset), - rlen, MPI_BYTE, - peer, tag, comm, &requests[*active_requests])); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to receive data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - - for (k = 0; k < cm->narray_knomial_radix - 1; k++) { - group_peer = my_group_index + - (k_node->rank_exchanges[i][k] - narray_node->rank_on_level); - if (group_peer >= group_size) { - group_peer -= group_size; - } else if (group_peer < 0) { - group_peer += group_size; - } - peer = group_list[group_peer]; - - s_offset = (size_t)narray_node->rank_on_level / blocks_in_step * - len; - - /* check that we do not run out of message boundary */ - if (OPAL_UNLIKELY(s_offset + len > (size_t)count)) { - slen = count - s_offset; - if (OPAL_UNLIKELY(slen <= 0)) { - continue; - } - } 
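/* the block lies fully inside the message boundary, send the full length */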
else { - slen = len; - } - - PTPCOLL_VERBOSE(10, ("Send data from %d, addr %p offset %d len %d %d %d tag %d", - peer, data_buffer, s_offset, slen, len, blocks_in_step, tag)); - rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + s_offset), - slen, MPI_BYTE, - peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, - &(requests[*active_requests]))); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - ++(*active_requests); - } - - completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); - if (0 == completed) { - /* cache data for next iteration */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration = - i; /* why not to store step for next iteration ?! */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = - blocks_in_step * cm->narray_knomial_radix; - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - } - - return BCOL_FN_COMPLETE; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c deleted file mode 100644 index 9f2107882d..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c +++ /dev/null @@ -1,174 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_ptpcoll.h" -#include "ompi/mca/bcol/base/base.h" - -#include "bcol_ptpcoll_mca.h" -#include "bcol_ptpcoll_utils.h" - -/* - * Public string showing the bcol ptpcoll V2 component version number - */ -const char *mca_bcol_ptpcoll_component_version_string = - "Open MPI bcol - ptpcoll collective MCA component version " OMPI_VERSION; - - -/* - * Local functions - */ - -static int ptpcoll_open(void); -static int ptpcoll_close(void); - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component = { - - /* First, fill in the super */ - - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .bcol_version = { - MCA_BCOL_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "ptpcoll", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - - .mca_open_component = ptpcoll_open, - .mca_close_component = ptpcoll_close, - .mca_register_component_params = mca_bcol_ptpcoll_register_mca_params, - }, - - /* Initialization / querying functions */ - - .collm_init_query = mca_bcol_ptpcoll_init_query, - .collm_comm_query = mca_bcol_ptpcoll_comm_query, - .init_done = false, - .need_ordering = false, - }, - - /* component specific */ - -}; - -static void -collreq_construct(mca_bcol_ptpcoll_collreq_t *collreq) -{ - collreq->requests = NULL; -} - -static void -collreq_destruct(mca_bcol_ptpcoll_collreq_t *collreq) -{ - if (NULL != collreq->requests) { - free(collreq->requests); - } -} - -OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_collreq_t, - 
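The send and receive paths of the gather above share one piece of offset arithmetic: each rank owns a block of base_block_size bytes, blocks are exchanged blocks_in_step at a time, and the final block is clamped to the message boundary. A minimal sketch of that computation, with a hypothetical helper name that is not part of this component:

/* Sketch: byte offset and clamped length of the blocks exchanged at one
 * narray-knomial gather iteration. */
static inline int knomial_gather_block(size_t base_block_size, int blocks_in_step,
                                       int block_owner, size_t total_len,
                                       size_t *offset, size_t *len)
{
    size_t step_len = base_block_size * blocks_in_step;

    /* owners are grouped blocks_in_step at a time; locate the group */
    *offset = (size_t) block_owner / blocks_in_step * step_len;
    if (*offset + step_len > total_len) {
        if (*offset >= total_len) {
            return 0;                 /* block lies entirely past the end */
        }
        *len = total_len - *offset;   /* clamp the final, partial block   */
    } else {
        *len = step_len;
    }
    return 1;                         /* there is data to send/receive    */
}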
opal_free_list_item_t, - collreq_construct, - collreq_destruct); - -/* - * Open the component - */ -static int ptpcoll_open(void) -{ - return OMPI_SUCCESS; -} - -/* - * Close the component - */ -static int ptpcoll_close(void) -{ - return OMPI_SUCCESS; -} - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_bcol_ptpcoll_init_query(bool enable_progress_threads, - bool enable_mpi_threads) -{ - /* at this stage there is no reason to disqualify this component */ - - /* done */ - return OMPI_SUCCESS; -} - -/* memory management routines */ - -/* allocate memory - this is a no-op function intended to work with - * mpool2, which will use malloc for allocation, if no other allocator - * is available. - */ -void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment, - struct mca_bcol_base_module_t *bcol_module) -{ - /* do nothing */ - return NULL; -} - -/* - * register memory - nothing to do - */ -int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment, - struct mca_bcol_base_module_t *bcol_module) -{ - /* nothing to do */ - return OMPI_SUCCESS; -} - -/* deregister memory - nothing to do - */ -int bcol_ptpcoll_deregister_memory( void * in_ptr, - struct mca_bcol_base_module_t *bcol_module) -{ - /* nothing to do */ - return OMPI_SUCCESS; -} - -/* free memory - since we don't allocate, we also don't free */ -int bcol_ptpcoll_free_memory(void *ptr, - struct mca_bcol_base_module_t *bcol_module) -{ - /* nothing to do */ - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanin.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanin.c deleted file mode 100644 index 57dafce7bd..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanin.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h" - -/* - * Fanin routines - no user data - */ - -int bcol_ptpcoll_fanin( bcol_function_args_t *input_args, - struct mca_bcol_base_module_t *module) -{ - /* local variable */ - int ret=OMPI_SUCCESS; - /* mca_bcol_ptpcoll_module_t *ptp_module=(mca_bcol_ptpcoll_module_t *) module; */ - - /* done */ - return ret; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanout.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanout.c deleted file mode 100644 index ae5739391b..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_fanout.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h" - -/* - * Fanin routines - no user data - */ - -int bcol_ptpcoll_fanout( bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - /* local variable */ - int ret = OMPI_SUCCESS; - /* TBD: - mca_bcol_ptpcoll_module_t *ptp_module=(mca_bcol_ptpcoll_module_t *) const_args->bcol_module; - */ - - /* done */ - return ret; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c deleted file mode 100644 index 57caf7c110..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c +++ /dev/null @@ -1,197 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include -#include -#include -#include - -#include "bcol_ptpcoll_mca.h" -#include "bcol_ptpcoll.h" - -/* - * Local flags - */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - REGINT_MAX = 0x88 -}; - -enum { - REGSTR_EMPTY_OK = 0x01, - - REGSTR_MAX = 0x88 -}; - -#if 0 /* Pasha: we will be need this function in future */ -/* - * utility routine for string parameter registration - */ -static int reg_string(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - const char* default_value, char **storage, - int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} -#endif - -/* - * utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OMPI_SUCCESS; - } - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -static int reg_bool(const char* param_name, - const char* 
deprecated_param_name, - const char* param_desc, - bool default_value, bool *storage) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (0 > index) { - return index; - } - - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - return OMPI_SUCCESS; -} - -int mca_bcol_ptpcoll_register_mca_params(void) -{ - int ret, tmp; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - ret = OMPI_SUCCESS; -#define CHECK(expr) do {\ - tmp = (expr); \ - if (OMPI_SUCCESS != tmp) ret = tmp; \ - } while (0) - - CHECK(reg_int("priority", NULL, - "PTPCOLL component priority " - "(from 0 (low) to 90 (high))", 90, &cm->super.priority, 0)); - - CHECK(reg_int("verbose", NULL, - "Output some verbose PTPCOLL information " - "(0 = no output, nonzero = output)", 0, &cm->verbose, REGINT_GE_ZERO)); - - CHECK(reg_int("k_nomial_radix", NULL, - "The radix of the K-nomial tree " - "(starts from 2)", 2, &cm->k_nomial_radix, REGINT_GE_ONE)); - - CHECK(reg_int("narray_radix", NULL, - "The radix of the N-array tree " - "(starts from 2)", 2, &cm->narray_radix, REGINT_GE_ONE)); - - CHECK(reg_int("narray_knomial_radix", NULL, - "The radix of the N-array/K-nomial tree for scatter-gather type algorithms " - "(starts from 2)", 2, &cm->narray_knomial_radix, REGINT_GE_ONE)); - - CHECK(reg_int("num_to_probe", NULL, - "Number of probe operations in a single source data check " - "(starts from 8)", 8, &cm->num_to_probe, REGINT_GE_ONE)); - - CHECK(reg_int("bcast_small_msg_known_root_alg", NULL, - "Algorithm selection for bcast of small messages with known root " - "(1 - K-nomial, 2 - N-array)", 1, &cm->bcast_small_messages_known_root_alg, - REGINT_GE_ZERO)); - - CHECK(reg_int("bcast_large_msg_known_root_alg", NULL, - "Algorithm selection for bcast of large messages with known root " - "(1 - Binomial scatter-gather, 2 - N-array scatter, K-nomial gather)", - 1, &cm->bcast_large_messages_known_root_alg, REGINT_GE_ZERO)); - - CHECK(reg_int("barrier_alg", NULL, - "Algorithm selection for Barrier " - "(1 - Recursive doubling, 2 - Recursive K-ing)", - 1, &cm->barrier_alg, REGINT_GE_ZERO)); - - /* register parameters controlling message fragmentation */ - CHECK(reg_int("min_frag_size", NULL, - "Minimum fragment size", - getpagesize(), &cm->super.min_frag_size, REGINT_GE_ONE)); - - CHECK(reg_int("max_frag_size", NULL, - "Maximum fragment size", - FRAG_SIZE_NO_LIMIT, &cm->super.max_frag_size, REGINT_NONZERO)); - - CHECK(reg_bool("can_use_user_buffers", NULL, - "User memory can be used by the collective algorithms", - 1, &cm->super.can_use_user_buffers)); - - return ret; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.h deleted file mode 100644 index 4d1067d9e4..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
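The CHECK wrapper in mca_bcol_ptpcoll_register_mca_params above deliberately keeps going after a failure: every parameter is still registered, and only the most recent error is reported at the end. A condensed illustration of the same pattern; the parameter names and the CHECK_PARAM macro here are made up:

static int radix_value, verbose_value;

static int register_params_sketch(void)
{
    int ret = OMPI_SUCCESS, tmp;
#define CHECK_PARAM(expr) do { tmp = (expr); if (OMPI_SUCCESS != tmp) ret = tmp; } while (0)
    CHECK_PARAM(reg_int("example_radix", NULL, "tree radix (>= 1)", 2,
                        &radix_value, REGINT_GE_ONE));
    CHECK_PARAM(reg_int("example_verbose", NULL, "verbosity (>= 0)", 0,
                        &verbose_value, REGINT_GE_ZERO));
#undef CHECK_PARAM
    return ret;   /* OMPI_SUCCESS only if every registration succeeded */
}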
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#ifndef MCA_BCOL_PTPCOLL_MCA_H -#define MCA_BCOL_PTPCOLL_MCA_H - -#include "ompi_config.h" - -BEGIN_C_DECLS - -int mca_bcol_ptpcoll_register_mca_params(void); - -END_C_DECLS -#endif diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c deleted file mode 100644 index ca8c32ec8d..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c +++ /dev/null @@ -1,760 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "opal/util/show_help.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/pml/pml.h" /* need this for the max tag size */ - -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" -#include "bcol_ptpcoll_bcast.h" -#include "bcol_ptpcoll_allreduce.h" -#include "bcol_ptpcoll_reduce.h" - -#define BCOL_PTP_CACHE_LINE_SIZE 128 - -/* - * Local functions - */ -static int alloc_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int rc = OMPI_SUCCESS, i = 0; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int n_exchanges = k_node->n_exchanges; - - /* Precalculate the allreduce offsets */ - if (0 < k_node->n_exchanges) { - ptpcoll_module->allgather_offsets = (int **) calloc (n_exchanges, sizeof(int *)); - - if (!ptpcoll_module->allgather_offsets) { - return OMPI_ERROR; - } - - for (i = 0; i < n_exchanges ; i++) { - ptpcoll_module->allgather_offsets[i] = (int *) calloc (NOFFSETS, sizeof(int)); - - if (!ptpcoll_module->allgather_offsets[i]){ - return OMPI_ERROR; - } - } - } - - return rc; -} - -static int free_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int rc = OMPI_SUCCESS, i = 0; - netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; - int n_exchanges = k_node->n_exchanges; - - if (ptpcoll_module->allgather_offsets) { - for (i=0; i < n_exchanges; i++) { - free (ptpcoll_module->allgather_offsets[i]); - } - } - - free(ptpcoll_module->allgather_offsets); - ptpcoll_module->allgather_offsets = NULL; - return rc; -} - -static void -mca_bcol_ptpcoll_module_construct(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - uint64_t i; - /* Pointer to component */ - ptpcoll_module->narray_node = NULL; - ptpcoll_module->allgather_offsets = NULL; - ptpcoll_module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_ptpcoll_component; - ptpcoll_module->super.list_n_connected = NULL; - ptpcoll_module->super.hier_scather_offset = 0; - /* no header support in ptp */ - ptpcoll_module->super.header_size = 0; - /* No network context */ - ptpcoll_module->super.network_context = NULL; - /* set the upper limit on the tag */ - i = 2; - ptpcoll_module->tag_mask = 1; - while ( i <= (uint64_t) mca_pml.pml_max_tag && i > 0) { - i <<= 1; - } - ptpcoll_module->ml_mem.ml_buf_desc = NULL; - 
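The loop just above doubles i until it passes the PML's maximum tag (the i > 0 test guards against 64-bit wraparound), and the next statement turns the result into a bit mask. When pml_max_tag is one below a power of two, which is the usual case, the mask equals pml_max_tag exactly. A worked trace under that assumption:

/* Assume mca_pml.pml_max_tag == 0x7fffffff (2^31 - 1). */
uint64_t i = 2;
while (i <= UINT64_C(0x7fffffff) && i > 0) {
    i <<= 1;                   /* 4, 8, ..., 2^31; exits once i > max tag */
}
uint64_t tag_mask = i - 1;     /* 0x7fffffff: (offset + sn * factor) is   */
                               /* folded into the legal tag range, then   */
                               /* negated to mark it as an internal tag   */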
ptpcoll_module->tag_mask = i - 1; -} - -static void -mca_bcol_ptpcoll_module_destruct(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int i; - mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem; - - if (NULL != ml_mem->ml_buf_desc) { - /* Release the memory structs that were cache ML memory data */ - uint32_t i, j, ci; - for (i = 0; i < ml_mem->num_banks; i++) { - for (j = 0; j < ml_mem->num_buffers_per_bank; j++) { - ci = i * ml_mem->num_buffers_per_bank + j; - if (NULL != ml_mem->ml_buf_desc[ci].requests) { - free(ml_mem->ml_buf_desc[ci].requests); - } - } - } - /* release the buffer descriptor */ - free(ml_mem->ml_buf_desc); - ml_mem->ml_buf_desc = NULL; - } - - if (NULL != ptpcoll_module->allgather_offsets) { - free_allreduce_offsets_array(ptpcoll_module); - } - - if (NULL != ptpcoll_module->narray_node) { - for (i = 0; i < ptpcoll_module->group_size; i++) { - if (NULL != ptpcoll_module->narray_node[i].children_ranks) { - free(ptpcoll_module->narray_node[i].children_ranks); - } - } - - free(ptpcoll_module->narray_node); - ptpcoll_module->narray_node = NULL; - } - - OBJ_DESTRUCT(&ptpcoll_module->collreqs_free); - - if (NULL != ptpcoll_module->super.list_n_connected) { - free(ptpcoll_module->super.list_n_connected); - ptpcoll_module->super.list_n_connected = NULL; - } - - for (i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++){ - OPAL_LIST_DESTRUCT((&ptpcoll_module->super.bcol_fns_table[i])); - } - - - if (NULL != ptpcoll_module->kn_proxy_extra_index) { - free(ptpcoll_module->kn_proxy_extra_index); - ptpcoll_module->kn_proxy_extra_index = NULL; - } - - if (NULL != ptpcoll_module->alltoall_iovec) { - free(ptpcoll_module->alltoall_iovec); - ptpcoll_module->alltoall_iovec = NULL; - } - - if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) { - free(ptpcoll_module->narray_knomial_proxy_extra_index); - ptpcoll_module->narray_knomial_proxy_extra_index = NULL; - } - - if (NULL != ptpcoll_module->narray_knomial_node) { - for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) { - netpatterns_cleanup_narray_knomial_tree (ptpcoll_module->narray_knomial_node + i); - } - free(ptpcoll_module->narray_knomial_node); - ptpcoll_module->narray_knomial_node = NULL; - } - - netpatterns_cleanup_recursive_knomial_allgather_tree_node(&ptpcoll_module->knomial_allgather_tree); - netpatterns_cleanup_recursive_knomial_tree_node(&ptpcoll_module->knomial_exchange_tree); - -} - -OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_module_t, - mca_bcol_base_module_t, - mca_bcol_ptpcoll_module_construct, - mca_bcol_ptpcoll_module_destruct); - -static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k) -{ - uint32_t i, j, ci; - mca_bcol_ptpcoll_ml_buffer_desc_t *tmp_desc = NULL; - int k_nomial_radix = mca_bcol_ptpcoll_component.k_nomial_radix; - int pow_k_val = (0 == pow_k) ? 1 : pow_k; - int num_to_alloc = - ((k_nomial_radix - 1) * pow_k_val * 2 + 1 > mca_bcol_ptpcoll_component.narray_radix) ? 
- (k_nomial_radix - 1) * pow_k_val * 2 + 1 : - mca_bcol_ptpcoll_component.narray_radix * 2; - - - *desc = (mca_bcol_ptpcoll_ml_buffer_desc_t *)calloc(num_banks * num_buffers_per_bank, - sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t)); - if (NULL == *desc) { - PTPCOLL_ERROR(("Failed to allocate memory")); - return OMPI_ERROR; - } - - tmp_desc = *desc; - - for (i = 0; i < num_banks; i++) { - for (j = 0; j < num_buffers_per_bank; j++) { - ci = i * num_buffers_per_bank + j; - tmp_desc[ci].bank_index = i; - tmp_desc[ci].buffer_index = j; - /* *2 is for gather session +1 for extra peer */ - tmp_desc[ci].requests = (ompi_request_t **) - calloc(num_to_alloc, sizeof(ompi_request_t *)); - if (NULL == tmp_desc[ci].requests) { - PTPCOLL_ERROR(("Failed to allocate memory for requests")); - return OMPI_ERROR; - } - /* - * ptpcoll don't have any header, but other bcols may to have. So - * we need to take it in account. - */ - tmp_desc[ci].data_addr = (void *) - ((unsigned char*)base_addr + ci * size_buffer + header_size); - PTPCOLL_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr)); - - /* init reduce implementation flags */ - tmp_desc[ci].reduce_init_called = false; - tmp_desc[ci].reduction_status = 0; - } - } - - return OMPI_SUCCESS; -} - -static void mca_bcol_ptpcoll_set_small_msg_thresholds(struct mca_bcol_base_module_t *super) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) super; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - /* Subtract out the maximum header size when calculating the thresholds. This - * will account for the headers used by the basesmuma component. If we do not - * take these headers into account we may overrun our buffer. */ - - /* Set the Allgather threshold equals to a ML buff size */ - super->small_message_thresholds[BCOL_ALLGATHER] = - (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / - ompi_comm_size(ptpcoll_module->super.sbgp_partner_module->group_comm); - - /* Set the Bcast threshold, all Bcast algths have the same threshold */ - super->small_message_thresholds[BCOL_BCAST] = - (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX); - - /* Set the Alltoall threshold, the Ring algth sets some limitation */ - super->small_message_thresholds[BCOL_ALLTOALL] = - (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / 2; - - /* Set the Allreduce threshold, NARRAY algth sets some limitation */ - super->small_message_thresholds[BCOL_ALLREDUCE] = - (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / ptpcoll_module->k_nomial_radix; - - /* Set the Reduce threshold, NARRAY algth sets some limitation */ - super->small_message_thresholds[BCOL_REDUCE] = - (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / cm->narray_radix; -} - -/* - * Cache information about ML memory - */ -static int mca_bcol_ptpcoll_cache_ml_memory_info(struct mca_bcol_base_memory_block_desc_t *payload_block, - uint32_t data_offset, - struct mca_bcol_base_module_t *bcol, - void *reg_data) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) bcol; - mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem; - struct mca_bcol_base_memory_block_desc_t *desc = payload_block; - int group_size = ptpcoll_module->super.sbgp_partner_module->group_size; - - PTPCOLL_VERBOSE(10, ("mca_bcol_ptpcoll_init_buffer_memory was called")); - - /* cache ml mem desc tunings localy */ - ml_mem->num_banks = desc->num_banks; - ml_mem->num_buffers_per_bank = desc->num_buffers_per_bank; - ml_mem->size_buffer = 
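Each threshold assignment above reserves BCOL_HEADER_MAX bytes of the ML payload buffer for headers that other bcol components may add, then divides the remainder by the number of pieces the algorithm packs into one buffer. With purely illustrative, assumed numbers:

/* Assume size_buffer = 65536, BCOL_HEADER_MAX = 128, 16 ranks, radix 2. */
size_t usable           = 65536 - 128;   /* 65408 usable payload bytes   */
size_t allgather_thresh = usable / 16;   /* one slot per rank            */
size_t bcast_thresh     = usable;        /* bcast fills the whole buffer */
size_t alltoall_thresh  = usable / 2;    /* the ring keeps two halves    */
size_t allreduce_thresh = usable / 2;    /* one slot per k-nomial radix  */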
desc->size_buffer; - - PTPCOLL_VERBOSE(10, ("ML buffer configuration num banks %d num_per_bank %d size %d base addr %p", - desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, desc->block->base_addr)); - - /* Set first bank index for release */ - ml_mem->bank_index_for_release = 0; - - if (OMPI_SUCCESS != init_ml_buf_desc(&ml_mem->ml_buf_desc, - desc->block->base_addr, - ml_mem->num_banks, - ml_mem->num_buffers_per_bank, - ml_mem->size_buffer, - data_offset, - group_size, - ptpcoll_module->pow_k)) { - PTPCOLL_VERBOSE(10, ("Failed to allocate rdma memory descriptor\n")); - return OMPI_ERROR; - } - - PTPCOLL_VERBOSE(10, ("ptpcoll_module = %p, ml_mem_desc = %p.\n", - ptpcoll_module)); - - return OMPI_SUCCESS; -} - -/* - * Load ptpcoll bcol functions - */ -static void load_func(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int fnc; - - /* reset everything to NULL */ - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - - /*ptpcoll_module->super.bcol_function_table[fnc] = NULL;*/ - ptpcoll_module->super.bcol_function_table[fnc] = NULL; - ptpcoll_module->super.bcol_function_init_table[fnc] = NULL; - } - - ptpcoll_module->super.bcol_function_init_table[BCOL_BARRIER] = bcol_ptpcoll_barrier_init; - - ptpcoll_module->super.bcol_function_init_table[BCOL_BCAST] = bcol_ptpcoll_bcast_init; - ptpcoll_module->super.bcol_function_init_table[BCOL_ALLREDUCE] = bcol_ptpcoll_allreduce_init; - ptpcoll_module->super.bcol_function_init_table[BCOL_ALLGATHER] = bcol_ptpcoll_allgather_init; - ptpcoll_module->super.bcol_function_table[BCOL_BCAST] = bcol_ptpcoll_bcast_k_nomial_anyroot; - ptpcoll_module->super.bcol_function_init_table[BCOL_ALLTOALL] = NULL; - ptpcoll_module->super.bcol_function_init_table[BCOL_SYNC] = mca_bcol_ptpcoll_memsync_init; - ptpcoll_module->super.bcol_function_init_table[BCOL_REDUCE] = bcol_ptpcoll_reduce_init; - - /* ML memory cacher */ - ptpcoll_module->super.bcol_memory_init = mca_bcol_ptpcoll_cache_ml_memory_info; - - /* Set thresholds */ - ptpcoll_module->super.set_small_msg_thresholds = mca_bcol_ptpcoll_set_small_msg_thresholds; - - /* setup recursive k-ing tree */ - ptpcoll_module->super.k_nomial_tree = mca_bcol_ptpcoll_setup_knomial_tree; -} - -int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super) -{ - mca_bcol_ptpcoll_module_t *p2p_module = (mca_bcol_ptpcoll_module_t *) super; - int rc = 0; - - rc = netpatterns_setup_recursive_knomial_allgather_tree_node( - p2p_module->super.sbgp_partner_module->group_size, - p2p_module->super.sbgp_partner_module->my_index, - mca_bcol_ptpcoll_component.k_nomial_radix, - super->list_n_connected, - &p2p_module->knomial_allgather_tree); - - return rc; -} - -/* The function used to calculate size */ -static int calc_full_tree_size(int radix, int group_size, int *num_leafs) -{ - int level_cnt = 1; - int total_cnt = 0; - - while( total_cnt < group_size ) { - total_cnt += level_cnt; - level_cnt *= radix; - } - - if (total_cnt > group_size) { - *num_leafs = level_cnt / radix; - return total_cnt - level_cnt / radix; - } else { - *num_leafs = level_cnt; - return group_size; - } -} - -/* Setup N-array scatter Knomial-gather static information */ -static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int rc, i, peer; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - ptpcoll_module->full_narray_tree_size = calc_full_tree_size( - cm->narray_knomial_radix, - ptpcoll_module->group_size, - &ptpcoll_module->full_narray_tree_num_leafs); - - ptpcoll_module->narray_knomial_proxy_extra_index 
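calc_full_tree_size above accumulates complete levels until the running total covers the group, then backs off to the largest complete radix-ary tree. Tracing the arithmetic for radix 3 and a group of 10:

int leafs = 0;
int size  = calc_full_tree_size(3, 10, &leafs);
/* level_cnt: 1 -> 3 -> 9 -> 27; total_cnt: 1 -> 4 -> 13 (>= 10, loop ends).
 * Since 13 > 10: leafs = 27 / 3 = 9 (the width of the level that was cut)
 * and size = 13 - 9 = 4, the largest complete 3-ary tree that fits.
 * The remaining 10 - 4 = 6 ranks attach to the tree as extras. */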
= (int *) - malloc(sizeof(int) * (cm->narray_knomial_radix)); - if (NULL == ptpcoll_module->narray_knomial_proxy_extra_index) { - PTPCOLL_ERROR(("Failed to allocate memory")); - goto Error; - } - - ptpcoll_module->narray_knomial_node = calloc( - ptpcoll_module->full_narray_tree_size, - sizeof(netpatterns_narray_knomial_tree_node_t)); - if(NULL == ptpcoll_module->narray_knomial_node) { - goto Error; - } - - PTPCOLL_VERBOSE(10 ,("My type is proxy, full tree size = %d [%d]", - ptpcoll_module->full_narray_tree_size, - cm->narray_knomial_radix - )); - - if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->full_narray_tree_size) { - if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) { - ptpcoll_module->narray_type = PTPCOLL_PROXY; - for (i = 0; i < cm->narray_knomial_radix; i++) { - peer = - ptpcoll_module->super.sbgp_partner_module->my_index * - cm->narray_knomial_radix + i + - ptpcoll_module->full_narray_tree_size; - if (peer >= ptpcoll_module->group_size) { - break; - } - ptpcoll_module->narray_knomial_proxy_extra_index[i] = peer; - } - ptpcoll_module->narray_knomial_proxy_num = i; - } else { - ptpcoll_module->narray_type = PTPCOLL_IN_GROUP;; - } - /* Setting node info */ - for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) { - rc = netpatterns_setup_narray_knomial_tree( - cm->narray_knomial_radix, - i, - ptpcoll_module->full_narray_tree_size, - &ptpcoll_module->narray_knomial_node[i]); - if(OMPI_SUCCESS != rc) { - goto Error; - } - } - } else { - ptpcoll_module->narray_type = PTPCOLL_EXTRA; - ptpcoll_module->narray_knomial_proxy_extra_index[0] = - (ptpcoll_module->super.sbgp_partner_module->my_index - - ptpcoll_module->full_narray_tree_size) / - cm->narray_knomial_radix; - } - - return OMPI_SUCCESS; - -Error: - if (NULL != ptpcoll_module->narray_knomial_node) { - free(ptpcoll_module->narray_knomial_node); - } - if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) { - free(ptpcoll_module->narray_knomial_proxy_extra_index); - } - return OMPI_ERROR; -} - -/* Setup N-array static information */ -static int load_narray_tree(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int rc, i; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - ptpcoll_module->narray_node = calloc(ptpcoll_module->group_size, - sizeof(netpatterns_tree_node_t)); - if(NULL == ptpcoll_module->narray_node ) { - goto Error; - } - - for(i = 0; i < ptpcoll_module->group_size; i++) { - rc = netpatterns_setup_narray_tree( - cm->narray_radix, - i, - ptpcoll_module->group_size, - &ptpcoll_module->narray_node[i]); - if(OMPI_SUCCESS != rc) { - goto Error; - } - } - - return OMPI_SUCCESS; - -Error: - if (NULL != ptpcoll_module->narray_node) { - free(ptpcoll_module->narray_node); - } - return OMPI_ERROR; -} - -static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int i; - mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; - - ptpcoll_module->k_nomial_radix = - cm->k_nomial_radix > ptpcoll_module->group_size ? 
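The adoption rule above is symmetric: an in-tree proxy p adopts the extra ranks p * radix + i + tree_size, and an extra rank recovers its proxy with the matching integer division. Continuing the tree-size 4, radix 3, group-of-10 example (illustrative numbers only):

/* proxy 0 adopts extras 0*3 + {0,1,2} + 4 = {4, 5, 6}
 * proxy 1 adopts extras 1*3 + {0,1,2} + 4 = {7, 8, 9}
 * and the inverse mapping, as used for PTPCOLL_EXTRA ranks: */
static inline int proxy_of_extra(int my_index, int tree_size, int radix)
{
    return (my_index - tree_size) / radix;   /* e.g. (8 - 4) / 3 == 1 */
}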
- ptpcoll_module->group_size : - cm->k_nomial_radix; - - ptpcoll_module->pow_k = pow_k_calc(ptpcoll_module->k_nomial_radix, - ptpcoll_module->group_size, - &ptpcoll_module->pow_knum); - - ptpcoll_module->kn_proxy_extra_index = (int *) - malloc(sizeof(int) * (ptpcoll_module->k_nomial_radix - 1)); - if (NULL == ptpcoll_module->kn_proxy_extra_index) { - PTPCOLL_ERROR(("Failed to allocate memory")); - goto Error; - } - - /* Setting peer type for the K-nomial algorithm */ - if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_knum ) { - if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->pow_knum) { - for (i = 0; - i < (ptpcoll_module->k_nomial_radix - 1) && - ptpcoll_module->super.sbgp_partner_module->my_index * - (ptpcoll_module->k_nomial_radix - 1) + - i + ptpcoll_module->pow_knum < ptpcoll_module->group_size - ; i++) { - ptpcoll_module->pow_ktype = PTPCOLL_KN_PROXY; - ptpcoll_module->kn_proxy_extra_index[i] = - ptpcoll_module->super.sbgp_partner_module->my_index * - (ptpcoll_module->k_nomial_radix - 1) + - i + ptpcoll_module->pow_knum; - PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_knum = %d [%d] my extra %d", - ptpcoll_module->pow_knum, - ptpcoll_module->pow_k, - ptpcoll_module->kn_proxy_extra_index[i])); - } - ptpcoll_module->kn_proxy_extra_num = i; - } else { - PTPCOLL_VERBOSE(10 ,("My type is in group, pow_knum = %d [%d]", ptpcoll_module->pow_knum, - ptpcoll_module->pow_k)); - ptpcoll_module->pow_ktype = PTPCOLL_KN_IN_GROUP; - } - } else { - ptpcoll_module->pow_ktype = PTPCOLL_KN_EXTRA; - ptpcoll_module->kn_proxy_extra_index[0] = (ptpcoll_module->super.sbgp_partner_module->my_index - - ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1); - PTPCOLL_VERBOSE(10 ,("My type is extra, pow_knum = %d [%d] my proxy %d", - ptpcoll_module->pow_knum, - ptpcoll_module->pow_k, - ptpcoll_module->kn_proxy_extra_index[0])); - } - - return OMPI_SUCCESS; - -Error: - if (NULL != ptpcoll_module->kn_proxy_extra_index) { - free(ptpcoll_module->kn_proxy_extra_index); - } - - return OMPI_ERROR; -} - -static int load_binomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - ptpcoll_module->pow_2 = pow_k_calc(2, - ptpcoll_module->group_size, - &ptpcoll_module->pow_2num); - - assert(ptpcoll_module->pow_2num == 1 << ptpcoll_module->pow_2); - assert(ptpcoll_module->pow_2num <= ptpcoll_module->group_size); - - /* Setting peer type for the binomial algorithm */ - if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_2num ) { - if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->pow_2num) { - PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); - ptpcoll_module->pow_2type = PTPCOLL_PROXY; - ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index + - ptpcoll_module->pow_2num; - } else { - PTPCOLL_VERBOSE(10 ,("My type is in group, pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); - ptpcoll_module->pow_2type = PTPCOLL_IN_GROUP; - } - } else { - PTPCOLL_VERBOSE(10 ,("My type is extra, pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); - ptpcoll_module->pow_2type = PTPCOLL_EXTRA; - ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index - - ptpcoll_module->pow_2num; - } - return OMPI_SUCCESS; -} - -static int load_recursive_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) -{ - int rc =
OMPI_SUCCESS; - rc = netpatterns_setup_recursive_knomial_tree_node( - ptpcoll_module->group_size, - ptpcoll_module->super.sbgp_partner_module->my_index, - mca_bcol_ptpcoll_component.k_nomial_radix, - &ptpcoll_module->knomial_exchange_tree); - return rc; -} - -static int bcol_ptpcoll_collreq_init(opal_free_list_item_t *item, void* ctx) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module= (mca_bcol_ptpcoll_module_t *) ctx; - mca_bcol_ptpcoll_collreq_t *collreq = (mca_bcol_ptpcoll_collreq_t *) item; - - switch(mca_bcol_ptpcoll_component.barrier_alg) { - case 1: - collreq->requests = (ompi_request_t **) - calloc(2, sizeof(ompi_request_t *)); - break; - case 2: - collreq->requests = (ompi_request_t **) - calloc(2 * ptpcoll_module->k_nomial_radix, sizeof(ompi_request_t *)); - break; - } - - if (NULL == collreq->requests) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - return OPAL_SUCCESS; -} - -/* query to see if the module is available for use on the given - * communicator, and if so, what it's priority is. This is where - * the backing shared-memory file is created. - */ -mca_bcol_base_module_t **mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, - int *num_modules) -{ - int rc; - /* local variables */ - struct ompi_communicator_t *comm = sbgp->group_comm; - mca_bcol_ptpcoll_module_t *ptpcoll_module = NULL; - mca_bcol_base_module_t **ptpcoll_modules = NULL; - int iovec_size; - - /* initialize local variables */ - *num_modules = 0; - - /* - * This is activated only for intra-communicators - */ - if (OMPI_COMM_IS_INTER(comm) ) { - return NULL; - } - - /* allocate and initialize an sm-v2 module */ - ptpcoll_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *)); - if (NULL == ptpcoll_modules) { - return NULL; - } - - ptpcoll_module = OBJ_NEW(mca_bcol_ptpcoll_module_t); - if (NULL == ptpcoll_module) { - free(ptpcoll_modules); - return NULL; - } - - /* On this stage we support only one single module */ - ptpcoll_modules[*num_modules] = &(ptpcoll_module->super); - - (*num_modules)++; - /* set the subgroup */ - ptpcoll_module->super.sbgp_partner_module = sbgp; - /* caching some useful information */ - ptpcoll_module->group_size = - ptpcoll_module->super.sbgp_partner_module->group_size; - - rc = load_binomial_info(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to load knomial info")); - goto CLEANUP; - } - - rc = load_knomial_info(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to load knomial info")); - goto CLEANUP; - } - - rc = load_narray_tree(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to load narray tree")); - goto CLEANUP; - } - - rc = load_narray_knomial_tree(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to load narray-knomila tree")); - goto CLEANUP; - } - - rc = load_recursive_knomial_info(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to load recursive knomial tree")); - goto CLEANUP; - } - - /* creating collfrag free list */ - OBJ_CONSTRUCT(&ptpcoll_module->collreqs_free, opal_free_list_t); - rc = opal_free_list_init (&ptpcoll_module->collreqs_free, - sizeof(mca_bcol_ptpcoll_collreq_t), - BCOL_PTP_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_ptpcoll_collreq_t), - 0, BCOL_PTP_CACHE_LINE_SIZE, - 256 /* free_list_num */, - -1 /* free_list_max, -1 = infinite */, - 32 /* free_list_inc */, - NULL, 0, NULL, - bcol_ptpcoll_collreq_init, - ptpcoll_module); - if (OMPI_SUCCESS != rc) { - goto CLEANUP; - } - - load_func(ptpcoll_module); - - 
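The load_binomial_info step above folds a group that is not a power of two onto the largest power of two inside it: the low pow_2num ranks run the recursive exchange, and every leftover rank pairs with the proxy exactly pow_2num below it. For an assumed group of 10:

int pow_2num;
int pow_2 = pow_k_calc(2, 10, &pow_2num);   /* pow_2 = 3, pow_2num = 8 */
/* ranks 0..1: PTPCOLL_PROXY,    extra partner = rank + 8 -> ranks 8, 9
 * ranks 2..7: PTPCOLL_IN_GROUP, no partner
 * ranks 8..9: PTPCOLL_EXTRA,    proxy = rank - 8 -> ranks 0, 1 */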
rc = alloc_allreduce_offsets_array(ptpcoll_module); - if (OMPI_SUCCESS != rc) { - goto CLEANUP; - } - - /* Allocating iovec for PTP alltoall */ - iovec_size = ptpcoll_module->group_size / 2 + ptpcoll_module->group_size % 2; - ptpcoll_module->alltoall_iovec = (struct iovec *) malloc(sizeof(struct iovec) - * iovec_size); - ptpcoll_module->log_group_size = lognum(ptpcoll_module->group_size); - - rc = mca_bcol_base_bcol_fns_table_init(&(ptpcoll_module->super)); - if (OMPI_SUCCESS != rc) { - goto CLEANUP; - } - - /* Zero copy is supported */ - ptpcoll_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY; - - /* return */ - return ptpcoll_modules; - -CLEANUP: - - OBJ_RELEASE(ptpcoll_module); - free(ptpcoll_modules); - return NULL; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c deleted file mode 100644 index d8fe566543..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c +++ /dev/null @@ -1,405 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/bcol/bcol.h" -#include "bcol_ptpcoll_reduce.h" -#include "bcol_ptpcoll_utils.h" - -static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - -static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args); - - -#define NARRAY_RECV_NB(narray_node, process_shift, group_size, \ - recv_buffer, pack_len, tag, comm, recv_requests, \ - num_pending_recvs) \ -do { \ - int n, rc = OMPI_SUCCESS; \ - int dst; \ - int comm_dst; \ - int offset = 0 ; \ - \ - /* Recieve data from all relevant childrens */ \ - for (n = 0; n < narray_node->n_children; n++) { \ - \ - dst = narray_node->children_ranks[n] + process_shift; \ - if (dst >= group_size) { \ - dst -= group_size; \ - } \ - comm_dst = group_list[dst]; \ - \ - /* Non blocking send .... 
*/ \ - PTPCOLL_VERBOSE(1 , ("Reduce, Irecv data to %d[%d], count %d, tag %d, addr %p", \ - dst, comm_dst, pack_len, tag, \ - data_buffer)); \ - rc = MCA_PML_CALL(irecv((void *)((unsigned char*)recv_buffer + offset), pack_len, MPI_BYTE, \ - comm_dst, tag, comm, \ - &(recv_requests[*num_pending_recvs]))); \ - if( OMPI_SUCCESS != rc ) { \ - PTPCOLL_VERBOSE(10, ("Failed to start non-blocking receive")); \ - return OMPI_ERROR; \ - } \ - ++(*num_pending_recvs); \ - offset += pack_len; \ - } \ -} while(0) - - -static inline int narray_reduce(void *data_buffer, void *recv_buffer, - int nrecvs, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int *reduction_status) { - int pack_len = count * dtype->super.size; - int i = 0; - void *source_buffer = NULL, *result_buffer = NULL; - - source_buffer = data_buffer; - result_buffer = recv_buffer; - - for (i = 0; i < nrecvs; i++) { - ompi_op_reduce(op, (void*)((unsigned char*) source_buffer) , - (void*)((unsigned char*) result_buffer), - count,dtype); - - source_buffer = (void *)((unsigned char*)recv_buffer - + (i+1) * pack_len); - } - - *reduction_status = 1; - return OMPI_SUCCESS; -} -static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag = -1; - int rc; - int group_size = ptpcoll_module->group_size; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - uint32_t buffer_index = input_args->buffer_index; - struct ompi_op_t *op = input_args->op; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **send_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - ompi_request_t **recv_requests = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1]; - void *data_buffer = NULL; - void *src_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - void *recv_buffer = (void *) ( - (unsigned char *)input_args->rbuf + - (size_t)input_args->rbuf_offset); - int count = input_args->count; - struct ompi_datatype_t *dtype = input_args->dtype; - int pack_len = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int matched = false; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int relative_group_index = 0; - netpatterns_tree_node_t *narray_node = NULL; - bool not_sent = false; - int parent_rank = -1, comm_parent_rank = -1; - int group_root_index = input_args->root; - - if (!ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called) { - bcol_ptpcoll_reduce_narray(input_args, const_args); - } - /* - * By default the src buffer is the data buffer, - * only after reduction, the recv buffer becomes the - * data buffer - */ - data_buffer = src_buffer; - - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index +=group_size; - } - - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level tags */ - tag = -tag; - - narray_node = &ptpcoll_module->narray_node[relative_group_index]; - - PTPCOLL_VERBOSE(3, ("reduce, Narray tree Progress")); - - PTPCOLL_VERBOSE(8, 
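narray_reduce above accumulates everything into the first block of the receive buffer: pass 0 folds the caller's own data into child 0's block, and each later pass folds the next child's block into that same location, which is why the caller afterwards treats recv_buffer as the data buffer. The same loop restated as a standalone sketch:

/* Sketch: recv holds n child contributions, each pack_len bytes. */
static void narray_reduce_sketch(void *own, char *recv, int n, int count,
                                 struct ompi_datatype_t *dtype,
                                 struct ompi_op_t *op, size_t pack_len)
{
    void *src = own;                                 /* first operand    */
    for (int i = 0; i < n; i++) {
        ompi_op_reduce(op, src, recv, count, dtype); /* recv[0] op= src  */
        src = recv + (size_t)(i + 1) * pack_len;     /* next child block */
    }
}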
("bcol_ptpcoll_reduce_narray, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d [%d]" - "buff: %p ", - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, input_args->root_route->rank, - data_buffer)); - - /* - Check if the data was received - */ - if (0 != *active_requests) { - matched = mca_bcol_ptpcoll_test_all_for_match - (active_requests, recv_requests, &rc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - - /* All data was received, then do a reduction*/ - if(matched) { - narray_reduce(data_buffer, recv_buffer, narray_node->n_children, count, dtype, op, - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status); - - /* - * The reduction result is in the recv buffer, so it is the new data - * buffer - */ - data_buffer = recv_buffer; - - /* If not reduced, means also, you might not posted a send */ - not_sent = true; - } else { - PTPCOLL_VERBOSE(10, ("reduce root is started")); - return BCOL_FN_STARTED; - } - } - - /* I'm root, I'm done */ - if (input_args->root_flag) { - return BCOL_FN_COMPLETE; - } - - PTPCOLL_VERBOSE(1,("Testing Sending Match")); - - /* If send was not posted */ - /* Manju: Leaf node should never post in the progress logic */ - if (not_sent) { - parent_rank = - ptpcoll_module->narray_node[relative_group_index].parent_rank + - group_root_index; - if (parent_rank >= group_size) { - parent_rank -= group_size; - } - - comm_parent_rank = group_list[parent_rank]; - PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank)); - - rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE, - comm_parent_rank, - tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - } - - if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - /* Data has not been sent. Return that the collective has been stated - * because we MUST call test on this request once it is finished to - * ensure that it is properly freed. */ - return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - -static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args, - struct mca_bcol_base_function_t *const_args) -{ - mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; - - int tag; - int rc; - int group_size = ptpcoll_module->group_size; - int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; - uint32_t buffer_index = input_args->buffer_index; - - struct ompi_op_t *op = input_args->op; - ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; - ompi_request_t **recv_requests = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1]; - ompi_request_t **send_request = - &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; - - void *data_buffer = NULL; - void *src_buffer = (void *) ( - (unsigned char *)input_args->sbuf + - (size_t)input_args->sbuf_offset); - void *recv_buffer = (void *) ( - (unsigned char *)input_args->rbuf + - (size_t)input_args->rbuf_offset); - int count = input_args->count; - struct ompi_datatype_t *dtype = input_args->dtype; - int pack_len = input_args->count * input_args->dtype->super.size; - int *active_requests = - &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); - int matched = true; - int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_root_index = -1; - int relative_group_index = 0; - netpatterns_tree_node_t *narray_node = NULL; - int parent_rank = -1, comm_parent_rank = -1; - - - /* This is first function call that should be called, not progress. - * The fragmentation code does this, so switch from progress to here. - * The flag indicates whether, we have entered this code * - */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called = true; - - PTPCOLL_VERBOSE(1, ("Reduce, Narray tree")); - /* reset active request counter */ - (*active_requests) = 0; - /* keep tag within the limit support by the pml */ - tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); - /* mark this as a collective tag, to avoid conflict with user-level flags */ - tag = -tag; - - PTPCOLL_VERBOSE(1, ("bcol_ptpcoll_reduce_narray, buffer index: %d " - "tag: %d " - "tag_mask: %d " - "sn: %d " - "root: %d " - "buff: %p ", - buffer_index, tag, - ptpcoll_module->tag_mask, input_args->sequence_num, - input_args->root_flag, - src_buffer)); - - /* Compute Root Index Shift */ - group_root_index = input_args->root; - relative_group_index = my_group_index - group_root_index; - if (relative_group_index < 0) { - relative_group_index += group_size; - } - - narray_node = &ptpcoll_module->narray_node[relative_group_index]; - - if (0 == narray_node->n_children) { - PTPCOLL_VERBOSE(10, ("I'm leaf of the data")); - /* - * I'm root of the operation - * send data to N childrens - */ - data_buffer = src_buffer; - goto NARRAY_SEND_DATA; - } - - /* Not leaf, either an internal node or root */ - NARRAY_RECV_NB(narray_node, group_root_index, group_size, - recv_buffer, pack_len, tag, comm, recv_requests, - active_requests); - - - /* We have not done reduction, yet */ - ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status = 0; - - /* We can not block. 
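Both the first-call path and the progress path re-root the precomputed N-array tree by shifting every rank by the root index modulo the group size, so one static tree serves any root. Worked numbers for an assumed group of 8 with root 5:

/* my_index = 0, root = 5: relative = (0 - 5 + 8) % 8 = 3 -> narray_node[3];
 * the parent is shifted back: parent = (narray_node[3].parent_rank + 5) % 8.
 * Rank 5 itself maps to relative index 0, the root of the static tree. */
static inline int relative_index(int my_index, int root, int group_size)
{
    int rel = my_index - root;
    return (rel < 0) ? rel + group_size : rel;
}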
So run a couple of tests for data arrival */ - matched = mca_bcol_ptpcoll_test_all_for_match - (active_requests, recv_requests, &rc); - - /* Check if we received the data */ - if(matched) { - - narray_reduce(src_buffer, recv_buffer, narray_node->n_children, - count, dtype, op, &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status); - PTPCOLL_VERBOSE(1, ("Reduce, received data from all children ")); - data_buffer = recv_buffer; - - } else { - - PTPCOLL_VERBOSE(1, ("reduce root is started")); - return BCOL_FN_STARTED; - } - - /* I'm root, I'm done */ - if (input_args->root_flag) { - return BCOL_FN_COMPLETE; - } - - -NARRAY_SEND_DATA: - - /* - * Send the data (the reduction result in the case of internal nodes, or just - * the local data in the case of leaf nodes) to the parent - */ - narray_node = &ptpcoll_module->narray_node[relative_group_index]; - - parent_rank = - ptpcoll_module->narray_node[relative_group_index].parent_rank + - group_root_index; - if (parent_rank >= group_size) { - parent_rank -= group_size; - } - - comm_parent_rank = group_list[parent_rank]; - PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank)); - - rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE, - comm_parent_rank, - tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request)); - if( OMPI_SUCCESS != rc ) { - PTPCOLL_VERBOSE(10, ("Failed to send data")); - return OMPI_ERROR; - } - - /* We cannot block, so run a couple of tests for completion */ - if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) { - PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); - /* The send has not completed yet; report that the collective was started */ - return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; - } - - return BCOL_FN_COMPLETE; -} - - -int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super) -{ - mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; - mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; - - PTPCOLL_VERBOSE(1,("Initialization Reduce - Narray")); - comm_attribs.bcoll_type = BCOL_REDUCE; - comm_attribs.comm_size_min = 0; - comm_attribs.comm_size_max = 1024 * 1024; - comm_attribs.waiting_semantics = NON_BLOCKING; - - inv_attribs.bcol_msg_min = 0; - inv_attribs.bcol_msg_max = 20000; /* range 1 */ - - inv_attribs.datatype_bitmap = 0xffffffff; - inv_attribs.op_types_bitmap = 0xffffffff; - - - comm_attribs.data_src = DATA_SRC_KNOWN; - mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, - bcol_ptpcoll_reduce_narray, - bcol_ptpcoll_reduce_narray_progress); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h deleted file mode 100644 index 195ce7fad9..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_PTPCOLL_REDUCE_H -#define MCA_BCOL_PTPCOLL_REDUCE_H - -#include "ompi_config.h" -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" - -BEGIN_C_DECLS - -int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super); - -END_C_DECLS - -#endif /* MCA_BCOL_PTPCOLL_REDUCE_H */ diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.c deleted file mode 100644 index 9677c4ba93..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "ompi_config.h" - -#include "bcol_ptpcoll.h" -#include "bcol_ptpcoll_utils.h" - -/* - * Return floor(log_k(number)): the exponent of the largest power of K that - * does not exceed the number; that power itself is stored in *out_number. - */ -int pow_k_calc(int k, int number, int *out_number) -{ - int power = 0; - int n = 1; - - while (n < number) { - n *= k; - ++power; - } - - if (n > number) { - n /= k; - --power; - } - if (NULL != out_number) { - *out_number = n; - } - - return power; -} - -/* - * Communicator rank to group index conversion function for K-nomial tree. - * Complexity: (K-1) log_K N - * - * Input: - * my_group_index - my process index in the group - * comm_source - the communicator rank of the source of data - * radix - radix of K-nomial tree - * group_size - the size of my group - * group_array[] - one to one map from group index to communicator rank - * - * Output: - * Group index for comm_source. - */ - -int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source, - int group_size, int *group_array, int *pow_distance) -{ - int group_index; - int i; - *pow_distance = 0; - - for (i = 1; i < group_size; i<<=1, (*pow_distance)++) { - group_index = my_group_index ^ i; - if (comm_source == group_array[group_index]) { - return group_index; - } - } - - *pow_distance = -1; - return -1; -} - -int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix, - int group_size, int *group_array, int *pow_distance) -{ - int group_index; - int offset = 1; /* offset equal to 1 (radix_power) */ - int radix_power = 1; /* radix power 0 */ - *pow_distance = 0; - - /* - * Go through the range of possible offsets from my rank; - * for each offset we calculate the k-nomial tree root.
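pow_k_calc above overshoots by factors of k and then steps back once if needed, so it returns floor(log_k(number)) together with the matching power of k. A worked call:

int nearest;
int power = pow_k_calc(2, 10, &nearest);
/* n: 1 -> 2 -> 4 -> 8 -> 16 (power 4); 16 > 10, so step back: n = 8,
 * power = 3. Result: power == 3 and nearest == 8, the largest power
 * of 2 that does not exceed 10. */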
- */ - while(offset < group_size) { - /* K-nomial tree root calculation for the offset */ - if (offset % (radix * radix_power)) { - group_index = my_group_index - offset; - /* wrap around if the group is negative */ - if (group_index < 0) { - group_index += group_size; - } - PTPCOLL_VERBOSE(10, ("Checking %d", group_index)); - if (comm_source == group_array[group_index]) { - return group_index; - } - offset += radix_power; - } else { - /* we done with the section of the tree, go to next one */ - radix_power *= radix; - (*pow_distance)++; - } - } - - /* No source was found, return -1 */ - *pow_distance = -1; - return -1; -} - -int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array) -{ - int group_index; - int radix_power = 1; /* radix power 0 */ - int offset = 1; /* offset equal to 1 (radix_power) */ - - /* - * Go trough range of possible offsets from my rank, - * for each offset we calculate k-nomial tree root. - */ - while(offset < group_size) { - /* K-nomial tree root calculation for the offset */ - if (offset % (radix * radix_power)) { - group_index = my_group_index - offset; - /* wrap around if the group is negative */ - if (group_index < 0) { - group_index += group_size; - } - if (comm_source == group_array[group_index]) { - return group_index; - } - offset += radix_power; - } else { - /* we done with the section of the tree, go to next one */ - radix_power *= radix; - } - } - - /* No source was found, return -1 */ - return -1; -} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.h deleted file mode 100644 index 231a9f139b..0000000000 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_utils.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_PTPCOLL_UTILS_H -#define MCA_BCOL_PTPCOLL_UTILS_H - -#include "ompi_config.h" - -#include "ompi/mca/rte/rte.h" - -BEGIN_C_DECLS - -/* - * Return closet power of K, for the number - */ -int pow_k_calc(int k, int number, int *out_number); - -/* - * Communicator rank to group index conversion function for K-nomial tree. - */ -int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array); - -/* the same like above, just more information on return */ -int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix, - int group_size, int *group_array, int *pow_distance); - -int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source, - int group_size, int *group_array, int *pow_distance); -/* - * Error and debug Macros/Functions - */ -static inline int mca_bcol_ptpcoll_err(const char* fmt, ...) 
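For contrast with the k-nomial walk above: the binomial variant earlier in this file needs no modular walk at all, because binomial partners differ from my_group_index in exactly one bit, so the search simply flips each bit in turn. The same logic restated compactly:

/* Binomial partner search: candidates are my_index ^ 2^d for each bit d. */
static int find_binomial_source(int my_index, int comm_source, int group_size,
                                const int *group_array, int *pow_distance)
{
    for (int i = 1, d = 0; i < group_size; i <<= 1, d++) {
        if (comm_source == group_array[my_index ^ i]) {
            *pow_distance = d;
            return my_index ^ i;       /* group index of the source     */
        }
    }
    *pow_distance = -1;                /* not a partner at any distance */
    return -1;
}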
-{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - -#define PTPCOLL_ERROR(args) \ - do { \ - mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_ptpcoll_err args; \ - mca_bcol_ptpcoll_err("\n"); \ - } while(0) - -#if OPAL_ENABLE_DEBUG -#define PTPCOLL_VERBOSE(level, args) \ - do { \ - if (mca_bcol_ptpcoll_component.verbose >= level) { \ - mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_ptpcoll_err args; \ - mca_bcol_ptpcoll_err("\n"); \ - } \ - } while(0) -#else -#define PTPCOLL_VERBOSE(level, args) -#endif - -END_C_DECLS - -#endif diff --git a/ompi/mca/bcol/ptpcoll/owner.txt b/ompi/mca/bcol/ptpcoll/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/bcol/ptpcoll/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/coll/ml/Makefile.am b/ompi/mca/coll/ml/Makefile.am deleted file mode 100644 index a4022ebf7d..0000000000 --- a/ompi/mca/coll/ml/Makefile.am +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights -# reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2016 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -AM_LFLAGS = -Pcoll_ml_config_yy -LEX_OUTPUT_ROOT = lex.coll_ml_config_yy - -dist_ompidata_DATA = \ - mca-coll-ml.config \ - help-mpi-coll-ml.txt - -sources = coll_ml.h \ - coll_ml_inlines.h \ - coll_ml_module.c \ - coll_ml_allocation.h \ - coll_ml_allocation.c \ - coll_ml_barrier.c \ - coll_ml_bcast.c \ - coll_ml_colls.h \ - coll_ml_component.c \ - coll_ml_copy_fns.c \ - coll_ml_descriptors.c \ - coll_ml_functions.h \ - coll_ml_hier_algorithms.c \ - coll_ml_hier_algorithms_setup.c \ - coll_ml_hier_algorithms_bcast_setup.c \ - coll_ml_hier_algorithms_allreduce_setup.c \ - coll_ml_hier_algorithms_reduce_setup.c \ - coll_ml_hier_algorithms_common_setup.c \ - coll_ml_hier_algorithms_common_setup.h \ - coll_ml_hier_algorithms_allgather_setup.c \ - coll_ml_hier_algorithm_memsync_setup.c \ - coll_ml_custom_utils.h \ - coll_ml_custom_utils.c \ - coll_ml_progress.c \ - coll_ml_reduce.c \ - coll_ml_allreduce.c \ - coll_ml_allgather.c \ - coll_ml_mca.h \ - coll_ml_mca.c \ - coll_ml_lmngr.h \ - coll_ml_lmngr.c \ - coll_ml_hier_algorithms_barrier_setup.c \ - coll_ml_select.h \ - coll_ml_select.c \ - coll_ml_memsync.c \ - coll_ml_lex.h \ - coll_ml_lex.l \ - coll_ml_config.c \ - coll_ml_config.h - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_coll_ml_DSO -component_install += mca_coll_ml.la -else -component_noinst += libmca_coll_ml.la -endif - -# See ompi/mca/btl/ml/Makefile.am for an explanation of -# libmca_common_ml.la. 
- -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_coll_ml_la_SOURCES = $(sources) -mca_coll_ml_la_LDFLAGS = -module -avoid-version -mca_coll_ml_la_LIBADD = - - -noinst_LTLIBRARIES = $(component_noinst) -libmca_coll_ml_la_SOURCES =$(sources) -libmca_coll_ml_la_LDFLAGS = -module -avoid-version - -maintainer-clean-local: - rm -f coll_ml_lex.c diff --git a/ompi/mca/coll/ml/coll_ml.h b/ompi/mca/coll/ml/coll_ml.h deleted file mode 100644 index ab03c4f3e4..0000000000 --- a/ompi/mca/coll/ml/coll_ml.h +++ /dev/null @@ -1,1022 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#ifndef MCA_COLL_ML_ML_H -#define MCA_COLL_ML_ML_H - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/datatype/opal_convertor.h" -#include "opal/threads/mutex.h" - -#include "ompi/mca/coll/coll.h" -#include "ompi/request/request.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/op/op.h" -#include "opal/class/opal_free_list.h" - -#include "coll_ml_lmngr.h" -#include "coll_ml_functions.h" -#include "coll_ml_colls.h" -#include "coll_ml_allocation.h" -#include "coll_ml_config.h" - -BEGIN_C_DECLS - -/* macros for return status */ -enum { - ML_OMPI_COMPLETE = 1, - ML_OMPI_INCOMPLETE -}; - -enum { - ML_SMALL_MSG, - ML_LARGE_MSG, - ML_NUM_MSG -}; - -/* ML collectives IDs */ -enum { - /* blocking functions */ - ML_ALLGATHER, - ML_ALLGATHERV, - ML_ALLREDUCE, - ML_ALLTOALL, - ML_ALLTOALLV, - ML_ALLTOALLW, - ML_BARRIER, - ML_BCAST, - ML_EXSCAN, - ML_GATHER, - ML_GATHERV, - ML_REDUCE, - ML_REDUCE_SCATTER, - ML_SCAN, - ML_SCATTER, - ML_SCATTERV, - ML_FANIN, - ML_FANOUT, - - /* nonblocking functions */ - ML_IALLGATHER, - ML_IALLGATHERV, - ML_IALLREDUCE, - ML_IALLTOALL, - ML_IALLTOALLV, - ML_IALLTOALLW, - ML_IBARRIER, - ML_IBCAST, - ML_IEXSCAN, - ML_IGATHER, - ML_IGATHERV, - ML_IREDUCE, - ML_IREDUCE_SCATTER, - ML_ISCAN, - ML_ISCATTER, - ML_ISCATTERV, - ML_IFANIN, - ML_IFANOUT, - ML_NUM_OF_FUNCTIONS -}; - -/* ML broadcast algorithms */ -enum { - COLL_ML_STATIC_BCAST, - COLL_ML_SEQ_BCAST, - COLL_ML_UNKNOWN_BCAST, -}; - -struct mca_bcol_base_module_t; - -/* collective function arguments - gives - * one function signature for calling all collective setup - * routines, with the initial call to a collective function having - * the context to access the right parts of the data structure. - * this information is used by each of the setup functions to - * setup the correct information for each of the functions in the - * hierarchy that will be called. 
 */ - -/* RLG NOTE: Need to figure out what arguments to store here, - * and which ones directly in the message descriptor - */ -struct mpi_coll_fn_params_t { - union { - struct { - ompi_communicator_t *comm; - int n_fanin_steps; - int n_fanout_steps; - int n_recursive_doubling_steps; - } ibarrier_recursive_doubling; - - struct { - int root; - ompi_communicator_t *comm; - struct ompi_datatype_t *datatype; - } ibcast; - } coll_fn; -}; -typedef struct mpi_coll_fn_params_t mpi_coll_fn_params_t; - -/* algorithm parameters needed for the setup function */ -struct mpi_coll_algorithm_params_t { - union { - struct { - int n_fanin_steps; - int n_fanout_steps; - int n_recursive_doubling_steps; - } ibarrier_recursive_doubling; - - struct { - int place_holder; - } ibcast; - } coll_fn; -}; -typedef struct mpi_coll_algorithm_params_t mpi_coll_algorithm_params_t; - -/* setup function - used to set up each segment (or fragment) - * to be processed - */ -struct mca_coll_ml_module_t; -struct mca_coll_ml_topology_t; - -typedef int (*coll_fragment_comm_setup_fn)(struct mca_coll_ml_module_t *ml_module, - mpi_coll_fn_params_t *fn_params, mpi_coll_algorithm_params_t *algorithm_params); -/* full collective description */ -struct coll_ml_collective_description_t { - /* number of temp buffers */ - int n_buffers; - - /* description size */ - int n_functions; - - /* collective setup function - called for every non-blocking - * function, and for each fragment of such a message - */ - coll_fragment_comm_setup_fn *coll_fn_setup_fn; - - /* algorithm parameters */ - mpi_coll_algorithm_params_t alg_params; - - /* list of functions */ - mca_bcol_base_function_t *functions; - - /* function names - for debugging */ - char **function_names; - - /* Signalling collective completion */ - bool completion_flag; -}; - -typedef struct coll_ml_collective_description_t coll_ml_collective_description_t; - -/* Utility data structure */ -struct rank_properties_t { - int rank; - int leaf; - int num_of_ranks_represented; -}; typedef struct rank_properties_t rank_properties_t; - -/* data structure for holding node information for the nodes of the - * hierarchical communications tree. - */ -struct sub_group_params_t { - /* rank of root in the communicator */ - int root_rank_in_comm; - - /* index in subgroup */ - int root_index; - - /* number of ranks in subgroup */ - int n_ranks; - - /* index of the first element in the subgroup. The - * assumption is that - * ranks for all subgroups are stored in a single - * linear array - */ - int index_of_first_element; - - /* - * level in the hierarchy - subgroups at the same - * level don't overlap. May not be the same as the - * sbgp level. - */ - int level_in_hierarchy; - - /* - * Information on the ranks in the subgroup. This includes - * the rank, and whether or not the rank is a source/sink - * of data in this subgroup, or just a "pass through". - */ - rank_properties_t *rank_data; - - /* level one index - for example, - for( i = 0; i < level_one_index; i++) will loop - through all level one subgroups, this is significant - since level one is a disjoint partitioning of all ranks - i.e. all ranks appear once and only once at level one - */ - int level_one_index; -}; -typedef struct sub_group_params_t sub_group_params_t;
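
Because every subgroup's member list lives in one shared linear array, index_of_first_element and n_ranks are enough to walk the level-one partition. A self-contained sketch with pared-down stand-in types (hypothetical values; the real traversal of this layout appears later in coll_ml_allgather.c's unpack path):

#include <stdio.h>

/* pared-down stand-ins for the sub_group_params_t fields used in the walk */
struct subgroup {
    int n_ranks;
    int index_of_first_element; /* offset into the shared linear rank array */
    int level_one_index;
};

/* visit every (subgroup, member) pair of the level-one partition; since the
 * level-one subgroups are a disjoint cover of all ranks, each rank is
 * visited exactly once */
static void walk_level_one(const struct subgroup *sg, const int *linear_ranks)
{
    int n_level_one = sg[0].level_one_index;
    for (int i = 0; i < n_level_one; i++) {
        for (int j = 0; j < sg[i].n_ranks; j++) {
            int slot = sg[i].index_of_first_element + j;
            printf("subgroup %d member %d -> rank %d\n", i, j, linear_ranks[slot]);
        }
    }
}

int main(void)
{
    /* hypothetical two level-one subgroups covering ranks {0,1,2} and {3,4} */
    struct subgroup sg[2] = { {3, 0, 2}, {2, 3, 2} };
    int linear_ranks[5] = {0, 1, 2, 3, 4};
    walk_level_one(sg, linear_ranks);
    return 0;
}
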
 - -/* function to set up information on the order of a given bcol within - * a specific ML-level algorithm. - */ -int mca_coll_ml_setup_scratch_vals(mca_coll_ml_compound_functions_t *func_list, - int *scratch_indx, int *scratch_num, int n_hiers); - -/* driver for setting up collective communication description */ - -int ml_coll_schedule_setup(struct mca_coll_ml_module_t *ml_module); - -int ml_coll_up_and_down_hier_setup( - struct mca_coll_ml_module_t *ml_module, - struct mca_coll_ml_topology_t *topo_info, - int up_function_idx, - int top_function_idx, - int down_function_idx, - int collective); - -int ml_coll_barrier_constant_group_data_setup( - struct mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t *schedule); - -/* Barrier */ -int ml_coll_hier_barrier_setup(struct mca_coll_ml_module_t *ml_module); - -/* allreduce */ -int ml_coll_hier_allreduce_setup(struct mca_coll_ml_module_t *ml_module); -int ml_coll_hier_allreduce_setup_new(struct mca_coll_ml_module_t *ml_module); -void ml_coll_hier_allreduce_cleanup_new(struct mca_coll_ml_module_t *ml_module); - -/* alltoall */ -int ml_coll_hier_alltoall_setup(struct mca_coll_ml_module_t *ml_module); -int ml_coll_hier_alltoall_setup_new(struct mca_coll_ml_module_t *ml_module); - -/* allgather */ -int ml_coll_hier_allgather_setup(struct mca_coll_ml_module_t *ml_module); -void ml_coll_hier_allgather_cleanup(struct mca_coll_ml_module_t *ml_module); - -/* gather */ -int ml_coll_hier_gather_setup(struct mca_coll_ml_module_t *ml_module); - -/* broadcast */ -int ml_coll_hier_bcast_setup(struct mca_coll_ml_module_t *ml_module); -void ml_coll_hier_bcast_cleanup(struct mca_coll_ml_module_t *ml_module); - -/* reduce */ -int ml_coll_hier_reduce_setup(struct mca_coll_ml_module_t *ml_module); -void ml_coll_hier_reduce_cleanup(struct mca_coll_ml_module_t *ml_module); - -/* scatter */ -int ml_coll_hier_scatter_setup(struct mca_coll_ml_module_t *ml_module); - -/* alltoall */ -int mca_coll_ml_alltoall(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_ml_alltoall_nb(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - - -/* allgather */ -int mca_coll_ml_allgather(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* non-blocking allgather */ -int mca_coll_ml_allgather_nb(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -/* gather */ -int mca_coll_ml_gather(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* nonblocking Barrier */ -int ml_coll_hier_nonblocking_barrier_setup(struct mca_coll_ml_module_t *ml_module, struct mca_coll_ml_topology_t *topo_info); - -/* Memory synchronization collective setup */ -int ml_coll_memsync_setup(struct mca_coll_ml_module_t *ml_module); - -/* Fragment descriptor */ -struct mca_coll_ml_descriptor_t; -struct mca_coll_ml_fragment_t { - opal_list_item_t super; - - struct mca_coll_ml_descriptor_t *full_msg_descriptor; - int offset;
/* offset for progress pointer */ - int length; /* fragment length */ - opal_convertor_t convertor; /* convertor for copy/pack data */ - - /* current function index */ - int current_fn_index; - - /* array of function arguments */ - struct bcol_function_args_t *fn_args; - -}; -typedef struct mca_coll_ml_fragment_t mca_coll_ml_fragment_t; -OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_fragment_t); - -#define MCA_COLL_ML_NO_BUFFER -1 - -#define MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, index, desc) \ -do { \ - (coll_op)->variable_fn_params.buffer_index = index; \ - (coll_op)->fragment_data.buffer_desc = desc; \ - /* pasha - why do we duplicate it ? */ \ - (coll_op)->variable_fn_params.src_desc = desc; \ - (coll_op)->variable_fn_params.hier_factor = 1; \ - (coll_op)->variable_fn_params.need_dt_support = false; \ -} while (0) - -/* Full message descriptor */ -struct mca_coll_ml_descriptor_t { - ompi_request_t super; /* base request */ - struct ompi_datatype_t *datatype; /* ompi datatype */ - size_t count; /* count of user datatype elements */ - uint32_t sequence_num; /* sequence number for collective operation */ - size_t frags_limit; /* upper limit on # of fragments */ - size_t frags_start; /* number of fragments started */ - - /* number of fragments completed */ - size_t frags_complete; - - /* number of fragments needed to process this message */ - size_t n_fragments; - - volatile bool free_resource; /* signals release resource */ - - /* pointer to reduction operation, e.g. MPI_MIN - need to handle - * user defined functions also */ - /* ompi_predefined_op_t *operation; */ - - /* pointer to a communication schedule, data struct undefined */ - struct coll_ml_collective_description_t *local_comm_description; - - /* fragment descriptor - we always have a fragment descriptor - * if we get a full message descriptor. Optimization for - * small messages */ - mca_coll_ml_fragment_t fragment; - /* The ML memory buffer index that contains the send and - receive information; if the index is -1, no buffer was allocated */ - uint64_t buffer_index; -}; -typedef struct mca_coll_ml_descriptor_t mca_coll_ml_descriptor_t; -OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_descriptor_t); - -/* sbgp and bcol module pairs */ -struct hierarchy_pairs { - mca_sbgp_base_module_t *subgroup_module; - struct mca_bcol_base_module_t **bcol_modules; - int num_bcol_modules; - int bcol_index; - mca_bcol_base_component_t *bcol_component; -}; -typedef struct hierarchy_pairs hierarchy_pairs; - -/* list of ranks in each group */ -struct ml_level_t { - int n_modules; - hierarchy_pairs *modules; -}; - -typedef struct ml_level_t ml_level_t; - -enum { - COLL_ML_HR_FULL, /* Full hierarchy topology; all bcols and sbgps participate in discovery */ - COLL_ML_HR_ALLREDUCE, - COLL_ML_HR_NBS, /* All hierarchy except base socket */ - COLL_ML_HR_SINGLE_PTP, /* Single flat ptp hierarchy */ - COLL_ML_HR_SINGLE_IBOFFLOAD, /* Single flat iboffload hierarchy */ - COLL_ML_TOPO_MAX -}; - -/* Topology-hierarchy discovery function */ -struct mca_coll_ml_module_t; /* forward declaration for the function */ - -typedef int (* mca_coll_topo_discovery_fn_t) - (struct mca_coll_ml_module_t *ml_module, int n_hierarchies); - -typedef enum { - COLL_ML_TOPO_DISABLED = 0, - COLL_ML_TOPO_ENABLED = 1 -} topo_status_t;
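
Each topology index gets its own discovery hook through a function-pointer table (topo_discovery_fn[COLL_ML_TOPO_MAX] in the component struct below). A simplified, self-contained sketch of that dispatch pattern, with hypothetical names standing in for the mca_coll_ml_*_discovery functions declared further down:

#include <stdio.h>

enum { TOPO_FULL, TOPO_ALLREDUCE, TOPO_MAX };

typedef int (*discovery_fn_t)(void *module, int n_hierarchies);

static int full_discovery(void *module, int n_hierarchies)
{
    (void) module;
    printf("full-tree discovery over %d hierarchies\n", n_hierarchies);
    return 0;
}

static int allreduce_discovery(void *module, int n_hierarchies)
{
    (void) module;
    printf("allreduce-specific discovery over %d hierarchies\n", n_hierarchies);
    return 0;
}

int main(void)
{
    /* one entry per topology index, as in topo_discovery_fn[COLL_ML_TOPO_MAX] */
    discovery_fn_t table[TOPO_MAX] = { full_discovery, allreduce_discovery };

    for (int topo = 0; topo < TOPO_MAX; topo++) {
        table[topo](NULL, 2); /* each topology runs its own discovery pass */
    }
    return 0;
}
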
 - -/** - * Structure to hold the ML coll component. First it holds the - * base coll component, and then holds a bunch of - * ML-component-specific stuff (e.g., current MCA parameter - * values). - */ -struct mca_coll_ml_component_t { - /** Base coll component */ - mca_coll_base_component_2_0_0_t super; - - /** MCA parameter: Priority of this component */ - int ml_priority; - - /** MCA parameter: subgrouping components to use */ - char *subgroups_string; - - /** MCA parameter: basic collectives components to use */ - char *bcols_string; - - /** verbosity level */ - int verbose; - - /** maximum number of communicators allowed to run ML */ - unsigned int max_comm; - - /** minimum communicator size for ML to be used */ - int min_comm_size; - - /* base sequence number to use - the expectation is that - * this will be used as a basis for generating IDs for - * specific collective operations - */ - int64_t base_sequence_number; - - /** memory pool */ - mca_coll_ml_lmngr_t memory_manager; - - /* We need it because some bcols cannot - support all possible allreduce data types */ - bool need_allreduce_support; - - int use_knomial_allreduce; - - /* use hdl_framework */ - bool use_hdl_bcast; - - /* Enable / Disable fragmentation (0 - off, 1 - on, 2 - auto) */ - int enable_fragmentation; - - /* Broadcast algorithm */ - int bcast_algorithm; - - /* frag size that is used by list memory_manager */ - size_t lmngr_block_size; - - /* alignment that is used by list memory_manager */ - size_t lmngr_alignment; - - /* list size for memory_manager */ - size_t lmngr_size; - - /* number of payload memory banks */ - int n_payload_mem_banks; - - /* number of payload buffers per bank */ - int n_payload_buffs_per_bank; - - /* size of payload buffer */ - unsigned long long payload_buffer_size; - - /* pipeline depth for msg fragmentation */ - int pipeline_depth; - - /* Free list tunings */ - int free_list_init_size; - - int free_list_grow_size; - - int free_list_max_size; - - /* - * queues for asynchronous collective progress - */ - /* tasks that have not started, either because dependencies are not - * satisfied, or resources are lacking - */ - opal_list_t pending_tasks; - opal_mutex_t pending_tasks_mutex; - - /* active incomplete tasks */ - opal_list_t active_tasks; - opal_mutex_t active_tasks_mutex; - - /* sequential collectives to progress */ - opal_list_t sequential_collectives; - opal_mutex_t sequential_collectives_mutex; - - bool progress_is_busy; - - /* Temporary hack for IMB test - not all bcols have allgather */ - bool disable_allgather; - - /* Temporary hack for IMB test - not all bcols have alltoall */ - bool disable_alltoall; - - /* Disable Reduce */ - bool disable_reduce; - - /* Brucks alltoall mca and other params */ - int use_brucks_smsg_alltoall; - - mca_coll_topo_discovery_fn_t topo_discovery_fn[COLL_ML_TOPO_MAX]; - - /* Configure file for collectives */ - char *config_file_name; - - per_collective_configuration_t coll_config[ML_NUM_OF_FUNCTIONS][ML_NUM_MSG]; -}; - -/** - * Convenience typedef - */ -typedef struct mca_coll_ml_component_t mca_coll_ml_component_t; - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC extern mca_coll_ml_component_t mca_coll_ml_component; - -struct mca_coll_ml_leader_offset_info_t { - size_t offset; - int level_one_index; - bool leader; -}; -typedef struct mca_coll_ml_leader_offset_info_t mca_coll_ml_leader_offset_info_t;
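
The three mutex-guarded lists in the component struct above (pending_tasks, active_tasks, sequential_collectives) all follow the same deferred-start pattern that the sources express with OPAL_THREAD_LOCK / opal_list_append / OPAL_THREAD_UNLOCK. In plain pthreads terms, with a toy list standing in for opal_list_t, the pattern is just:

#include <pthread.h>
#include <stdio.h>

/* toy intrusive list node standing in for opal_list_item_t */
struct task {
    struct task *next;
    int id;
};

static struct task *pending_tasks;                 /* cf. cm->pending_tasks */
static pthread_mutex_t pending_tasks_mutex = PTHREAD_MUTEX_INITIALIZER;

/* park a task that could not start yet; the progress loop drains it later */
static void defer_task(struct task *t)
{
    pthread_mutex_lock(&pending_tasks_mutex);
    t->next = pending_tasks;
    pending_tasks = t;
    pthread_mutex_unlock(&pending_tasks_mutex);
}

int main(void)
{
    struct task t = { NULL, 42 };
    defer_task(&t);
    printf("task %d parked on the pending list\n", pending_tasks->id);
    return 0;
}
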
 - -/* Topology data structure */ -struct mca_coll_ml_topology_t { - topo_status_t status; /* COLL_ML_TOPO_ENABLED or COLL_ML_TOPO_DISABLED */ - /* information on the selected groups - needed for collective - ** algorithms */ - int32_t global_lowest_hier_group_index; - int32_t global_highest_hier_group_index; - int number_of_all_subgroups; - int n_levels; - /* bcols - bits that describe supported features/modes */ - uint64_t all_bcols_mode; - mca_bcol_base_route_info_t *route_vector; - coll_ml_collective_description_t *hierarchical_algorithms[BCOL_NUM_OF_FUNCTIONS]; - sub_group_params_t *array_of_all_subgroups; - /* (sbgp, bcol) pairs */ - hierarchy_pairs *component_pairs; - /* ordering of ranks when I am the root of the operation. - * This ordering guarantees that data need to be re-ordered - * only at the first or last step in rooted operations, - * depending on whether the operation is a scatter or - * gather operation. - */ - int *sort_list; - mca_coll_ml_leader_offset_info_t *hier_layout_info; - /* are ranks laid out contiguously */ - bool ranks_contiguous; - struct ordering_info_t { - int next_inorder; - int next_order_num; - int num_bcols_need_ordering; - } topo_ordering_info; -}; -typedef struct mca_coll_ml_topology_t mca_coll_ml_topology_t; - -struct mca_coll_ml_bcol_list_item_t { - opal_list_item_t super; - mca_bcol_base_module_t *bcol_module; -}; -typedef struct mca_coll_ml_bcol_list_item_t mca_coll_ml_bcol_list_item_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_bcol_list_item_t); - -#define MCA_COLL_MAX_NUM_COLLECTIVES 40 /* upper bound; the exact number of collectives is not tracked here */ -#define MCA_COLL_MAX_NUM_SUBTYPES 15 /* Maximum number of algorithms per collective */ - -struct mca_coll_ml_module_t { - /* base structure */ - mca_coll_base_module_t super; - - /* ML module status: false - not initialized, true - initialized */ - bool initialized; - /* communicator */ - struct ompi_communicator_t *comm; - - /* reference convertor */ - opal_convertor_t *reference_convertor; - - mca_coll_ml_topology_t topo_list[COLL_ML_TOPO_MAX]; - - /* Collectives - Topology map */ - int collectives_topology_map - [MCA_COLL_MAX_NUM_COLLECTIVES][MCA_COLL_MAX_NUM_SUBTYPES]; - - /* largest number of function calls for the collective routines. - * This is used to allocate resources */ - int max_fn_calls; - - /* collective sequence number - unique id for barrier type operations */ - int32_t no_data_collective_sequence_num; - - /* collective sequence number - unique id for each collective */ - int32_t collective_sequence_num; - - /** ompi free list of full message descriptors **/ - opal_free_list_t message_descriptors; - - /** ompi free list of message fragment descriptors **/ - opal_free_list_t fragment_descriptors; - - /** pointer to the payload memory block **/ - struct mca_bcol_base_memory_block_desc_t *payload_block; - - /** the maximum size of collective function description */ - int max_dag_size; - - /** data used to initialize coll_ml_collective_descriptors */ - struct coll_desc_init { - int max_dag_size; - size_t max_n_bytes_per_proc_total; - mca_coll_base_module_t *bcol_base_module; - } coll_desc_init_data; - - /** collective operation descriptor free list - used to manage a single - * collective operation. */ - opal_free_list_t coll_ml_collective_descriptors; - - /** multiple function collective operation support */ - /** broadcast */ - mca_coll_ml_collective_operation_description_t * - coll_ml_bcast_functions[ML_NUM_BCAST_FUNCTIONS]; - - /* bcast size selection criteria - cutoff for the largest size of - * data for which to apply the specified collective operation.
 - * This gives us the ability to choose the algorithm based on size */ - size_t bcast_cutoff_size[ML_N_DATASIZE_BINS]; - - /** Allreduce functions */ - mca_coll_ml_collective_operation_description_t * - coll_ml_allreduce_functions[ML_NUM_ALLREDUCE_FUNCTIONS]; - - /** Reduce functions */ - mca_coll_ml_collective_operation_description_t * - coll_ml_reduce_functions[ML_NUM_REDUCE_FUNCTIONS]; - - - /** scatter */ - mca_coll_ml_collective_operation_description_t * - coll_ml_scatter_functions[ML_NUM_SCATTER_FUNCTIONS]; - - /** alltoall */ - mca_coll_ml_collective_operation_description_t * - coll_ml_alltoall_functions[ML_NUM_ALLTOALL_FUNCTIONS]; - - /** allgather */ - mca_coll_ml_collective_operation_description_t * - coll_ml_allgather_functions[ML_NUM_ALLGATHER_FUNCTIONS]; - - /** gather */ - mca_coll_ml_collective_operation_description_t * - coll_ml_gather_functions[ML_NUM_GATHER_FUNCTIONS]; - - /** Barrier */ - mca_coll_ml_collective_operation_description_t * - coll_ml_barrier_function; - - /** ML Memory Synchronization collective operation */ - mca_coll_ml_collective_operation_description_t * - coll_ml_memsync_function; - - /** The table of allreduce functions for specific type and op **/ - bool allreduce_matrix[OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED][BCOL_NUM_OF_ELEM_TYPES]; - - /* data offset from ML */ - int32_t data_offset; - - int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS]; - - /* fragmentation parameters */ - int use_user_buffers; - uint64_t fragment_size; - uint32_t ml_fragment_size; - - /* Bcast index table. Pasha: Do we need to define something more generic ? - the table x 2 (large/small) */ - int bcast_fn_index_table[2]; - - /* List of pointers to bcols that have been initialized and used. - * So far we use it only for ML memory management */ - opal_list_t active_bcols_list; - - /* Buffer size required for Bruck's algorithm */ - int brucks_buffer_threshold_const; - - /* log comm size */ - /* We require this for the alltoall algorithm */ - int log_comm_size; - /* On this list we keep coll_op descriptors that could not - * start because no ML buffers were available */ - opal_list_t waiting_for_memory_list; - - /* fallback collectives */ - mca_coll_base_comm_coll_t fallback; -}; - -typedef struct mca_coll_ml_module_t mca_coll_ml_module_t; -OBJ_CLASS_DECLARATION(mca_coll_ml_module_t); - - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_coll_ml_init_query(bool enable_progress_threads, - bool enable_mpi_threads);
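
The small_message_thresholds array in the module above drives a three-way message-size dispatch that coll_ml_allgather.c (further below) spells out in full. Schematically, with hypothetical cutoff constants in place of the module fields and MCA parameters:

#include <stddef.h>
#include <stdio.h>

/* hypothetical cutoffs; the real values come from the module and MCA params */
#define SMALL_THRESHOLD ((size_t) 8 * 1024)
#define LARGE_CUTOFF    ((size_t) 1 << 20)

enum path { SINGLE_FRAGMENT, FRAGMENTED, ZERO_COPY };

/* mirrors the branch structure of mca_coll_ml_allgather_start below */
static enum path choose_path(size_t pack_len, int comm_size, int enable_fragmentation)
{
    if (pack_len <= SMALL_THRESHOLD) {
        return SINGLE_FRAGMENT;            /* whole message fits one ML buffer */
    }
    if (enable_fragmentation || pack_len * comm_size < LARGE_CUTOFF) {
        return FRAGMENTED;                 /* pipeline of threshold-sized frags */
    }
    return ZERO_COPY;                      /* large data: work from user buffers */
}

int main(void)
{
    printf("%d %d %d\n",
           choose_path(1024, 16, 1),       /* SINGLE_FRAGMENT */
           choose_path(64 * 1024, 4, 1),   /* FRAGMENTED */
           choose_path(4 << 20, 64, 0));   /* ZERO_COPY */
    return 0;
}
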
 - -/* query to see if the module is available for use on the given - * communicator and, if so, what its priority is. This is where - * the backing shared-memory file is created. - */ -mca_coll_base_module_t * -mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority); - -/* Barrier - blocking */ -int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* Barrier - non-blocking */ -int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -/* Allreduce using the EXTRA TOPO - blocking */ -int mca_coll_ml_allreduce_dispatch(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - -/* Allreduce using the EXTRA TOPO - non-blocking */ -int mca_coll_ml_allreduce_dispatch_nb(const void *sbuf, void *rbuf, int count, - ompi_datatype_t *dtype, ompi_op_t *op, - ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -/* Allreduce - blocking */ -int mca_coll_ml_allreduce(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* Allreduce - non-blocking */ -int mca_coll_ml_allreduce_nb(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -/* Reduce - blocking */ -int mca_coll_ml_reduce(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -int mca_coll_ml_reduce_nb(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *module, int bank_index); - - -int coll_ml_progress_individual_message(mca_coll_ml_fragment_t *frag_descriptor); - -/* - * the ml entry point for the broadcast function - */ -int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); -int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -/* - * The ml function interface for non-blocking routines - */ -int mca_coll_ml_bcast_unknown_root_nb(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); - -int mca_coll_ml_bcast_known_root_nb(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module); -OMPI_DECLSPEC int mca_coll_ml_bcast_unknown_root_with_frags_nb(void *buf, int count, - struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, mca_coll_base_module_t *module); - -/* This routine sets up a sequential hierarchical scatter algorithm. The - * assumptions are that each rank knows in which sub-group the data will show - * up first, and that the scatter is executed sequentially, one subgroup at a - * time.
This is needed when the full collective must be specified before - * the collective operation starts up. The algorithm handles all data sizes - * and data types. - */ - -OMPI_DECLSPEC int mca_coll_ml_scatter_sequential( - void *sbuf, int scount, struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, struct ompi_datatype_t *rdtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); - -#if 0 -int mca_coll_ml_bcast_small_dynamic_root(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int mca_coll_ml_bcast_small_known_root(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -#endif - -/* Topology discovery functions */ - -int mca_coll_ml_fulltree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies); -int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies); -int mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies); -int mca_coll_ml_fulltree_ptp_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies); -int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies); - -void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module, - const mca_bcol_base_component_2_0_0_t *bcol_component); -static inline int mca_coll_ml_err(const char* fmt, ...) -{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - - -#define ML_ERROR(args) \ -do { \ - mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_coll_ml_err args; \ - mca_coll_ml_err("\n"); \ -} while(0) - -#if OPAL_ENABLE_DEBUG -#define ML_VERBOSE(level, args) \ -do { \ - if(mca_coll_ml_component.verbose >= level) { \ - mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_coll_ml_err args; \ - mca_coll_ml_err("\n"); \ - } \ -} while(0) - -#else -#define ML_VERBOSE(level, args) -#endif - -#define IS_BCOL_TYPE_IDENTICAL(bcol1, bcol2) \ - ( (NULL != bcol1 && NULL != bcol2) && \ - ( /* check if the lengths are the same */ \ - (strlen(((mca_base_component_t *)((bcol1)->bcol_component))->mca_component_name) == \ - strlen(((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name)) \ - && /* check if the strings are identical */ \ - (0 == strncmp(((mca_base_component_t *)((bcol1)->bcol_component))->mca_component_name, \ - ((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name, \ - strlen(((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name))) \ - ) ? true : false) - -#define GET_BCOL(module, indx) ((module)->component_pairs[(indx)].bcol_modules[0]) - -#define GET_BCOL_SYNC_FN(bcol) ((bcol)->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING] \ - [BCOL_SYNC][1][0][0]) - -/* Allocator macros */ -#define BUFFER_INDEX(bank,nbuffs,buffer) (bank*nbuffs+buffer)
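
The ML_GET_FRAG_SIZE macro that follows is dense, but it simply computes min(bytes remaining, per-collective small-message threshold). An equivalent plain function, in a hypothetical standalone form, for clarity:

#include <stddef.h>

/* what ML_GET_FRAG_SIZE (below) computes: the next fragment is either the
 * remaining payload or the per-collective threshold, whichever is smaller */
static size_t ml_frag_size(size_t n_bytes_total, size_t n_bytes_scheduled,
                           size_t small_message_threshold)
{
    size_t remaining = n_bytes_total - n_bytes_scheduled;
    return remaining < small_message_threshold ? remaining : small_message_threshold;
}
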
 - -#define ML_GET_FRAG_SIZE(op, coll) \ - ((op)->fragment_data.message_descriptor->n_bytes_total - \ - (op)->fragment_data.message_descriptor->n_bytes_scheduled < \ - (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll] ? \ - (op)->fragment_data.message_descriptor->n_bytes_total - \ - (op)->fragment_data.message_descriptor->n_bytes_scheduled : \ - (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll]) - -/* Abort the MPI process in case of a fatal error */ -void mca_coll_ml_abort_ml(char *message); - -#define ML_SET_VARIABLE_PARAMS_BCAST(op, ml, cnt, datatype, b_desc, \ - s_offset, r_offset, frag_len, buf) \ -do { \ - op->variable_fn_params.sequence_num = \ - OPAL_THREAD_ADD32(&((ml)->collective_sequence_num), 1); \ - op->variable_fn_params.count = cnt; \ - op->variable_fn_params.dtype = datatype; \ - op->variable_fn_params.buffer_index = (b_desc)->buffer_index; \ - op->variable_fn_params.src_desc = (b_desc); \ - op->variable_fn_params.sbuf_offset = s_offset; \ - op->variable_fn_params.rbuf_offset = r_offset; \ - op->variable_fn_params.frag_size = frag_len; \ - op->variable_fn_params.sbuf = buf; \ -} while (0) - -#define MCA_COLL_ML_OP_BASIC_SETUP(op, total_bytes, offset_into_user_buff, src, dst, collective_schedule) \ - do { \ - op->coll_schedule = collective_schedule; \ - op->process_fn = NULL; \ - op->full_message.n_bytes_total = total_bytes; \ - op->full_message.n_bytes_delivered = 0; \ - op->full_message.n_bytes_scheduled = 0; \ - op->full_message.dest_user_addr = dst; \ - op->full_message.src_user_addr = src; \ - op->full_message.n_active = 0; \ - op->full_message.n_bytes_per_proc_total = 0; \ - op->full_message.send_count = 0; \ - op->full_message.recv_count = 0; \ - op->full_message.send_extent = 0; \ - op->full_message.recv_extent = 0; \ - op->full_message.offset_into_send_buffer = 0; \ - op->full_message.offset_into_recv_buffer = 0; \ - op->full_message.send_data_type = 0; \ - op->full_message.recv_data_type = 0; \ - op->full_message.fragment_launcher = 0; \ - op->sequential_routine.current_active_bcol_fn = 0; \ - op->sequential_routine.current_bcol_status = SEQ_TASK_NOT_STARTED; \ - \ - op->fragment_data.offset_into_user_buffer = offset_into_user_buff; \ - /* Pasha, is it constant ? what to put here */ \ - op->fragment_data.fragment_size = total_bytes; \ - op->fragment_data.message_descriptor = &op->full_message; \ - op->fragment_data.current_coll_op = -1; \ - } while (0) - -/* This routine re-orders and packs user data. The assumptions are that - * there is per-process data, the amount of data is the same for all ranks, - * and the user data is contiguous. - */ -int mca_coll_ml_pack_reorder_contiguous_data( - mca_coll_ml_collective_operation_progress_t *coll_op); - -/* This routine re-orders and packs user data. The assumptions are that - * there is per-process data, the amount of data is the same for all ranks, - * and the user data is noncontiguous. - */ -int mca_coll_ml_pack_reorder_noncontiguous_data( - mca_coll_ml_collective_operation_progress_t *coll_op); - -END_C_DECLS - - -#endif /* MCA_COLL_ML_ML_H */
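
The two pack/reorder entry points declared just above assume equal-sized per-rank blocks. Under that assumption, the contiguous case amounts to a block copy in sort_list order; one plausible standalone reading (hypothetical helper, not the library routine itself):

#include <string.h>
#include <stddef.h>

/* illustration of the contiguous re-order pack described above: per-rank
 * blocks of equal size are copied from the user buffer into the ML buffer
 * in sort_list order, so rooted operations only re-order once */
static void pack_reorder_contiguous(char *ml_buf, const char *user_buf,
                                    const int *sort_list, int n_ranks,
                                    size_t bytes_per_rank)
{
    for (int i = 0; i < n_ranks; i++) {
        /* the block placed at position i comes from rank sort_list[i] */
        memcpy(ml_buf + (size_t) i * bytes_per_rank,
               user_buf + (size_t) sort_list[i] * bytes_per_rank,
               bytes_per_rank);
    }
}
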
 diff --git a/ompi/mca/coll/ml/coll_ml_allgather.c b/ompi/mca/coll/ml/coll_ml_allgather.c deleted file mode 100644 index a1c71322cd..0000000000 --- a/ompi/mca/coll/ml/coll_ml_allgather.c +++ /dev/null @@ -1,633 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" -#include "opal/sys/atomic.h" -#include "coll_ml.h" -#include "coll_ml_select.h" -#include "coll_ml_allocation.h" - -static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - bool rcontig = coll_op->full_message.recv_data_continguous; - int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm); - - void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + - (uintptr_t)coll_op->full_message.n_bytes_delivered); - void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + - (size_t)coll_op->variable_fn_params.rbuf_offset); - - if (rcontig) { - memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled); - } else { - mca_coll_ml_convertor_unpack(src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled, - &coll_op->fragment_data.message_descriptor->recv_convertor); - } - - return OMPI_SUCCESS; -} - -static inline void copy_data (mca_coll_ml_collective_operation_progress_t *coll_op, rank_properties_t *rank_props, int soffset) { - bool rcontig = coll_op->fragment_data.message_descriptor->recv_data_continguous; - size_t total_bytes = coll_op->fragment_data.message_descriptor->n_bytes_total; - size_t pack_len = coll_op->fragment_data.fragment_size; - int doffset = rank_props->rank; - void *dest, *src; - - src = (void *) ((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + - (size_t)coll_op->variable_fn_params.rbuf_offset + soffset * pack_len); - - if (rcontig) { - dest = (void *) ((uintptr_t) coll_op->full_message.dest_user_addr + - (uintptr_t) coll_op->fragment_data.offset_into_user_buffer + - doffset * total_bytes); - - memcpy(dest, src, pack_len); - } else { - size_t position; - opal_convertor_t *recv_convertor = - &coll_op->fragment_data.message_descriptor->recv_convertor; - - position = (size_t) coll_op->fragment_data.offset_into_user_buffer + - doffset * total_bytes; - - opal_convertor_set_position(recv_convertor, &position); - mca_coll_ml_convertor_unpack(src, pack_len, recv_convertor); - } -} - -static int mca_coll_ml_allgather_noncontiguous_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int i, j, n_level_one_sbgps; - size_t soffset; - - mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info; - sub_group_params_t *array_of_all_subgroup_ranks = topo_info->array_of_all_subgroups; - - n_level_one_sbgps = array_of_all_subgroup_ranks->level_one_index; - - for (i = 0 ; i < n_level_one_sbgps; i++) { - /* determine where in the source buffer the data can be found */ - soffset = array_of_all_subgroup_ranks[i].index_of_first_element; - for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks; j++, ++soffset) { - copy_data (coll_op, array_of_all_subgroup_ranks[i].rank_data + j, soffset); - } - } - - return OMPI_SUCCESS; -} - -/* Allgather dependencies are straightforward: everyone works from the "bottom up". - * Following Pasha, I too will put in the simplest dependency graph and change it later - * when we add hierarchy. Basically, allgather has the same dependency profile as the - * sequential broadcast except that there is only a single ordering of tasks.
- */ -static int mca_coll_ml_allgather_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int fn_idx, h_level, my_index, root; - mca_sbgp_base_module_t *sbgp; - mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; - - fn_idx = coll_op->sequential_routine.current_active_bcol_fn; - h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; - sbgp = topo->component_pairs[h_level]. - subgroup_module; - my_index = sbgp->my_index; - - /* In the case of allgather, the local leader is always the root */ - root = 0; - if (my_index == root) { - coll_op->variable_fn_params.root_flag = true; - coll_op->variable_fn_params.root_route = NULL; - } else { - coll_op->variable_fn_params.root_flag = false; - coll_op->variable_fn_params.root_route = &topo->route_vector[root]; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - int ret; - size_t frag_len, dt_size; - - const void *buf; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; - mca_coll_ml_collective_operation_progress_t *new_op; - - mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); - bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous; - - ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size); - /* Keep the pipeline filled with fragments */ - while (coll_op->fragment_data.message_descriptor->n_active < - coll_op->fragment_data.message_descriptor->pipeline_depth) { - /* If an active fragment happens to have completed the collective during - * a hop into the progress engine, then don't launch a new fragment, - * instead break and return. - */ - if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled - == coll_op->fragment_data.message_descriptor->n_bytes_total) { - break; - } - /* Get an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - if (NULL == src_buffer_desc) { - /* If there exist outstanding fragments, then break out - * and let an active fragment deal with this later, - * there are no buffers available. 
 - */ - if (0 < coll_op->fragment_data.message_descriptor->n_active) { - return OMPI_SUCCESS; - } else { - /* The fragment is already on the list and - * we still have no ML resources; - * return busy */ - if (coll_op->pending & REQ_OUT_OF_MEMORY) { - ML_VERBOSE(10,("Out of resources %p", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), - (opal_list_item_t *)coll_op); - ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - } - - /* Get a new collective descriptor and initialize it */ - new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], - coll_op->fragment_data.message_descriptor->src_user_addr, - coll_op->fragment_data.message_descriptor->dest_user_addr, - coll_op->fragment_data.message_descriptor->n_bytes_total, - coll_op->fragment_data.message_descriptor->n_bytes_scheduled); - - new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; - new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; - - /* set the task setup callback */ - new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; - - /* - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, - src_buffer_desc->buffer_index, src_buffer_desc); - */ - - /* We need this address for pointer arithmetic in memcpy */ - buf = coll_op->fragment_data.message_descriptor->src_user_addr; - - if (!scontig) { - frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER]; - mca_coll_ml_convertor_get_send_frag_size( - ml_module, &frag_len, - coll_op->fragment_data.message_descriptor); - - mca_coll_ml_convertor_pack( - (void *) ((uintptr_t) src_buffer_desc->data_addr + - frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + - frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), - frag_len, &coll_op->fragment_data.message_descriptor->send_convertor); - } else { - /* calculate the new frag length; there are some issues here */ - frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < - coll_op->fragment_data.fragment_size ?
- coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled : - coll_op->fragment_data.fragment_size); - - /* everybody copies in, based on the new values */ - memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr + - frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset + - frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), - (void *) ((uintptr_t) buf + (uintptr_t) - coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len); - } - - new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; - new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; - - /* update the number of bytes scheduled */ - new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; - /* everyone needs an unpack function */ - new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; - - new_op->fragment_data.fragment_size = frag_len; - new_op->fragment_data.buffer_desc = src_buffer_desc; - - /* Setup fragment specific data */ - ++(new_op->fragment_data.message_descriptor->n_active); - - ML_VERBOSE(10, ("Start more, My index %d ", - new_op->fragment_data.buffer_desc->buffer_index)); - - /* this is a bit buggy */ - ML_SET_VARIABLE_PARAMS_BCAST( - new_op, - OP_ML_MODULE(new_op), - frag_len /* yes, we have consistent units, so this makes sense */, - MPI_BYTE /* we fragment according to buffer size - * we don't reduce the data thus we needn't - * keep "whole" datatypes, we may freely - * fragment without regard for multiples - * of any specific datatype - */, - src_buffer_desc, - 0, - 0, - frag_len, - src_buffer_desc->data_addr); - /* initialize first coll */ - ret = new_op->sequential_routine.seq_task_setup(new_op); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(3, ("Fragment failed to initialize itself")); - return ret; - } - - new_op->variable_fn_params.buffer_size = frag_len; - new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; - new_op->variable_fn_params.root = 0; - - MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); - - /* append this collective !! 
 */ - OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - opal_list_append(&mca_coll_ml_component.sequential_collectives, - (opal_list_item_t *)new_op); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ -int mca_coll_ml_allgather_start (const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - ompi_request_t **req) -{ - size_t pack_len, sdt_size; - int ret, n_fragments = 1, comm_size; - - mca_coll_ml_topology_t *topo_info; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; - - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - mca_coll_ml_collective_operation_progress_t *coll_op; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - - ptrdiff_t lb, extent; - bool scontig, rcontig, in_place = false; - - /* check for in place setting */ - if (MPI_IN_PLACE == sbuf) { - in_place = true; - sdtype = rdtype; - scount = rcount; - } - - /* scontig may differ from rcontig */ - scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount); - rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount); - - comm_size = ompi_comm_size(comm); - - ML_VERBOSE(10, ("Starting allgather")); - - assert(NULL != sdtype); - /* Calculate the size of the data; - * at this stage, only contiguous data is supported */ - - /* this is valid for allgather */ - ompi_datatype_type_size(sdtype, &sdt_size); - pack_len = scount * sdt_size; - - if (in_place) { - sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len; - } - - /* Allocate collective schedule and pack message */ - /* this is the total ending message size that will need to fit in the ml-buffer */ - if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) { - /* The length of the message cannot be larger than the ML buffer size */ - ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer)); - assert(pack_len * comm_size <= ml_module->payload_block->size_buffer); - - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - /* change 1 */ - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], - sbuf, rbuf, pack_len, 0 /* offset for first pack */); - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, - src_buffer_desc->buffer_index, src_buffer_desc); - - coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; - /* task setup callback function */ - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; - - /* change 2 */ - if (!scontig) { - coll_op->full_message.n_bytes_scheduled = - mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, - &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); - - mca_coll_ml_convertor_pack( - (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len * - (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + - coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), - pack_len, &coll_op->full_message.send_convertor); - } else { - /* change 3 */ - memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len * - (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + - coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), - sbuf, pack_len);
 - - coll_op->full_message.n_bytes_scheduled = pack_len; - } - - if (!rcontig) { - mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, - &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); - } - - if (coll_op->coll_schedule->topo_info->ranks_contiguous) { - coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data; - } else { - coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; - } - - /* the whole ml-buffer is used to send AND receive */ - coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; - coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; - - /* we can set the initial offset here */ - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = 0; - - coll_op->variable_fn_params.count = scount; - coll_op->fragment_data.fragment_size = - coll_op->full_message.n_bytes_scheduled; - - /* For small CINCO, we may use the native datatype */ - coll_op->variable_fn_params.dtype = sdtype; - coll_op->variable_fn_params.buffer_size = pack_len; - coll_op->variable_fn_params.root = 0; - } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) { - /* calculate the number of fragments and the size of each frag */ - size_t n_dts_per_frag, frag_len; - int pipeline_depth = mca_coll_ml_component.pipeline_depth; - - /* Calculate the number of fragments required for this message; - * be careful - watch the integer division! */ - frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ? - pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]); - - n_dts_per_frag = frag_len / sdt_size; - n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag); - pipeline_depth = (n_fragments < pipeline_depth ?
n_fragments : pipeline_depth); - - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - /* change 4 */ - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], - sbuf, rbuf, pack_len, - 0 /* offset for first pack */); - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, - src_buffer_desc->buffer_index, src_buffer_desc); - topo_info = coll_op->coll_schedule->topo_info; - - /* task setup callback function */ - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; - - if (!scontig) { - coll_op->full_message.send_converter_bytes_packed = - mca_coll_ml_convertor_prepare( - sdtype, scount, NULL, - &coll_op->full_message.dummy_convertor, - MCA_COLL_ML_NET_STREAM_SEND); - - coll_op->full_message.dummy_conv_position = 0; - mca_coll_ml_convertor_get_send_frag_size( - ml_module, &frag_len, - &coll_op->full_message); - - /* change 5 */ - mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, - &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); - - mca_coll_ml_convertor_pack( - (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len * - (topo_info->hier_layout_info[0].offset + - topo_info->hier_layout_info[0].level_one_index)), - frag_len, &coll_op->full_message.send_convertor); - } else { - /* change 6 */ - memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len * - (topo_info->hier_layout_info[0].offset + - topo_info->hier_layout_info[0].level_one_index)), - sbuf, frag_len); - } - - if (!rcontig) { - mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, - &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); - } - - coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; - - /* hopefully this doesn't royally screw things up; the idea is that the - * whole ml-buffer is used to send and receive - */ - coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; - coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; - - /* we can set the initial offset here */ - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = 0; - - coll_op->fragment_data.buffer_desc = src_buffer_desc; - - coll_op->fragment_data.fragment_size = frag_len; - coll_op->fragment_data.message_descriptor->n_active = 1; - - coll_op->full_message.n_bytes_scheduled = frag_len; - coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress; - - coll_op->full_message.pipeline_depth = pipeline_depth; - coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; - - /* remember this is different for frags! It caused data corruption when - * not properly set. Be sure you have consistent units. - */ - coll_op->variable_fn_params.count = frag_len; - coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in - * units of bytes.
This means that - * all of our arithmetic is done - * in terms of bytes - */ - - coll_op->variable_fn_params.root = 0; - coll_op->variable_fn_params.frag_size = frag_len; - coll_op->variable_fn_params.buffer_size = frag_len; - } else { - /* change 7 */ - ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case.")); - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER], - sbuf, rbuf, pack_len, 0 /* offset for first pack */); - topo_info = coll_op->coll_schedule->topo_info; - if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) { - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL); - } else { - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); - } - - /* not sure if I really need this here */ - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; - coll_op->process_fn = NULL; - /* probably the most important piece */ - coll_op->variable_fn_params.sbuf = sbuf; - coll_op->variable_fn_params.rbuf = rbuf; - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = 0; - coll_op->variable_fn_params.count = scount; - coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the - * native datatype and actual count - */ - coll_op->variable_fn_params.root = 0; - - /* you still need to copy in your own data into the rbuf */ - /* don't need to do this if you have in place data */ - if (!in_place) { - memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len); - } - } - - coll_op->full_message.send_count = scount; - coll_op->full_message.recv_count = rcount; - - coll_op->full_message.send_data_continguous = scontig; - coll_op->full_message.recv_data_continguous = rcontig; - - ompi_datatype_get_extent(sdtype, &lb, &extent); - coll_op->full_message.send_extent = (size_t) extent; - - ompi_datatype_get_extent(rdtype, &lb, &extent); - coll_op->full_message.recv_extent = (size_t) extent; - - - /* Fill in the function arguments */ - coll_op->variable_fn_params.sequence_num = - OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); - coll_op->variable_fn_params.hier_factor = comm_size; - - MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); - - - ret = mca_coll_ml_launch_sequential_collective (coll_op); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - *req = &coll_op->full_message.super; - - return OMPI_SUCCESS; -} - -int mca_coll_ml_allgather(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - ompi_request_t *req; - int ret; - - ML_VERBOSE(10, ("Starting blocking allgather")); - - ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module, &req); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - - ret = ompi_request_wait (&req, MPI_STATUS_IGNORE); - - ML_VERBOSE(10, ("Blocking allgather is complete")); - - return ret; -} - -int mca_coll_ml_allgather_nb(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t 
*module) -{ - int ret; - - ML_VERBOSE(10, ("Starting non-blocking allgather")); - - ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module, req); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - - ML_VERBOSE(10, ("Non-blocking allgather started")); - - return ret; -} diff --git a/ompi/mca/coll/ml/coll_ml_allocation.c b/ompi/mca/coll/ml/coll_ml_allocation.c deleted file mode 100644 index ac0ebbebc0..0000000000 --- a/ompi/mca/coll/ml/coll_ml_allocation.c +++ /dev/null @@ -1,213 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "ompi_config.h" -#include - -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_allocation.h" - -mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block(struct mca_coll_ml_component_t *ml_component, - mca_bcol_base_memory_block_desc_t *ml_memblock) -{ - mca_bcol_base_memory_block_desc_t *ret = NULL; - mca_bcol_base_memory_block_desc_t *memory_block = NULL; - mca_coll_ml_lmngr_t *memory_manager = NULL; - - if (ml_memblock) { - ML_ERROR(("Memory already allocated - expecting NULL pointer")); - return ret; - } - memory_block = (mca_bcol_base_memory_block_desc_t*) calloc(1, sizeof(mca_bcol_base_memory_block_desc_t)); - - if (NULL == memory_block){ - ML_ERROR(("Couldn't allocate memory for ml_memblock")); - return ret; - } - - memory_manager = &ml_component->memory_manager; - memory_block->block = mca_coll_ml_lmngr_alloc(memory_manager); - memory_block->size_block = memory_manager->list_block_size; - - if (!memory_block->block){ - ML_VERBOSE(1, ("lmngr failed.")); - free(memory_block); - return NULL; - } - - return memory_block; -} - -void mca_coll_ml_free_block (mca_bcol_base_memory_block_desc_t *ml_memblock) -{ - if (!ml_memblock) - return; - - if (ml_memblock->buffer_descs){ - free(ml_memblock->buffer_descs); - } - - mca_coll_ml_lmngr_free(ml_memblock->block); - free(ml_memblock->bank_release_counters); - free(ml_memblock->ready_for_memsync); - free(ml_memblock->bank_is_busy); - free(ml_memblock); -} - -int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock, - uint32_t num_buffers, - uint32_t num_banks, - uint32_t buffer_size, - int32_t data_offset, - opal_list_t *bcols_in_use) -{ - int ret = OMPI_SUCCESS; - uint32_t bank_loop, buff_loop; - uint64_t addr_offset = 0; - mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,*pbuff_desc = NULL; - - if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) { - return OMPI_ERR_BAD_PARAM; - } - - if (NULL == ml_memblock){ - ML_ERROR(("Memory block not initialized")); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - if (ml_memblock->size_block < (num_buffers * num_banks * buffer_size) ){ - ML_ERROR(("Not enough memory for all buffers and banks in the memory block")); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*) malloc(sizeof(mca_bcol_base_payload_buffer_desc_t) - * num_banks * num_buffers); - if (NULL == pbuff_descs) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for(bank_loop = 0; bank_loop < num_banks; bank_loop++) - for(buff_loop = 0; buff_loop < num_buffers; buff_loop++){ - pbuff_desc = 
&pbuff_descs[bank_loop*num_buffers + buff_loop]; - - pbuff_desc->base_data_addr = (void *) - ((char *)ml_memblock->block->base_addr + addr_offset); - pbuff_desc->data_addr = (void *) - ((char *)pbuff_desc->base_data_addr + (size_t)data_offset); - - addr_offset+=buffer_size; - pbuff_desc->buffer_index = BUFFER_INDEX(bank_loop,num_buffers,buff_loop); - - pbuff_desc->bank_index=bank_loop; - pbuff_desc->generation_number=0; - } - - /* Initialize ml memory block */ - /* gvm FIX:This counter when zero indicates that the bank is ready for - * recycle. This is initialized to number of bcol components as each bcol is responsible for - * releasing the buffers of a bank. This initialization will have - * faulty behavior, example in case of multiple interfaces, when more than - * one bcol module of the component type is in use. - */ - ml_memblock->bank_release_counters = (uint32_t *) calloc(num_banks, sizeof(uint32_t)); - if (NULL == ml_memblock->bank_release_counters) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - ml_memblock->ready_for_memsync = (bool *) calloc(num_banks, sizeof(bool)); - if (NULL == ml_memblock->ready_for_memsync) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - ml_memblock->bank_is_busy = (bool *) calloc(num_banks, sizeof(bool)); - if (NULL == ml_memblock->bank_is_busy) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* Set index for first bank to sync */ - ml_memblock->memsync_counter = 0; - - /* use first bank and first buffer */ - ml_memblock->next_free_buffer = 0; - - ml_memblock->block_addr_offset = addr_offset; - ml_memblock->num_buffers_per_bank = num_buffers; - ml_memblock->num_banks = num_banks; - ml_memblock->size_buffer = buffer_size; - ml_memblock->buffer_descs = pbuff_descs; - - return ret; - -exit_ERROR: - /* Free all buffer descriptors */ - if (pbuff_descs){ - free(pbuff_descs); - } - - return ret; -} - -mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer (mca_coll_ml_module_t *module) -{ - uint64_t bindex; - uint32_t bank, buffer, num_buffers; - mca_bcol_base_memory_block_desc_t *ml_memblock = module->payload_block; - mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL, - *ml_membuffer = NULL; - - /* Return a buffer */ - num_buffers = ml_memblock->num_buffers_per_bank; - pbuff_descs = ml_memblock->buffer_descs; - bindex = ml_memblock->next_free_buffer; - buffer = bindex % num_buffers; - bank = bindex/num_buffers; - - ML_VERBOSE(10, ("ML allocator: allocating buffer index %d, bank index %d", buffer, bank)); - - /* First buffer in bank, use next bank */ - if (0 == buffer) { - if(!ml_memblock->bank_is_busy[bank]) { - /* the bank is free, mark it busy */ - ml_memblock->bank_is_busy[bank] = true; - ML_VERBOSE(10, ("ML allocator: reset bank %d to value %d", bank, - ml_memblock->bank_release_counters[bank])); - } else { - /* the bank is busy, return NULL and upper layer will handle it */ - ML_VERBOSE(10, ("No free payload buffers are available for use." - " Next memory bank is still used by one of bcols")); - return NULL; - } - } - - assert(true == ml_memblock->bank_is_busy[bank]); - - ml_membuffer = &pbuff_descs[bindex]; - ML_VERBOSE(10, ("ML allocator: ml buffer index %d", bindex)); - - /* Compute next free buffer */ - buffer = (buffer == num_buffers - 1) ? 0 : buffer + 1; - if (0 == buffer) { - bank = (bank == ml_memblock->num_banks - 1) ? 
0 : bank + 1; - } - - ml_memblock->next_free_buffer = BUFFER_INDEX(bank,num_buffers,buffer); - - return ml_membuffer; -} diff --git a/ompi/mca/coll/ml/coll_ml_allocation.h b/ompi/mca/coll/ml/coll_ml_allocation.h deleted file mode 100644 index 7bb7f63242..0000000000 --- a/ompi/mca/coll/ml/coll_ml_allocation.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_ML_ALLOC_H -#define MCA_ML_ALLOC_H - -#include "ompi_config.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/base/base.h" -#include "opal/sys/atomic.h" -#include "opal/mca/mpool/base/base.h" -#include "coll_ml_lmngr.h" - -/* - Returns a block of memory from mpool - - ARGS: - IN ml_component: component descriptor - OUT ml_memblock: block_addr - Starting address of the memory block - size - Size of the block - register_info - Register information passed from the mpool - - Return - On Sucess : Returns size of memory block - On Failure: Returns -1 - - */ - -struct mca_coll_ml_component_t; -struct mca_coll_ml_module_t; - -mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block( - struct mca_coll_ml_component_t *ml_component, - struct mca_bcol_base_memory_block_desc_t *ml_memblock - ); - /* Allocate the memory from mpool */ - /* Register the memory block with bcols */ - -void mca_coll_ml_free_block( - mca_bcol_base_memory_block_desc_t *ml_memblock - ); - - - - -/* - Initialize the memory block and map into buffers and memory banks, and - also buffer descriptors are initialized. - - IN ml_memblock: Memory block descriptor - IN num_buffers: number of buffers - IN num_banks: number of banks - Return - On Sucess: OMPI_SUCCESS - On Failure: OMPI_ERROR - */ -int mca_coll_ml_initialize_block( - mca_bcol_base_memory_block_desc_t *ml_memblock, - uint32_t num_buffers, - uint32_t num_banks, - uint32_t buffer_size, - int32_t data_offset, - opal_list_t *bcols_in_use - ); - /* Map blocks into buffers and banks */ - /* Initialize the descriptors */ - - - -/* - Allocate a memory buffer from the block - IN ml_memblock: Memory block descriptor - OUT ml_membuffer: Buffer allocated for data from the block - - Return - On Sucess: OMPI_SUCCESS - On Failure: OMPI_ERROR - */ -mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer( - struct mca_coll_ml_module_t *module); - -int mca_coll_ml_free_buffer( - mca_bcol_base_memory_block_desc_t *ml_memblock, - struct mca_bcol_base_payload_buffer_desc_t *ml_membuffer - ); - -/* - Register the memory block with bcol component - - IN ml_memblock: Memory block descriptor - OUT registerations (ml_memblock) - - Return - On Sucess: OMPI_SUCCESS - On Failure: OMPI_ERROR - - */ -int mca_coll_ml_register_block_bcol( - mca_bcol_base_memory_block_desc_t *ml_memblock - ); - -#endif /* MCA_ML_ALLOC_H */ diff --git a/ompi/mca/coll/ml/coll_ml_allreduce.c b/ompi/mca/coll/ml/coll_ml_allreduce.c deleted file mode 100644 index 85457254b8..0000000000 --- a/ompi/mca/coll/ml/coll_ml_allreduce.c +++ /dev/null @@ -1,553 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. 
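/*
 * A minimal, self-contained model of the bank/buffer cycling implemented by
 * mca_coll_ml_alloc_buffer() above: payload memory is carved into num_banks
 * banks of num_buffers_per_bank buffers, handed out buffer-first, and a bank
 * may only be re-entered once it has been recycled. The struct and names
 * below are sketch stand-ins, not the real coll/ml types; the real allocator
 * also tracks generation numbers, release counters and memsync state.
 */
#include <stdbool.h>

#define SKETCH_BUFFER_INDEX(bank, nbuf, buf) ((bank) * (nbuf) + (buf))

struct sketch_block {
    unsigned num_banks;
    unsigned num_buffers_per_bank;
    unsigned next_free;        /* flat index of the next candidate buffer */
    bool bank_is_busy[8];      /* sized arbitrarily for the sketch */
};

/* Returns a flat buffer index, or -1 when the next bank is still busy
 * (the caller is expected to progress and retry, as the code above does). */
static int sketch_alloc(struct sketch_block *b)
{
    unsigned nbuf = b->num_buffers_per_bank;
    unsigned buf  = b->next_free % nbuf;
    unsigned bank = b->next_free / nbuf;
    int index = (int) b->next_free;

    /* First buffer of a bank: the whole bank must be idle before reuse */
    if (0 == buf) {
        if (b->bank_is_busy[bank]) {
            return -1;
        }
        b->bank_is_busy[bank] = true;
    }

    /* Advance buffer-first, wrapping into the next bank */
    buf = (buf == nbuf - 1) ? 0 : buf + 1;
    if (0 == buf) {
        bank = (bank == b->num_banks - 1) ? 0 : bank + 1;
    }
    b->next_free = SKETCH_BUFFER_INDEX(bank, nbuf, buf);

    return index;
}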
All rights - * reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" -#include "opal/sys/atomic.h" -#include "coll_ml.h" -#include "coll_ml_select.h" -#include "coll_ml_allocation.h" - -static int mca_coll_ml_allreduce_small_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int ret; - /* need to put in more */ - int count = coll_op->variable_fn_params.count; - ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; - - void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + - (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); - void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + - (size_t)coll_op->variable_fn_params.rbuf_offset); - - ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, - (char *) src); - if (ret < 0) { - return OMPI_ERROR; - } - - ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, rbuf addr %p, rbuf offset %d.", - src, coll_op->variable_fn_params.sbuf_offset, dest, - coll_op->variable_fn_params.rbuf_offset)); - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_allreduce_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int fn_idx, h_level, my_index, root; - mca_sbgp_base_module_t *sbgp; - mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; - - fn_idx = coll_op->sequential_routine.current_active_bcol_fn; - h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; - sbgp = topo->component_pairs[h_level].subgroup_module; - my_index = sbgp->my_index; - - /* In the case of allreduce, the local leader is always the root */ - root = 0; - if (my_index == root) { - coll_op->variable_fn_params.root_flag = true; - coll_op->variable_fn_params.root_route = NULL; - } else { - coll_op->variable_fn_params.root_flag = false; - coll_op->variable_fn_params.root_route = &topo->route_vector[root]; - } - - /* NTH: This was copied from the old allreduce launcher. */ - if (0 < fn_idx) { - coll_op->variable_fn_params.sbuf = coll_op->variable_fn_params.rbuf; - coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.rbuf; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_allreduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - const void *buf; - - size_t dt_size; - int ret, frag_len, count; - - ptrdiff_t lb, extent; - - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; - mca_coll_ml_collective_operation_progress_t *new_op; - - mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); - - ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); - if (ret < 0) { - return OMPI_ERROR; - } - - dt_size = (size_t) extent; - - /* Keep the pipeline filled with fragments */ - while (coll_op->fragment_data.message_descriptor->n_active < - coll_op->fragment_data.message_descriptor->pipeline_depth) { - /* If an active fragment happens to have completed the collective during - * a hop into the progress engine, then don't launch a new fragment, - * instead break and return. 
- */ - if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled - == coll_op->fragment_data.message_descriptor->n_bytes_total) { - break; - } - - /* Get an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); - if (NULL == src_buffer_desc) { - /* If there exist outstanding fragments, then break out - * and let an active fragment deal with this later, - * there are no buffers available. - */ - if (0 < coll_op->fragment_data.message_descriptor->n_active) { - return OMPI_SUCCESS; - } - - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ - - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), - (opal_list_item_t *)coll_op); - ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); - } else { - ML_VERBOSE(10,("Out of resources %p", coll_op)); - } - - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - /* Get a new collective descriptor and initialize it */ - new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allreduce_functions[coll_op->fragment_data.current_coll_op], - coll_op->fragment_data.message_descriptor->src_user_addr, - coll_op->fragment_data.message_descriptor->dest_user_addr, - coll_op->fragment_data.message_descriptor->n_bytes_total, - coll_op->fragment_data.message_descriptor->n_bytes_scheduled); - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, - src_buffer_desc->buffer_index, src_buffer_desc); - - new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; - new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; - - /* set the task setup callback */ - new_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; - /* We need this address for pointer arithmetic in memcpy */ - buf = coll_op->fragment_data.message_descriptor->src_user_addr; - /* calculate the number of data types in this packet */ - count = (coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < - (size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_ALLREDUCE] ? 
- (coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : - (size_t) coll_op->variable_fn_params.count); - - /* calculate the fragment length */ - frag_len = count*dt_size; - - ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, - (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) - coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); - if (ret < 0) { - return OMPI_ERROR; - } - - /* No unpack for root */ - new_op->process_fn = mca_coll_ml_allreduce_small_unpack; - - /* Setup fragment specific data */ - new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; - new_op->fragment_data.buffer_desc = src_buffer_desc; - new_op->fragment_data.fragment_size = frag_len; - (new_op->fragment_data.message_descriptor->n_active)++; - - ML_SET_VARIABLE_PARAMS_BCAST( - new_op, - OP_ML_MODULE(new_op), - count, - MPI_BYTE, - src_buffer_desc, - 0, - 0, - frag_len, - src_buffer_desc->data_addr); - /* Fill in bcast specific arguments */ - /* TBD: remove buffer_size */ - new_op->variable_fn_params.buffer_size = frag_len; - new_op->variable_fn_params.count = count; - new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; - new_op->variable_fn_params.op = coll_op->variable_fn_params.op; - new_op->variable_fn_params.dtype = coll_op->variable_fn_params.dtype; - new_op->variable_fn_params.root = 0; - new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; - new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; - new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - - MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); - - ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d", - new_op->variable_fn_params.buffer_size, - new_op->fragment_data.fragment_size, - new_op->fragment_data.message_descriptor->n_bytes_scheduled)); - /* initialize first coll */ - ret = new_op->sequential_routine.seq_task_setup(new_op); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(3,("Fragment failed to initialize itself")); - return ret; - } - - /* append this collective !! 
*/ - OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - opal_list_append(&mca_coll_ml_component.sequential_collectives, - (opal_list_item_t *)new_op); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ -int parallel_allreduce_start(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_ml_module_t *ml_module, - ompi_request_t **req, - int small_data_allreduce, - int large_data_allreduce) -{ - int ret, n_fragments = 1, frag_len, - pipeline_depth, n_dts_per_frag; - - ptrdiff_t lb, extent; - size_t pack_len, dt_size; - - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; - mca_coll_ml_collective_operation_progress_t *coll_op; - - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); - - if (MPI_IN_PLACE == sbuf) { - sbuf = rbuf; - } - - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (ret < 0) { - return OMPI_ERROR; - } - - dt_size = (size_t) extent; - pack_len = count * dt_size; - - ML_VERBOSE(1,("Allreduce requested %d bytes, fragmentation enabled %d", - pack_len, - cm->enable_fragmentation)); - if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) { - /* The message length cannot be larger than the ML buffer size */ - assert(pack_len <= ml_module->payload_block->size_buffer); - - ML_VERBOSE(1,("Using small data allreduce (threshold = %d)", - ml_module->small_message_thresholds[BCOL_ALLREDUCE])); - - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (OPAL_UNLIKELY(NULL == src_buffer_desc)) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allreduce_functions[small_data_allreduce], - sbuf, rbuf, pack_len, 0); - - coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; - coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; - coll_op->variable_fn_params.count = count; - - ret = ompi_datatype_copy_content_same_ddt(dtype, count, - (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); - if (ret < 0) { - return OMPI_ERROR; - } - - /* unpack function */ - coll_op->process_fn = mca_coll_ml_allreduce_small_unpack; - } else if (cm->enable_fragmentation || !contiguous) { - ML_VERBOSE(1,("Using Fragmented Allreduce")); - - /* fragment the data */ - /* reject datatypes too large to fit even one element per fragment */ - if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) { - ML_ERROR(("Datatype size exceeds the ML fragmentation threshold")); - return OMPI_ERROR; - } - - /* calculate the number of data types that can fit per ml-buffer */ - n_dts_per_frag = ml_module->small_message_thresholds[BCOL_ALLREDUCE] / dt_size; - - /* calculate the number of fragments */ - n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ - - /* calculate the actual pipeline depth */ - pipeline_depth = n_fragments < cm->pipeline_depth ?
n_fragments : cm->pipeline_depth; - - /* calculate the fragment size */ - frag_len = n_dts_per_frag * dt_size; - - /* allocate an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allreduce_functions[small_data_allreduce], - sbuf, rbuf, pack_len, 0 /* offset for first pack */); - - /* task setup callback function */ - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; - - coll_op->process_fn = mca_coll_ml_allreduce_small_unpack; - - coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; - coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; - - coll_op->fragment_data.message_descriptor->n_active = 1; - coll_op->full_message.n_bytes_scheduled = frag_len; - coll_op->full_message.fragment_launcher = mca_coll_ml_allreduce_frag_progress; - coll_op->full_message.pipeline_depth = pipeline_depth; - coll_op->fragment_data.current_coll_op = small_data_allreduce; - coll_op->fragment_data.fragment_size = frag_len; - - coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ - coll_op->variable_fn_params.buffer_size = frag_len; - - /* copy into the ml-buffer */ - ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, - (char *) src_buffer_desc->data_addr, (char *) sbuf); - if (ret < 0) { - return OMPI_ERROR; - } - } else { - ML_VERBOSE(1,("Using zero-copy ptp allreduce")); - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_allreduce_functions[large_data_allreduce], - sbuf, rbuf, pack_len, 0); - - coll_op->variable_fn_params.userbuf = - coll_op->variable_fn_params.sbuf = sbuf; - - coll_op->variable_fn_params.rbuf = rbuf; - - /* The ML buffer is used for testing. 
Later, when we - * switch to use knem/mmap/portals this should be replaced - * appropriately - */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op->variable_fn_params.count = count; - } - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, - src_buffer_desc); - - /* set the offset */ - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = 0; - - /* Fill in the function arguments */ - coll_op->variable_fn_params.sequence_num = - OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); - coll_op->sequential_routine.current_active_bcol_fn = 0; - coll_op->variable_fn_params.dtype = dtype; - coll_op->variable_fn_params.op = op; - coll_op->variable_fn_params.root = 0; - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; /* invoked after each level in sequential - * progress call - */ - MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); - - ret = mca_coll_ml_launch_sequential_collective (coll_op); - if (ret != OMPI_SUCCESS) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - *req = &coll_op->full_message.super; - - return OMPI_SUCCESS; -} - -int mca_coll_ml_allreduce(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; - ompi_request_t *req; - int ret; - - if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { - /* coll/ml does not handle non-communative operations at this time. fallback - * on another collective module */ - return ml_module->fallback.coll_allreduce (sbuf, rbuf, count, dtype, op, comm, - ml_module->fallback.coll_allreduce_module); - } - - ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm, - (mca_coll_ml_module_t *) module, &req, - ML_SMALL_DATA_ALLREDUCE, - ML_LARGE_DATA_ALLREDUCE); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_ERROR(("Failed to launch")); - return ret; - } - - ompi_request_wait_completion(req); - ompi_request_free(&req); - - ML_VERBOSE(10, ("Blocking NB allreduce is done")); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_allreduce_nb(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) -{ - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; - int ret; - - if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { - /* coll/ml does not handle non-communative operations at this time. 
fallback - * on another collective module */ - return ml_module->fallback.coll_iallreduce (sbuf, rbuf, count, dtype, op, comm, req, - ml_module->fallback.coll_iallreduce_module); - } - - ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm, - (mca_coll_ml_module_t *) module, req, - ML_SMALL_DATA_ALLREDUCE, - ML_LARGE_DATA_ALLREDUCE); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_ERROR(("Failed to launch")); - return ret; - } - - ML_VERBOSE(10, ("Blocking NB allreduce is done")); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_allreduce_dispatch(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - struct ompi_communicator_t *comm, mca_coll_base_module_t *module) -{ - int rc; - bool use_extra_topo; - ompi_request_t *req; - - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - - use_extra_topo = (count > 1) ? - !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] : - !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE]; - - if (use_extra_topo) { - rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, - op, comm, ml_module, &req, - ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE, - ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE); - } else { - rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, - op, comm, ml_module, &req, - ML_SMALL_DATA_ALLREDUCE, - ML_LARGE_DATA_ALLREDUCE); - } - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("Failed to launch")); - return rc; - } - - ompi_request_wait_completion(req); - ompi_request_free(&req); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_allreduce_dispatch_nb(const void *sbuf, void *rbuf, int count, - ompi_datatype_t *dtype, ompi_op_t *op, - ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) -{ - int rc; - bool use_extra_topo; - - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - - use_extra_topo = (count > 1) ? - !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] : - !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE]; - - if (use_extra_topo) { - rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, - op, comm, ml_module, req, - ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE, - ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE); - } else { - rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, - op, comm, ml_module, req, - ML_SMALL_DATA_ALLREDUCE, - ML_LARGE_DATA_ALLREDUCE); - } - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("Failed to launch")); - return rc; - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_barrier.c b/ompi/mca/coll/ml/coll_ml_barrier.c deleted file mode 100644 index 6748d30054..0000000000 --- a/ompi/mca/coll/ml/coll_ml_barrier.c +++ /dev/null @@ -1,146 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/coll/coll.h" -#include "opal/sys/atomic.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" - -static void mca_coll_ml_barrier_task_setup( - mca_coll_ml_task_status_t *task_status, - int index, mca_coll_ml_compound_functions_t *func) -{ - task_status->rt_num_dependencies = func->num_dependencies; - task_status->rt_num_dependent_tasks = func->num_dependent_tasks; - task_status->rt_dependent_task_indices = func->dependent_task_indices; -} - -static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module, - ompi_request_t **req) -{ - opal_free_list_item_t *item; - mca_coll_ml_collective_operation_progress_t *coll_op; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - - /* allocate an ml buffer for signaling purposes */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - - /* Blocking call on fragment allocation (Maybe we want to make it non blocking ?) */ - item = opal_free_list_wait (&(ml_module->coll_ml_collective_descriptors)); - - coll_op = (mca_coll_ml_collective_operation_progress_t *) item; - assert(NULL != coll_op); - - ML_VERBOSE(10, ("Get coll request %p", coll_op)); - - MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL, ml_module->coll_ml_barrier_function); - - coll_op->fragment_data.buffer_desc = src_buffer_desc; - coll_op->dag_description.num_tasks_completed = 0; - - coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; - - coll_op->variable_fn_params.sequence_num = - OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); - - /* Pointer to a coll finalize function */ - coll_op->process_fn = NULL; - - (*req) = &coll_op->full_message.super; - - OMPI_REQUEST_INIT((*req), false); - - (*req)->req_status._cancelled = 0; - (*req)->req_state = OMPI_REQUEST_ACTIVE; - (*req)->req_status.MPI_ERROR = OMPI_SUCCESS; - - /* Set order info if there is a bcol needs ordering */ - MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1); - - return mca_coll_ml_generic_collectives_launcher(coll_op, mca_coll_ml_barrier_task_setup); -} - -/** - * Hierarchical blocking barrier - */ -int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - int rc; - ompi_request_t *req; - - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - -#if OPAL_ENABLE_DEBUG - static int barriers_count = 0; -#endif - - ML_VERBOSE(10, ("Barrier num %d start.", ++barriers_count)); - - rc = mca_coll_ml_barrier_launch(ml_module, &req); - if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { - ML_ERROR(("Failed to launch a barrier.")); - return rc; - } - - /* Blocking barrier */ - ompi_request_wait_completion(req); - ompi_request_free(&req); - - ML_VERBOSE(10, ("Barrier num %d was done.", barriers_count)); - - return OMPI_SUCCESS; -} - -/** - * Hierarchical non-blocking barrier - */ -int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) -{ - int rc; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - -#if OPAL_ENABLE_DEBUG - static int barriers_count = 0; -#endif - - ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count)); - 
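/*
 * The blocking buffer-allocation idiom used by mca_coll_ml_barrier_launch()
 * above, and wrapped by the ML_BUFFER_ALLOC_WAIT macro in coll_ml_bcast.c:
 * spin on opal_progress() until the ML allocator can hand back a payload
 * buffer (progress lets an in-flight memsync recycle a bank). A hedged
 * sketch of the idiom as a helper, assuming the coll_ml headers are in
 * scope; coll/ml itself repeats the loop inline, and no function with this
 * name exists in the tree.
 */
static inline mca_bcol_base_payload_buffer_desc_t *
sketch_alloc_buffer_blocking(mca_coll_ml_module_t *ml_module)
{
    mca_bcol_base_payload_buffer_desc_t *desc = mca_coll_ml_alloc_buffer(ml_module);

    while (NULL == desc) {
        opal_progress();   /* drive completions so a bank can be recycled */
        desc = mca_coll_ml_alloc_buffer(ml_module);
    }

    return desc;
}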
- rc = mca_coll_ml_barrier_launch(ml_module, req); - if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { - ML_ERROR(("Failed to launch a barrier.")); - return rc; - } - - ML_VERBOSE(10, ("IBarrier num %d was done.", barriers_count)); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_bcast.c b/ompi/mca/coll/ml/coll_ml_bcast.c deleted file mode 100644 index 891838f944..0000000000 --- a/ompi/mca/coll/ml/coll_ml_bcast.c +++ /dev/null @@ -1,849 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include -#include - -#include "opal/threads/mutex.h" -#include "opal/sys/atomic.h" - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" - -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_colls.h" -#include "coll_ml_allocation.h" - -#define ML_BUFFER_ALLOC_WAIT(ml, buffer) \ -do { \ - buffer = mca_coll_ml_alloc_buffer(ml); \ - while (NULL == buffer) { \ - opal_progress(); \ - buffer = mca_coll_ml_alloc_buffer(ml); \ - } \ -} while (0) - -#define COLL_ML_SETUP_ORDERING_INFO(op, last, prev) \ -do { \ - /* Don't change order of commands !!!! */ \ - (op)->prev_frag = prev; \ - (op)->fragment_data.message_descriptor->last_started_frag = last; \ - /* op->next_to_process_frag = NULL; */ \ -} while (0) - -#define ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, op, coll_index, root, \ - total_len, frag_len, buf, ml_buff_desc) \ -do { \ - op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, \ - ml_module->coll_ml_bcast_functions[coll_index], \ - buf, buf, \ - total_len, \ - 0 /* offset for first pack */); \ - if (OPAL_LIKELY(frag_len > 0)) { \ - if (ompi_comm_rank(ml_module->comm) == root) { \ - /* single frag, pack the data */ \ - memcpy((void *)(uintptr_t)(ml_buff_desc)->data_addr, \ - buf, frag_len); \ - /* No unpack for root */ \ - op->process_fn = NULL; \ - } else { \ - op->process_fn = mca_coll_ml_bcast_small_unpack_data; \ - } \ - } \ - op->full_message.n_bytes_scheduled = frag_len; \ -} while (0) - -#define SMALL_BCAST 0 -#define LARGE_BCAST (SMALL_BCAST + 1) - -/* bcast data unpack */ -static int mca_coll_ml_bcast_converter_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = 0; - - mca_coll_ml_collective_operation_progress_t *next_op; - mca_coll_ml_module_t *ml_module = - (mca_coll_ml_module_t *) coll_op->coll_module; - - size_t max_index = - ml_module->payload_block->num_banks * ml_module->payload_block->num_buffers_per_bank; - - bool is_first = true; - int ret; - - /* Check if the fragment delivered in order */ - if (coll_op->fragment_data.buffer_desc->buffer_index != - coll_op->fragment_data.message_descriptor->next_expected_index) { - mca_coll_ml_collective_operation_progress_t *prev_coll_op = coll_op->prev_frag; - assert(NULL == prev_coll_op->next_to_process_frag); - /* make sure that previous process will have pointer to the out - of order process */ - prev_coll_op->next_to_process_frag = coll_op; - assert(!(coll_op->pending & REQ_OUT_OF_ORDER)); - coll_op->pending |= REQ_OUT_OF_ORDER; - /* we will unpack it later */ - 
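/*
 * A simplified model of the in-order delivery chain used by
 * mca_coll_ml_bcast_converter_unpack_data() below: a fragment that arrives
 * with an unexpected buffer index is parked on its predecessor's
 * next_to_process_frag pointer, and every in-order unpack then drains the
 * chain of parked successors. The types and names below are sketch
 * stand-ins for the coll/ml descriptors, not the real structures.
 */
#include <stddef.h>

struct sketch_frag {
    unsigned buffer_index;
    struct sketch_frag *next_to_process;   /* parked out-of-order successor */
};

/* Unpacks 'frag' and any parked successors that are now in order;
 * returns the number of fragments delivered. */
static unsigned sketch_deliver_in_order(struct sketch_frag *frag,
                                        unsigned *next_expected,
                                        unsigned max_index)
{
    unsigned delivered = 0;

    if (frag->buffer_index != *next_expected) {
        return 0;   /* out of order: caller parks it on its predecessor */
    }

    do {
        struct sketch_frag *next = frag->next_to_process;
        frag->next_to_process = NULL;

        /* ... unpack this fragment's payload here ... */
        delivered++;

        /* advance the expected buffer index, wrapping at max_index */
        if (++(*next_expected) >= max_index) {
            *next_expected = 0;
        }

        frag = next;
    } while (NULL != frag);

    return delivered;
}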
ML_VERBOSE(10, ("Get %d expecting %d previous %d", - coll_op->fragment_data.buffer_desc->buffer_index, - coll_op->fragment_data.message_descriptor->next_expected_index, - prev_coll_op->fragment_data.buffer_desc->buffer_index)); - return ORTE_ERR_NO_MATCH_YET; - } - - do { - iov.iov_len = coll_op->fragment_data.fragment_size; - iov.iov_base = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr); - - ML_VERBOSE(10, ("Data unpack with convertern index %d", - coll_op->fragment_data.buffer_desc->buffer_index)); - - opal_convertor_unpack(&coll_op->fragment_data.message_descriptor->recv_convertor, - &iov, &iov_count, &max_data); - - /* update next index */ - ++coll_op->fragment_data.message_descriptor->next_expected_index; - if (coll_op->fragment_data.message_descriptor->next_expected_index >= max_index) { - coll_op->fragment_data.message_descriptor->next_expected_index = 0; - } - - /* Return to queue if the packet is done, - the exeption is first packet, we release it later. - */ - next_op = coll_op->next_to_process_frag; - coll_op->next_to_process_frag = NULL; - if ((!is_first) && - (0 != coll_op->fragment_data.offset_into_user_buffer)) { - assert(coll_op->pending & REQ_OUT_OF_ORDER); - coll_op->pending ^= REQ_OUT_OF_ORDER; - /* Pasha: On one hand - I'm not sure that conceptually it is right place to call buffer recycling. Potentially, - coll_ml_fragment_completion_processing() sounds like right place for out of order unpack/sync handling. - * On the other hand - non contiguous data is not supper common and we would like to minimize effect on critical pass - * for non contiguous data types. */ - ret = mca_coll_ml_buffer_recycling(coll_op); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return OMPI_ERROR; - } - - CHECK_AND_RECYCLE(coll_op); - } - - coll_op = next_op; - is_first = false; - } while (NULL != coll_op); - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_bcast_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - void * dest = (void *)((uintptr_t) coll_op->full_message.dest_user_addr + - (uintptr_t) coll_op->full_message.n_bytes_delivered); - void * src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr); - - memcpy(dest, src, coll_op->fragment_data.fragment_size); - return OMPI_SUCCESS; -} - -static int mca_coll_ml_bcast_large_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - void * dest = (void *)((uintptr_t) coll_op->fragment_data.message_descriptor->dest_user_addr + - (uintptr_t) coll_op->fragment_data.offset_into_user_buffer); - void * src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr); - - memcpy(dest, src, coll_op->fragment_data.fragment_size); - return OMPI_SUCCESS; -} - -static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - int ret, frag_len; - size_t max_data = 0; - - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - mca_coll_ml_collective_operation_progress_t *new_op = NULL; - mca_coll_ml_task_setup_fn_t task_setup = NULL; - mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); - - /* Keep the pipeline filled with fragments */ - while (coll_op->fragment_data.message_descriptor->n_active < - mca_coll_ml_component.pipeline_depth) { - /* If an active fragment happens to have completed the collective during - * a hop into the progress engine, then don't launch a new fragment, - * instead break and return. 
- */ - if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled - == coll_op->fragment_data.message_descriptor->n_bytes_total) { - break; - } - - /* Get an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - if (OPAL_UNLIKELY(NULL == src_buffer_desc)) { - /* If there exist outstanding fragments, then break out - * and let an active fragment deal with this later, - * there are no buffers available. - */ - if (0 < coll_op->fragment_data.message_descriptor->n_active) { - return OMPI_SUCCESS; - } - - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ - - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&ml_module->waiting_for_memory_list, - (opal_list_item_t *)coll_op); - } - - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - /* Get a new collective descriptor and initialize it */ - new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag - (ml_module, coll_op); - /* We need this address for pointer arithmetic in memcpy */ - frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST); - /* Decide based on global flag, not variable one */ - if (coll_op->fragment_data.message_descriptor->root) { - struct iovec iov; - uint32_t iov_count = 1; - - /* OBJ_RETAIN(new_op->variable_fn_params.dtype); */ - iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr; - iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST]; - assert(0 != iov.iov_len); - - max_data = ml_module->small_message_thresholds[BCOL_BCAST]; - opal_convertor_pack(&new_op->fragment_data.message_descriptor->send_convertor, - &iov, &iov_count, &max_data); - - new_op->process_fn = NULL; - new_op->variable_fn_params.root_flag = true; - new_op->variable_fn_params.root_route = NULL; - - task_setup = OP_ML_MODULE(new_op)-> - coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]-> - task_setup_fn[COLL_ML_ROOT_TASK_FN]; - } else { - new_op->process_fn = mca_coll_ml_bcast_converter_unpack_data; - new_op->variable_fn_params.root_flag = false; - new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; - - task_setup = OP_ML_MODULE(new_op)-> - coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]-> - task_setup_fn[COLL_ML_GENERAL_TASK_FN]; - - max_data = ml_module->small_message_thresholds[BCOL_BCAST]; - mca_coll_ml_convertor_get_send_frag_size( - ml_module, &max_data, - new_op->fragment_data.message_descriptor); - } - - new_op->fragment_data.message_descriptor->n_bytes_scheduled += max_data; - new_op->fragment_data.fragment_size = max_data; - new_op->fragment_data.buffer_desc = src_buffer_desc; - - /* Setup fragment specific data */ - ++(new_op->fragment_data.message_descriptor->n_active); - - COLL_ML_SETUP_ORDERING_INFO(new_op, new_op, - new_op->fragment_data.message_descriptor->last_started_frag); - ML_VERBOSE(10, ("Start more, My index %d my prev %d", - new_op->fragment_data.buffer_desc->buffer_index, - new_op->prev_frag->fragment_data.buffer_desc->buffer_index)); - - ML_SET_VARIABLE_PARAMS_BCAST( - new_op, - OP_ML_MODULE(new_op), - frag_len, - MPI_BYTE, - src_buffer_desc, - 0, - 0, - frag_len, - src_buffer_desc->data_addr); - - /* TBD: remove buffer_size */ - new_op->variable_fn_params.buffer_size = 
coll_op->variable_fn_params.buffer_size; - new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; - - /* Set order info for new frag if there is a bcol needs ordering */ - MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); - - /* Launch this collective !! */ - ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_ERROR(("Failed to launch")); - return ret; - } - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_bcast_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - int ret; - int frag_len, current_coll_op = coll_op->fragment_data.current_coll_op; - size_t dt_size; - void *buf; - - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - mca_coll_ml_collective_operation_progress_t *new_op = NULL; - mca_coll_ml_task_setup_fn_t task_setup = NULL; - - ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size); - - /* Keep the pipeline filled with fragments */ - while (coll_op->fragment_data.message_descriptor->n_active < - coll_op->fragment_data.message_descriptor->pipeline_depth) { - /* If an active fragment happens to have completed the collective during - * a hop into the progress engine, then don't launch a new fragment, - * instead break and return. - */ - if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled - == coll_op->fragment_data.message_descriptor->n_bytes_total) { - break; - } - - /* Get an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); - if (NULL == src_buffer_desc) { - /* If there exist outstanding fragments, then break out - * and let an active fragment deal with this later, - * there are no buffers available. - */ - if (0 < coll_op->fragment_data.message_descriptor->n_active) { - return OMPI_SUCCESS; - } - - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. 
So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ - - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { - ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), - (opal_list_item_t *) coll_op); - } else { - ML_VERBOSE(10,("Out of resources %p", coll_op)); - } - - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - /* Get a new collective descriptor and initialize it */ - new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag - (OP_ML_MODULE(coll_op), coll_op); - /* We need this address for pointer arithmetic in memcpy */ - buf = coll_op->fragment_data.message_descriptor->dest_user_addr; - frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST); - - /* Decide based on global flag, not variable one */ - if (coll_op->fragment_data.message_descriptor->root) { - memcpy((void *)(uintptr_t)src_buffer_desc->data_addr, - (void *) ((uintptr_t) buf + (uintptr_t) coll_op-> - fragment_data.message_descriptor->n_bytes_scheduled) , frag_len); - - /* No unpack for root */ - new_op->process_fn = NULL; - new_op->variable_fn_params.root_flag = true; - new_op->variable_fn_params.root_route = NULL; - task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]-> - task_setup_fn[COLL_ML_ROOT_TASK_FN]; - - } else { - new_op->process_fn = mca_coll_ml_bcast_large_unpack_data; - new_op->variable_fn_params.root_flag = false; - new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; - task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]-> - task_setup_fn[COLL_ML_GENERAL_TASK_FN]; - } - - /* Setup fragment specific data */ - new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; - new_op->fragment_data.buffer_desc = src_buffer_desc; - new_op->fragment_data.fragment_size = frag_len; - new_op->fragment_data.message_descriptor->n_active++; - - ML_SET_VARIABLE_PARAMS_BCAST( - new_op, - OP_ML_MODULE(new_op), - frag_len, - MPI_BYTE, - src_buffer_desc, - 0, - 0, - frag_len, - src_buffer_desc->data_addr); - - /* Fill in bcast specific arguments */ - /* TBD: remove buffer_size */ - new_op->variable_fn_params.buffer_size = coll_op->variable_fn_params.buffer_size; - new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; - - /* Set order info for new frag if there is a bcol needs ordering */ - MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); - - ML_VERBOSE(10, ("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d", - new_op->variable_fn_params.buffer_size , - new_op->fragment_data.fragment_size, - new_op->fragment_data.message_descriptor->n_bytes_scheduled)); - - /* Launch this collective !! 
*/ - ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - } - - return OMPI_SUCCESS; -} - -#define BCAST_FRAGMENTATION_IS_ENABLED(module) \ - (module->bcast_fn_index_table[LARGE_BCAST] < ML_BCAST_LARGE_DATA_KNOWN) - -static inline __opal_attribute_always_inline__ - int parallel_bcast_start(void *buf, int count, struct ompi_datatype_t *dtype, - int root, mca_coll_base_module_t *module, ompi_request_t **req) -{ - size_t pack_len = 0; - size_t dt_size = 0; - bool contig = false; - int bcast_index, n_fragments = 1; - - mca_coll_ml_collective_operation_progress_t * coll_op = NULL; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - mca_coll_ml_task_setup_fn_t task_setup; - OPAL_PTRDIFF_TYPE lb, extent; - - /* actual starting place of the user buffer (lb added) */ - void *actual_buf; - - ML_VERBOSE(10, ("Starting bcast, mca_coll_ml_bcast_uknown_root buf: %p", buf)); - - ompi_datatype_type_size(dtype, &dt_size); - pack_len = count * dt_size; - - /* Setup data buffer */ - ML_BUFFER_ALLOC_WAIT(ml_module, src_buffer_desc); - /* Get information about memory layout */ - contig = opal_datatype_is_contiguous_memory_layout((opal_datatype_t *)dtype, count); - - ompi_datatype_get_extent (dtype, &lb, &extent); - - actual_buf = (void *) ((uintptr_t) buf + lb); - - /* Allocate collective schedule and pack message */ - if (contig) { - if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) { - assert(pack_len <= ml_module->payload_block->size_buffer); - bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST]; - - ML_VERBOSE(10, ("Contig + small message %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index)); - ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len, - pack_len, actual_buf, src_buffer_desc); - - ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype, - src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer, - (src_buffer_desc->data_addr)); - } else if (BCAST_FRAGMENTATION_IS_ENABLED(ml_module)) { - /* We moved the fragmentation decision from communication creation time to - runtime, since for large messages the if latency is not so critical */ - size_t n_dts_per_frag; - int frag_len, pipeline_depth = mca_coll_ml_component.pipeline_depth; - bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST]; - - ML_VERBOSE(10, ("Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index)); - - /* Calculate the number of fragments required for this message */ - frag_len = (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_BCAST] ? - pack_len : (size_t) ml_module->small_message_thresholds[BCOL_BCAST]); - - n_dts_per_frag = frag_len/dt_size; - n_fragments = (pack_len + dt_size*n_dts_per_frag - 1)/(dt_size*n_dts_per_frag); - pipeline_depth = (n_fragments < pipeline_depth ? 
n_fragments : pipeline_depth); - - ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len, - frag_len, actual_buf, src_buffer_desc); - ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, (frag_len/dt_size), dtype, - src_buffer_desc, 0, 0, frag_len, (src_buffer_desc->data_addr)); - - coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_progress; - coll_op->full_message.pipeline_depth = pipeline_depth; - /* Initialize fragment specific information */ - coll_op->fragment_data.current_coll_op = bcast_index; - /* coll_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; */ - coll_op->fragment_data.fragment_size = frag_len; - coll_op->fragment_data.message_descriptor->n_active++; - /* should be removed */ - coll_op->variable_fn_params.buffer_size = frag_len; - - ML_VERBOSE(10, ("Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d", - coll_op->variable_fn_params.buffer_size, - coll_op->fragment_data.fragment_size)); - } else { - bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST]; - ML_VERBOSE(10, ("Contig + zero copy %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index)); - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[bcast_index], - actual_buf, actual_buf, pack_len, - 0 /* offset for first pack */); - /* For large messages (bcast) this points to userbuf */ - /* Pasha: temporary work around for basesmuma, userbuf should - be removed */ - coll_op->variable_fn_params.userbuf = buf; - coll_op->process_fn = NULL; - coll_op->full_message.n_bytes_scheduled = pack_len; - - ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype, - src_buffer_desc, 0, 0, - ml_module->payload_block->size_buffer, buf); - } - } else { - /* Non contiguous data type */ - bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST]; - ML_VERBOSE(10, ("NON Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index)); - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[bcast_index], - actual_buf, actual_buf, pack_len, - 0 /* offset for first pack */); - if (OPAL_LIKELY(pack_len > 0)) { - size_t max_data = 0; - - if (ompi_comm_rank(ml_module->comm) == root) { - struct iovec iov; - uint32_t iov_count = 1; - - opal_convertor_copy_and_prepare_for_send( - ompi_mpi_local_convertor, - &dtype->super, count, buf, 0, - &coll_op->full_message.send_convertor); - - opal_convertor_get_packed_size(&coll_op->full_message.send_convertor, - &coll_op->full_message.send_converter_bytes_packed); - - coll_op->full_message.n_bytes_total = - coll_op->full_message.send_converter_bytes_packed; - - iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr; - iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST]; - max_data = ml_module->small_message_thresholds[BCOL_BCAST]; - opal_convertor_pack(&coll_op->full_message.send_convertor, - &iov, &iov_count, &max_data); - coll_op->process_fn = NULL; - coll_op->full_message.n_bytes_scheduled = max_data; - - /* We need prepare the data for future pipe line comunication */ - coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress; - coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth; - coll_op->full_message.root = true; - - } else { - opal_convertor_copy_and_prepare_for_send( - ompi_mpi_local_convertor, - &dtype->super, count, NULL, 0, - &coll_op->full_message.dummy_convertor); - - /* In non-root case we use it for #bytes remaining to receive */ - 
opal_convertor_get_packed_size(&coll_op->full_message.dummy_convertor, - &coll_op->full_message.send_converter_bytes_packed); - - opal_convertor_copy_and_prepare_for_recv( - ompi_mpi_local_convertor, - &dtype->super, count, buf, 0, - &coll_op->full_message.recv_convertor); - - opal_convertor_get_unpacked_size(&coll_op->full_message.recv_convertor, - &coll_op->full_message.recv_converter_bytes_packed); - - coll_op->full_message.root = false; - coll_op->full_message.n_bytes_total = - coll_op->full_message.recv_converter_bytes_packed; - coll_op->process_fn = mca_coll_ml_bcast_converter_unpack_data; - - coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress; - coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth; - - max_data = ml_module->small_message_thresholds[BCOL_BCAST]; - coll_op->full_message.dummy_conv_position = 0; - mca_coll_ml_convertor_get_send_frag_size( - ml_module, &max_data, - &coll_op->full_message); - - coll_op->full_message.n_bytes_scheduled = max_data; - } - } - coll_op->fragment_data.current_coll_op = bcast_index; - coll_op->fragment_data.message_descriptor->n_active++; - coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled; - - /* Set initial index */ - coll_op->full_message.next_expected_index = src_buffer_desc->buffer_index; - - /* Prepare linking information for future frags */ - COLL_ML_SETUP_ORDERING_INFO(coll_op, coll_op, NULL); - - /* Since the data is already packed we will use MPI_BYTE and byte count as datatype */ - ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, coll_op->full_message.n_bytes_scheduled, MPI_BYTE, - src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer,(src_buffer_desc->data_addr)); - - n_fragments = (coll_op->full_message.n_bytes_total + - ml_module->small_message_thresholds[BCOL_BCAST] - 1) / ml_module->small_message_thresholds[BCOL_BCAST]; - } - - coll_op->variable_fn_params.hier_factor = 1; - coll_op->fragment_data.buffer_desc = src_buffer_desc; - - /* Set order info if there is a bcol needs ordering */ - MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); - - if (ompi_comm_rank(ml_module->comm) == root) { - coll_op->full_message.root = - coll_op->variable_fn_params.root_flag = true; - coll_op->variable_fn_params.root_route = NULL; - task_setup = ml_module->coll_ml_bcast_functions[bcast_index]-> - task_setup_fn[COLL_ML_ROOT_TASK_FN]; - } else { - coll_op->full_message.root = - coll_op->variable_fn_params.root_flag = false; - - coll_op->variable_fn_params.root_route = - (NULL == coll_op->coll_schedule->topo_info->route_vector ? 
- NULL : &coll_op->coll_schedule->topo_info->route_vector[root]); - - task_setup = ml_module->coll_ml_bcast_functions[bcast_index]-> - task_setup_fn[COLL_ML_GENERAL_TASK_FN]; - } - - *req = &coll_op->full_message.super; - return mca_coll_ml_generic_collectives_launcher(coll_op, task_setup); -} - -int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - int ret; - ompi_request_t *req; - - ret = parallel_bcast_start(buf, count, dtype, root, module, &req); - if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - /* Blocking bcast */ - ompi_request_wait_completion(req); - ompi_request_free(&req); - - ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known")); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) -{ - int ret; - - ret = parallel_bcast_start(buf, count, dtype, root, module, req); - if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known")); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) -{ - - /* local variables */ - int ret, fn_idx; - size_t pack_len = 0; - size_t dt_size = 0; - - mca_coll_ml_collective_operation_progress_t * coll_op = NULL; - mca_coll_ml_compound_functions_t *fixed_schedule; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - mca_bcol_base_coll_fn_desc_t *func; - OPAL_PTRDIFF_TYPE lb, extent; - - /* actual starting place of the user buffer (lb added) */ - void *actual_buf; - - ML_VERBOSE(10, ("Starting static bcast, small messages")); - - assert(NULL != dtype); - /* Calculate size of the data, - * on this stage only contiguous data is supported */ - ompi_datatype_type_size(dtype, &dt_size); - pack_len = count * dt_size; - ompi_datatype_get_extent (dtype, &lb, &extent); - - actual_buf = (void *) ((uintptr_t) buf + lb); - - /* Setup data buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - /* Allocate collective schedule and pack message */ - if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) { - /* The len of the message can not be larger than ML buffer size */ - assert(pack_len <= ml_module->payload_block->size_buffer); - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[ML_BCAST_SMALL_DATA_SEQUENTIAL], - actual_buf, actual_buf, pack_len, - 0 /* offset for first pack */); - if (ompi_comm_rank(comm) == root) { - /* single frag, pack the data */ - memcpy((void *)(uintptr_t)src_buffer_desc->data_addr, - buf, pack_len); - /* No unpack for root */ - coll_op->process_fn = NULL; - } else { - coll_op->process_fn = mca_coll_ml_bcast_small_unpack_data; - } - - coll_op->variable_fn_params.sbuf = - src_buffer_desc->data_addr; - } else { - ML_VERBOSE(10, ("ML_BCAST_LARGE_DATA_KNOWN case.")); - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - 
ml_module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_SEQUENTIAL], - actual_buf, actual_buf, pack_len, - 0 /* offset for first pack */); - /* For large messages (bcast) this points to userbuf */ - /* Pasha: temporary work around for basesmuma, userbuf should - be removed */ - coll_op->variable_fn_params.userbuf = - coll_op->variable_fn_params.sbuf = actual_buf; - - coll_op->process_fn = NULL; - } - - /* Fill in the function arguments */ - coll_op->variable_fn_params.sequence_num = - OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); - coll_op->variable_fn_params.count = count; - coll_op->variable_fn_params.dtype = dtype; - - coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; - coll_op->variable_fn_params.src_desc = src_buffer_desc; - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = 0; - - /* pasha - why we duplicate it ? */ - coll_op->fragment_data.buffer_desc = src_buffer_desc; - - /* pack data into payload buffer - NOTE: assume no fragmenation at this stage */ - if (ompi_comm_rank(comm) == root) { - coll_op->variable_fn_params.root_flag = true; - coll_op->variable_fn_params.root_route = - &coll_op->coll_schedule->topo_info->route_vector[root]; - - coll_op->full_message.n_bytes_scheduled = pack_len; - } else { - coll_op->variable_fn_params.root_flag = false; - coll_op->variable_fn_params.root_route = - &coll_op->coll_schedule->topo_info->route_vector[root]; - } - - /* seems like we should fix a schedule here and now */ - fixed_schedule = coll_op->coll_schedule-> - comp_fn_arr[coll_op->variable_fn_params.root_route->level]; - - /* now we set this schedule as the compound function list */ - coll_op->coll_schedule->component_functions = fixed_schedule; - - coll_op->sequential_routine.current_active_bcol_fn = 0; - - while (true) { - /* ready, aim, fire collective(s)!! */ - fn_idx = coll_op->sequential_routine.current_active_bcol_fn; - - func = fixed_schedule[fn_idx].bcol_function; - ret = func->coll_fn(&coll_op->variable_fn_params, - (struct mca_bcol_base_function_t *) &fixed_schedule[fn_idx].constant_group_data); - /* set the coll_fn_started flag to true */ - if (BCOL_FN_COMPLETE == ret) { - /* done with this routine, bump the active counter */ - coll_op->sequential_routine.current_active_bcol_fn++; - coll_op->variable_fn_params.root_flag = true; - /* check for collective completion */ - if (coll_op->sequential_routine.current_active_bcol_fn == - coll_op->coll_schedule->n_fns) { - /* handle fragment completion */ - ret = coll_ml_fragment_completion_processing(coll_op); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); - } - - /* break out of while loop */ - break; - } - } else { - /* put entire collective opperation onto sequential queue */ - opal_list_append(&mca_coll_ml_component.sequential_collectives, - (opal_list_item_t *) coll_op); - break; - } - } - - /* Blocking bcast */ - ompi_request_wait_completion(&coll_op->full_message.super); - ompi_request_free((ompi_request_t **) &coll_op); - - ML_VERBOSE(10, ("Bcast is done")); - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_colls.h b/ompi/mca/coll/ml/coll_ml_colls.h deleted file mode 100644 index fcefa19b44..0000000000 --- a/ompi/mca/coll/ml/coll_ml_colls.h +++ /dev/null @@ -1,552 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_ML_COLLS_H -#define MCA_COLL_ML_COLLS_H - -#include "ompi_config.h" -#include "ompi/mca/bcol/bcol.h" - -#define COLL_ML_FN_NAME_LEN 256 - - -/* utility information used to coordinate activities, such as resource - * management between different functions in the hierarchy - */ -struct mca_coll_ml_utility_data_t { - - /* RLG - temp fix !!!! - really need to remove this, but right now - do not want to change the signature of the collective primitives to - use coll_ml_utility_data_t rather than mca_bcol_base_function_t */ - int dummy; - - /* module */ - struct mca_bcol_base_module_t *bcol_module; - - /* */ - int index_in_consecutive_same_bcol_calls; - - /* number of times functions from this bcol are called in order */ - int n_of_this_type_in_a_row; - - /* number of times functions from this module are called - * in the collective operation. */ - int n_of_this_type_in_collective; - int index_of_this_type_in_collective; - -}; -typedef struct mca_coll_ml_utility_data_t mca_coll_ml_utility_data_t; - - -/* forward declaration */ -struct mca_coll_ml_collective_operation_progress_t; -struct mca_coll_ml_task_status_t; - -typedef int (* mca_coll_ml_process_op_fn_t) - (struct mca_coll_ml_collective_operation_progress_t *coll_op); - -typedef int (* mca_coll_ml_task_comp_fn_t) - (struct mca_coll_ml_task_status_t *coll_op); - -typedef int (* mca_coll_ml_fragment_launch_fn_t) - ( struct mca_coll_ml_collective_operation_progress_t *coll_op); - -typedef int (* mca_coll_ml_sequential_task_setup_fn_t) - ( struct mca_coll_ml_collective_operation_progress_t *coll_op); -/* This data structure defines the dependencies for a given - * compound operation. We will use this as a basis for implementing - * collective operations. - */ -struct mca_coll_ml_compound_functions_t { - /* label */ - char fn_name[COLL_ML_FN_NAME_LEN]; - - /* hierarchy level that is used for this bcol */ - int h_level; - - /* the list of functions that make up this task */ - /* coll_bcol_collective_description_t *bcol_function; */ - mca_bcol_base_coll_fn_desc_t *bcol_function; - /* task completion function for this compound function */ - mca_coll_ml_task_comp_fn_t task_comp_fn; - - /* module specific information that is a constant on a per group - * basis - */ - mca_coll_ml_utility_data_t constant_group_data; - - /* number of dependencies to be satified before these function can be - * started */ - int num_dependencies; - - /* - * number of notifications to perform on completion. The assumption - * is that a counter will be incremented. - */ - int num_dependent_tasks; - - /* - * pointers to counters that need be updated. This assumes - * an array of tasks is used to describe the ML level - * collective operation, with these indecies referencing elements - * in this array. 
- */ - int *dependent_task_indices; - -}; - -typedef struct mca_coll_ml_compound_functions_t mca_coll_ml_compound_functions_t; - -/* Forward declaration for operation_description_t */ -struct mca_coll_ml_module_t; - -enum { - COLL_ML_GENERAL_TASK_FN, - COLL_ML_ROOT_TASK_FN, - COLL_ML_MAX_TASK_FN -}; - -enum { - SEQ_TASK_NOT_STARTED, - SEQ_TASK_PENDING, - SEQ_TASK_IN_PROG -}; - -typedef void (*mca_coll_ml_task_setup_fn_t) (struct mca_coll_ml_task_status_t *task_status, int index, struct mca_coll_ml_compound_functions_t *func); - -/* - * Collective operation definition - */ -struct mca_coll_ml_collective_operation_description_t { - - /* - * Type of collective opeartion - there are two types: - * 1) sequential progress through the collectives is sufficient - * 2) general treatment, popping tasks onto execution queus is needed. - */ - int progress_type; - - struct mca_coll_ml_topology_t *topo_info; - - /* - * number of functions in collective operation - */ - int n_fns; - - /* - * list of functions - */ - mca_coll_ml_compound_functions_t *component_functions; - - /* - * array of lists of functions - */ - mca_coll_ml_compound_functions_t **comp_fn_arr; - - /* - * indices into the list - fixes a sequential schedule - */ - int *sch_idx; - - /* - * Task setup functions, so far we have only 3 - root and non-root - */ - mca_coll_ml_task_setup_fn_t task_setup_fn[COLL_ML_MAX_TASK_FN]; - - /* number of functions are called for bcols need ordering */ - int n_fns_need_ordering; -}; -typedef struct mca_coll_ml_collective_operation_description_t - mca_coll_ml_collective_operation_description_t; - -/* Data structure used to track the state of individual bcol - * functions. This is used to track dependencies and completion - * to progress the ML level function correctly. - * - * mca_coll_ml_task_status_t will be associated with an - * mca_coll_ml_collective_operation_progress_t structure for - * the duration of the lifetime of a communicator. - * An array of task statuses will be stored with - * the mca_coll_ml_collective_operation_progress_t data structure, so - * that the taks status elements do not need to be moved back to - * a free list before they are re-used. When the ML level function - * is complete, all mca_coll_ml_task_status_t are available for - * re-use. - */ -struct mca_coll_ml_task_status_t{ - /* need to move this between lists to progress this correctly */ - opal_list_item_t item; - - /* number of dependencies satisfied */ - int n_dep_satisfied; - - /* *************************************************************** - * Pasha: - * I'm adding to the status: num_dependencies, num_dependent_tasks and - * dependent_task_indices. The information originally resided on mca_coll_ml_compound_functions_t. - * For collective operation with static nature it is not problem. - * But for Bcast operation, where run time parameters, like root, actually - * define the dependency. rt prefix mean run-time. - */ - - /* number of dependencies to be satisfied before these function can be - * started */ - int rt_num_dependencies; - - /* - * number of notifications to perform on completion. The assumption - * is that a counter will be incremented. - */ - int rt_num_dependent_tasks; - - /* - * pointers to counters that need be updated. This assumes - * an array of tasks is used to describe the ML level - * collective operation, with these indecies referencing elements - * in this array. 
- */ - int *rt_dependent_task_indices; - /* - * - * ***************************************************************/ - - /* index in collective schedule */ - int my_index_in_coll_schedule; - - /* function pointers */ - mca_bcol_base_coll_fn_desc_t *bcol_fn; - - /* association with a specific collective task - the ML - * mca_coll_ml_collective_operation_progress_t stores the - * specific function parameters */ - struct mca_coll_ml_collective_operation_progress_t *ml_coll_operation; - - mca_coll_ml_task_comp_fn_t task_comp_fn; -}; -typedef struct mca_coll_ml_task_status_t mca_coll_ml_task_status_t; - -typedef enum mca_coll_ml_pending_type_t { - REQ_OUT_OF_ORDER = 1, - REQ_OUT_OF_MEMORY = 1 << 1 -} mca_coll_ml_pending_type_t; - -/* Forward declaration */ -struct mca_bcol_base_payload_buffer_desc_t; -/* Data structure used to track ML level collective operation - * progress. - */ -struct mca_coll_ml_collective_operation_progress_t { - /* need this to put on a list properly */ - /* Full message information */ - struct full_message_t { - /* make this a list item */ - ompi_request_t super; - /* Next expected fragment. - * It used for controling order of converter unpack operation */ - size_t next_expected_index; - /* Pointer to last intilized fragment. - * It used for controling order of converter unpack operation */ - struct mca_coll_ml_collective_operation_progress_t *last_started_frag; - /* destination data address in user memory */ - void *dest_user_addr; - /* source data address in user memory */ - const void *src_user_addr; - /* total message size */ - size_t n_bytes_total; - /* per-process total message size - relevant for operations - * such as gather and scatter, where each rank has it's - * own unique data - */ - size_t n_bytes_per_proc_total; - size_t max_n_bytes_per_proc_total; - /* data processes - from a local perspective */ - size_t n_bytes_delivered; - /* current offset - where to continue with next fragment */ - size_t n_bytes_scheduled; - /* number of fragments needed to process this message */ - size_t n_fragments; - /* number of active frags */ - int n_active; - /* actual pipeline depth */ - int pipeline_depth; - /* am I the real root of the collective ? */ - bool root; - /* collective fragment launcher */ - mca_coll_ml_fragment_launch_fn_t fragment_launcher; - /* is data contingous */ - bool send_data_continguous; - bool recv_data_continguous; - /* data type count */ - int64_t send_count; - int64_t recv_count; - /* extent of the data types */ - size_t send_extent; - size_t recv_extent; - /* send data type */ - struct ompi_datatype_t * send_data_type; - /* needed for non-contigous buffers */ - size_t offset_into_send_buffer; - /* receive data type */ - struct ompi_datatype_t * recv_data_type; - /* needed for non-contigous buffers */ - size_t offset_into_recv_buffer; - /* Convertors for non contigous data */ - opal_convertor_t send_convertor; - opal_convertor_t recv_convertor; - /* Will be used by receiver for #bytes calc in the next frag */ - opal_convertor_t dummy_convertor; - size_t dummy_conv_position; - /* Size of packed data */ - size_t send_converter_bytes_packed; - size_t recv_converter_bytes_packed; - /* In case if ordering is needed: order num for next frag */ - int next_frag_num; - /* The variable is used by non-blocking memory synchronization code - * for caching bank index */ - int bank_index_to_recycle; - /* need a handle for collective progress e.g. 
alltoall*/ - bcol_fragment_descriptor_t frag_info; - } full_message; - - /* collective operation being progressed */ - mca_coll_ml_collective_operation_description_t *coll_schedule; - /* */ - mca_coll_ml_process_op_fn_t process_fn; - - mca_coll_base_module_t *coll_module; - - /* If not null , we have to release next fragment */ - struct mca_coll_ml_collective_operation_progress_t *next_to_process_frag; - /* pointer to previous fragment */ - struct mca_coll_ml_collective_operation_progress_t *prev_frag; - /* This flag marks that the fragment is pending on the waiting - * to be processed prior to recycling - */ - enum mca_coll_ml_pending_type_t pending; - - /* Fragment data */ - struct fragment_data_t { - /* current buffer pointer - offset (in bytes) into the user data */ - size_t offset_into_user_buffer; - size_t offset_into_user_buffer_per_proc; - - /* amount of data (in bytes) in this fragment - amount of data - * actually processed */ - size_t fragment_size; - size_t per_rank_fragment_size; - size_t data_type_count_per_frag; - - /* pointer to full message progress data */ - struct full_message_t *message_descriptor; - - /* ML buffer descriptor attached to this buffer */ - struct mca_bcol_base_payload_buffer_desc_t *buffer_desc; - /* handle for collective progress, e.g. alltoall */ - bcol_fragment_descriptor_t bcol_fragment_desc; - - /* Which collective algorithm */ - int current_coll_op; - } fragment_data; - - /* specific function parameters */ - /* the assumption is that the variable parameters passed into - * the ML level function will persist until the collective operation - * is complete. For a blocking function this is until the collective - * function is exited, and for nonblocking collective functions this - * is until test or wait completes the collective. - */ - int global_root; - bcol_function_args_t variable_fn_params; - - struct{ - /* current active function - for sequential algorithms */ - int current_active_bcol_fn; - - /* current function status - not started, or in progress. - * When the routine has completed, the active bcol index is - * incremented, so no need to keep track of a completed - * status. - */ - int current_bcol_status; - - /* use this call back to setup algorithm specific info - after each level necessary - */ - mca_coll_ml_sequential_task_setup_fn_t seq_task_setup; - - } sequential_routine; - - struct{ - /* - * BCOL function status - individual elements will be posted to - * ml level component queues, as appropriate. - */ - mca_coll_ml_task_status_t *status_array; - - /* number of completed tasks - need this for collective completion. - * Resource completion is tracked by each BCOL module . 
- */ - int num_tasks_completed; - } dag_description; -}; -typedef struct mca_coll_ml_collective_operation_progress_t -mca_coll_ml_collective_operation_progress_t; -OBJ_CLASS_DECLARATION(mca_coll_ml_collective_operation_progress_t); - -#define OP_ML_MODULE(op) ((mca_coll_ml_module_t *)((op)->coll_module)) -#define GET_COMM(op) ((OP_ML_MODULE(op))->comm) -#define IS_COLL_SYNCMEM(op) (ML_MEMSYNC == op->fragment_data.current_coll_op) - -#define CHECK_AND_RECYCLE(op) \ -do { \ - if (0 == (op)->pending) { \ - /* Caching 2 values that we can't to touch on op after returing it */ \ - /* back to the free list (free list may release memory on distruct )*/ \ - struct ompi_communicator_t *comm = GET_COMM(op); \ - bool is_coll_sync = IS_COLL_SYNCMEM(op); \ - ML_VERBOSE(10, ("Releasing %p", op)); \ - OMPI_REQUEST_FINI(&(op)->full_message.super); \ - opal_free_list_return (&(((mca_coll_ml_module_t *)(op)->coll_module)-> \ - coll_ml_collective_descriptors), \ - (opal_free_list_item_t *)op); \ - /* Special check for memory synchronization completion */ \ - /* We have to return it first to free list, since the communicator */ \ - /* release potentially may trigger ML module distraction and having */ \ - /* the element not on the list may cause memory leak. */ \ - if (OPAL_UNLIKELY(is_coll_sync)) { \ - if (OMPI_COMM_IS_INTRINSIC(comm)) { \ - opal_show_help("help-mpi-coll-ml.txt", \ - "coll-ml-check-fatal-error", true, \ - comm->c_name); \ - ompi_mpi_abort(comm, 6); \ - } else { \ - opal_show_help("help-mpi-coll-ml.txt", \ - "coll-ml-check-error", true, \ - comm->c_name); \ - /* After this point it is UNSAFE to touch ml module */ \ - /* or communicator */ \ - OBJ_RELEASE(comm); \ - } \ - } \ - } \ -} while (0) - -#define MCA_COLL_ML_SET_ORDER_INFO(coll_progress, num_frags) \ -do { \ - mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \ - bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \ - if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \ - variable_params->order_info.bcols_started = 0; \ - variable_params->order_info.order_num = \ - topo->topo_ordering_info.next_order_num; \ - variable_params->order_info.n_fns_need_ordering = \ - (coll_progress)->coll_schedule->n_fns_need_ordering; \ - topo->topo_ordering_info.next_order_num += num_frags; \ - (coll_progress)->fragment_data.message_descriptor->next_frag_num = \ - variable_params->order_info.order_num + 1; \ - } \ -} while (0) - -#define MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(coll_progress) \ -do { \ - mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \ - if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \ - bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \ - struct fragment_data_t *frag_data = &(coll_progress)->fragment_data; \ - variable_params->order_info.bcols_started = 0; \ - variable_params->order_info.order_num = frag_data->message_descriptor->next_frag_num; \ - variable_params->order_info.n_fns_need_ordering = \ - (coll_progress)->coll_schedule->n_fns_need_ordering; \ - frag_data->message_descriptor->next_frag_num++; \ - } \ -} while (0) - -#define MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule) \ -do { \ - int i; \ - (schedule)->n_fns_need_ordering = 0; \ - for (i = 0; i < (schedule)->n_fns; ++i) { \ - mca_bcol_base_module_t *current_bcol = \ - (schedule)->component_functions[i].constant_group_data.bcol_module; \ - assert (NULL != current_bcol); \ - if (current_bcol->bcol_component->need_ordering) { \ - 
(schedule)->n_fns_need_ordering++; \ - } \ - } \ -} while (0) - -enum { - MCA_COLL_ML_NET_STREAM_SEND, - MCA_COLL_ML_NET_STREAM_RECV -}; - -static inline __opal_attribute_always_inline__ - int mca_coll_ml_convertor_prepare(ompi_datatype_t *dtype, int count, const void *buff, - opal_convertor_t *convertor, int stream) -{ - size_t bytes_packed; - - if (MCA_COLL_ML_NET_STREAM_SEND == stream) { - opal_convertor_copy_and_prepare_for_send( - ompi_mpi_local_convertor, - &dtype->super, count, buff, 0, - convertor); - } else { - opal_convertor_copy_and_prepare_for_recv( - ompi_mpi_local_convertor, - &dtype->super, count, buff, 0, - convertor); - } - - opal_convertor_get_packed_size(convertor, &bytes_packed); - - return bytes_packed; -} - -static inline __opal_attribute_always_inline__ - int mca_coll_ml_convertor_pack(void *data_addr, size_t buff_size, - opal_convertor_t *convertor) -{ - struct iovec iov; - - size_t max_data = 0; - uint32_t iov_count = 1; - - iov.iov_base = (IOVBASE_TYPE*) data_addr; - iov.iov_len = buff_size; - - opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - - return max_data; -} - -static inline __opal_attribute_always_inline__ - int mca_coll_ml_convertor_unpack(void *data_addr, size_t buff_size, - opal_convertor_t *convertor) -{ - struct iovec iov; - - size_t max_data = 0; - uint32_t iov_count = 1; - - iov.iov_base = (void *) (uintptr_t) data_addr; - iov.iov_len = buff_size; - - opal_convertor_unpack(convertor, &iov, &iov_count, &max_data); - - return max_data; -} -#endif /* MCA_COLL_ML_COLLS_H */ - diff --git a/ompi/mca/coll/ml/coll_ml_component.c b/ompi/mca/coll/ml/coll_ml_component.c deleted file mode 100644 index 4b4cf277c1..0000000000 --- a/ompi/mca/coll/ml/coll_ml_component.c +++ /dev/null @@ -1,449 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Most of the description of the data layout is in the - * coll_sm_module.c file. 
- */ - -#include "ompi_config.h" - -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/mca/coll/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/sbgp/base/base.h" - -#include "coll_ml.h" -#include "coll_ml_inlines.h" - -#include "ompi/patterns/net/netpatterns.h" -#include "coll_ml_mca.h" -#include "coll_ml_custom_utils.h" - - -/* - * Public string showing the coll ompi_ml V2 component version number - */ -const char *mca_coll_ml_component_version_string = -"Open MPI ml-V2 collective MCA component version " OMPI_VERSION; - -/* - * Local functions - */ - -static int ml_open(void); -static int ml_close(void); -static int coll_ml_progress(void); - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_coll_ml_component_t mca_coll_ml_component = { - - /* First, fill in the super */ - - .super = { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .collm_version = { - MCA_COLL_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "ml", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open, close, and register functions */ - - .mca_open_component = ml_open, - .mca_close_component = ml_close, - .mca_register_component_params = mca_coll_ml_register_params - }, - .collm_data = { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE - }, - - /* Initialization / querying functions */ - .collm_init_query = mca_coll_ml_init_query, - .collm_comm_query = mca_coll_ml_comm_query, - }, -}; - -void mca_coll_ml_abort_ml(char *message) -{ - ML_ERROR(("ML Collective FATAL ERROR: %s", message)); - /* shutdown the MPI */ - ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_INTERN); -} -/* - * progress function - */ - -#define INDEX(task) ((task)->my_index_in_coll_schedule) -#define ACTIVE_L (&mca_coll_ml_component.active_tasks) -#define PENDING_L (&mca_coll_ml_component.pending_tasks) -#define SEQ_L (&mca_coll_ml_component.sequential_collectives) - -static int coll_ml_progress() -{ - - int rc = OMPI_SUCCESS; - int fn_idx; - - mca_coll_ml_task_status_t *task_status, *task_status_tmp; - mca_coll_ml_collective_operation_progress_t *seq_coll_op; - mca_coll_ml_collective_operation_progress_t *seq_coll_op_tmp; - - mca_bcol_base_module_collective_fn_primitives_t progress_fn, - coll_fn; - mca_coll_ml_utility_data_t *const_args; - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - /* Pasha: Not sure that is it correct way to resolve the problem. - Iprobe call for progress engine. The progress engine calls for our - progress and as result the first element on the list is progressed again - and so we call for Iprobe again.... as result we get HUGE stack. - - One way to prevent it - remove the item from the list, and once you finish - to process it - put it back. - - Other way - put flag on component, if the progress is running - exit immediate. 
- */ - if (cm->progress_is_busy) { - /* We are already working...*/ - return OMPI_SUCCESS; - } else { - cm->progress_is_busy = true; - } - - /* progress sequential collective operations */ - /* RLG - need to do better here for parallel progress */ - OPAL_THREAD_LOCK(&(cm->sequential_collectives_mutex)); - OPAL_LIST_FOREACH_SAFE(seq_coll_op, seq_coll_op_tmp, SEQ_L, mca_coll_ml_collective_operation_progress_t) { - do { - fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - /* initialize the task */ - - if (SEQ_TASK_IN_PROG == seq_coll_op->sequential_routine.current_bcol_status){ - progress_fn = seq_coll_op->coll_schedule-> - component_functions[fn_idx].bcol_function->progress_fn; - } else { - /* PPP Pasha - apparently task setup should be called only here. see linr 190 */ - progress_fn = seq_coll_op->coll_schedule-> - component_functions[fn_idx].bcol_function->coll_fn; - } - - const_args = &seq_coll_op->coll_schedule->component_functions[fn_idx].constant_group_data; - /* RLG - note need to move to useing coll_ml_utility_data_t as - * collective argument, rather than mca_bcol_base_function_t - */ - rc = progress_fn(&(seq_coll_op->variable_fn_params), (mca_bcol_base_function_t *)const_args); - if (BCOL_FN_COMPLETE == rc) { - /* done with this routine */ - seq_coll_op->sequential_routine.current_active_bcol_fn++; - /* this is totally hardwired for bcast, need a general call-back */ - - fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - if (fn_idx == seq_coll_op->coll_schedule->n_fns) { - /* done with this collective - recycle descriptor */ - - /* remove from the progress list */ - (void) opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op); - - /* handle fragment completion */ - rc = coll_ml_fragment_completion_processing(seq_coll_op); - - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); - } - } else { - rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op); - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run sequential task setup"); - } - - seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - continue; - } - } else if (BCOL_FN_NOT_STARTED == rc) { - seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - } else if (BCOL_FN_STARTED == rc) { - seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG; - } - - break; - } while (true); - } - OPAL_THREAD_UNLOCK(&(cm->sequential_collectives_mutex)); - - /* general dag's */ - /* see if active tasks can be progressed */ - OPAL_THREAD_LOCK(&(cm->active_tasks_mutex)); - OPAL_LIST_FOREACH(task_status, ACTIVE_L, mca_coll_ml_task_status_t) { - /* progress task */ - progress_fn = task_status->bcol_fn->progress_fn; - const_args = &task_status->ml_coll_operation->coll_schedule-> - component_functions[INDEX(task_status)].constant_group_data; - rc = progress_fn(&(task_status->ml_coll_operation->variable_fn_params), - (mca_bcol_base_function_t *)const_args); - if (BCOL_FN_COMPLETE == rc) { - ML_VERBOSE(3, ("GOT BCOL_COMPLETED!!!!")); - rc = mca_coll_ml_task_completion_processing(&task_status, ACTIVE_L); - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing"); - } - } else if (BCOL_FN_STARTED == rc) { - /* nothing to do */ - } else { - mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing"); - } - } - OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex)); - - /* see if new tasks can be initiated */ - 
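The sequential-collective loop above is, stripped of its bookkeeping, a small per-operation state machine: run or progress the current bcol function, advance on BCOL_FN_COMPLETE, and yield back to the progress engine otherwise. A self-contained model of that control flow (every name here is illustrative, none comes from the file):

enum { FN_NOT_STARTED, FN_STARTED, FN_COMPLETE };

struct seq_op {
    int cur_fn;                              /* models current_active_bcol_fn */
    int n_fns;                               /* models coll_schedule->n_fns   */
    int (*step)(struct seq_op *op, int idx); /* models coll_fn / progress_fn  */
};

/* Returns 1 when the whole collective is finished, 0 to retry on the
 * next progress call. */
static int progress_seq_op(struct seq_op *op)
{
    while (op->cur_fn < op->n_fns) {
        if (FN_COMPLETE != op->step(op, op->cur_fn)) {
            return 0;   /* NOT_STARTED or STARTED: leave it on the queue */
        }
        op->cur_fn++;   /* this level completed, fire the next one */
    }
    return 1;
}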
OPAL_THREAD_LOCK(&(cm->pending_tasks_mutex)); - OPAL_LIST_FOREACH_SAFE(task_status, task_status_tmp, PENDING_L, mca_coll_ml_task_status_t) { - /* check to see if dependencies are satisfied */ - int n_dependencies = task_status->rt_num_dependencies; - int n_dependencies_satisfied = task_status->n_dep_satisfied; - - if (n_dependencies == n_dependencies_satisfied) { - /* initiate the task */ - coll_fn = task_status->bcol_fn->coll_fn; - const_args = &task_status->ml_coll_operation->coll_schedule-> - component_functions[INDEX(task_status)].constant_group_data; - rc = coll_fn(&(task_status->ml_coll_operation->variable_fn_params), - (mca_bcol_base_function_t *)const_args); - if (BCOL_FN_COMPLETE == rc) { - ML_VERBOSE(3, ("GOT BCOL_COMPLETED!")); - rc = mca_coll_ml_task_completion_processing(&task_status, PENDING_L); - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing"); - } - } else if ( BCOL_FN_STARTED == rc ) { - ML_VERBOSE(3, ("GOT BCOL_STARTED!")); - (void) opal_list_remove_item(PENDING_L, (opal_list_item_t *)task_status); - /* RLG - is there potential for deadlock here ? Need to - * look at this closely - */ - OPAL_THREAD_LOCK(&(cm->active_tasks_mutex)); - opal_list_append(ACTIVE_L, (opal_list_item_t *)task_status); - OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex)); - } else if( BCOL_FN_NOT_STARTED == rc ) { - /* nothing to do */ - ML_VERBOSE(10, ("GOT BCOL_FN_NOT_STARTED!")); - } else { - OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex)); - /* error will be returned - RLG : need to reconsider return - * types - we have no way to convey error information - * the way the code is implemented now */ - ML_VERBOSE(3, ("GOT error !")); - rc = OMPI_ERROR; - OMPI_ERRHANDLER_RETURN(rc,MPI_COMM_WORLD,rc,"Error returned from bcol function: aborting"); - break; - } - } - } - OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex)); - - /* return */ - cm->progress_is_busy = false; - - return rc; -} - - -static void adjust_coll_config_by_mca_param(void) -{ - /* setting bcast mca params */ - if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) { - mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_KNOWN; - mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_KNOWN; - } else if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) { - mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_SEQUENTIAL; - mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_SEQUENTIAL; - } else { /* Unknown root */ - mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_UNKNOWN; - mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_UNKNOWN; - } -} - -/* - * Open the component - */ -static int ml_open(void) -{ - /* local variables */ - int rc, c_idx, m_idx; - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - - /* set the starting sequence number */ - cs->base_sequence_number = -1; - cs->progress_is_busy = false; - - /* If the priority is zero (default) disable the component */ - if (mca_coll_ml_component.ml_priority <= 0) { - return OMPI_ERR_NOT_AVAILABLE; - } - - /* Init memory structures (no real memory is allocated) */ - OBJ_CONSTRUCT(&cs->memory_manager, mca_coll_ml_lmngr_t); - - if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_sbgp_base_framework, 0))) { - fprintf(stderr," failure in open mca_sbgp_base_open \n"); - 
return rc; - } - if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_bcol_base_framework, 0))) { - fprintf(stderr," failure in open mca_bcol_base_open \n"); - return rc; - } - - /* Reset the collective tunings cache */ - for (c_idx = 0; c_idx < ML_NUM_OF_FUNCTIONS; c_idx++) { - for (m_idx = 0; m_idx < ML_NUM_MSG; m_idx++) { - mca_coll_ml_reset_config(&cs->coll_config[c_idx][m_idx]); - } - } - - adjust_coll_config_by_mca_param(); - - /* Load the configuration file and cache the configuration on the component */ - rc = mca_coll_ml_config_file_init(); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - - /* register the progress function */ - rc = opal_progress_register(coll_ml_progress); - if (OMPI_SUCCESS != rc ) { - fprintf(stderr," failed to register the ml progress function \n"); - fflush(stderr); - return rc; - } - - OBJ_CONSTRUCT(&(cs->pending_tasks_mutex), opal_mutex_t); - OBJ_CONSTRUCT(&(cs->pending_tasks), opal_list_t); - OBJ_CONSTRUCT(&(cs->active_tasks_mutex), opal_mutex_t); - OBJ_CONSTRUCT(&(cs->active_tasks), opal_list_t); - OBJ_CONSTRUCT(&(cs->sequential_collectives_mutex), opal_mutex_t); - OBJ_CONSTRUCT(&(cs->sequential_collectives), opal_list_t); - - rc = netpatterns_init(); - if (OMPI_SUCCESS != rc) { - return rc; - } - - cs->topo_discovery_fn[COLL_ML_HR_FULL] = - mca_coll_ml_fulltree_hierarchy_discovery; - - cs->topo_discovery_fn[COLL_ML_HR_ALLREDUCE] = - mca_coll_ml_allreduce_hierarchy_discovery; - - cs->topo_discovery_fn[COLL_ML_HR_NBS] = - mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery; - - cs->topo_discovery_fn[COLL_ML_HR_SINGLE_PTP] = - mca_coll_ml_fulltree_ptp_only_hierarchy_discovery; - - cs->topo_discovery_fn[COLL_ML_HR_SINGLE_IBOFFLOAD] = - mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery; - - cs->need_allreduce_support = false; - - return OMPI_SUCCESS; -} - -/* - * Close the component - */ -static int ml_close(void) -{ - int ret; - - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - - /* There is no need to release/close resources if the - * priority was set to zero */ - if (cs->ml_priority <= 0) { - return OMPI_SUCCESS; - } - - OBJ_DESTRUCT(&cs->memory_manager); - OBJ_DESTRUCT(&cs->pending_tasks_mutex); - OBJ_DESTRUCT(&cs->pending_tasks); - OBJ_DESTRUCT(&cs->active_tasks_mutex); - OBJ_DESTRUCT(&cs->active_tasks); - OBJ_DESTRUCT(&cs->sequential_collectives_mutex); - OBJ_DESTRUCT(&cs->sequential_collectives); - - /* deregister the progress function */ - ret = opal_progress_unregister(coll_ml_progress); - if (OMPI_SUCCESS != ret ) { - OMPI_ERROR_LOG(ret); - return ret; - } - - /* close the sbgp and bcol frameworks */ - if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_sbgp_base_framework))) { - OMPI_ERROR_LOG(ret); - return ret; - } - - if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bcol_base_framework))) { - OMPI_ERROR_LOG(ret); - return ret; - } - - return OMPI_SUCCESS; -} - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_coll_ml_init_query(bool enable_progress_threads, - bool enable_mpi_threads) -{ - int ret; - - /* at this stage there is no reason to disqualify this component */ - /* Add the bcol and sbgp init here */ - ret = mca_sbgp_base_init(enable_progress_threads, enable_mpi_threads); - if (OMPI_SUCCESS != ret) { - return ret; - } - - ret = mca_bcol_base_init(enable_progress_threads, enable_mpi_threads); - if (OMPI_SUCCESS != ret) { - return ret; - } - - /* done */ - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_config.c b/ompi/mca/coll/ml/coll_ml_config.c deleted file mode 100644 index c7556ca104..0000000000 --- a/ompi/mca/coll/ml/coll_ml_config.c +++ /dev/null @@ -1,613 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2016 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include - -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_config.h" -#include "coll_ml_lex.h" - -static char *key_buffer = NULL; -static size_t key_buffer_len = 0; - -typedef struct section_config_t { - char *section_name; - int section_id; - per_collective_configuration_t config; -} section_config_t; - -typedef struct coll_config_t { - char *coll_name; - int coll_id; - section_config_t section; -} coll_config_t; - -static int algorithm_name_to_id(char *name) -{ - assert (NULL != name); - if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN")) - return ML_BCAST_SMALL_DATA_KNOWN; - if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN")) - return ML_BCAST_SMALL_DATA_UNKNOWN; - if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_SEQUENTIAL")) - return ML_BCAST_SMALL_DATA_SEQUENTIAL; - if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_KNOWN")) - return ML_BCAST_LARGE_DATA_KNOWN; - if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_UNKNOWN")) - return ML_BCAST_LARGE_DATA_UNKNOWN; - if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_SEQUENTIAL")) - return ML_BCAST_LARGE_DATA_SEQUENTIAL; - if (!strcasecmp(name,"ML_N_DATASIZE_BINS")) - return ML_N_DATASIZE_BINS; - if (!strcasecmp(name,"ML_NUM_BCAST_FUNCTIONS")) - return ML_NUM_BCAST_FUNCTIONS; - if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_KNOWN")) - return ML_SCATTER_SMALL_DATA_KNOWN; - if (!strcasecmp(name,"ML_SCATTER_N_DATASIZE_BINS")) - return ML_SCATTER_N_DATASIZE_BINS; - if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_UNKNOWN")) - return ML_SCATTER_SMALL_DATA_UNKNOWN; - if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_SEQUENTIAL")) - return ML_SCATTER_SMALL_DATA_SEQUENTIAL; - if (!strcasecmp(name,"ML_NUM_SCATTER_FUNCTIONS")) - return ML_NUM_SCATTER_FUNCTIONS; - if (!strcasecmp(name,"ML_SMALL_DATA_ALLREDUCE")) - return ML_SMALL_DATA_ALLREDUCE; - if (!strcasecmp(name,"ML_LARGE_DATA_ALLREDUCE")) - return ML_LARGE_DATA_ALLREDUCE; - if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE")) - return ML_SMALL_DATA_REDUCE; - if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE")) - return ML_LARGE_DATA_REDUCE; - if (!strcasecmp(name,"ML_NUM_ALLREDUCE_FUNCTIONS")) - return ML_NUM_ALLREDUCE_FUNCTIONS; - if (!strcasecmp(name,"ML_SMALL_DATA_ALLTOALL")) - return ML_SMALL_DATA_ALLTOALL; - if (!strcasecmp(name,"ML_LARGE_DATA_ALLTOALL")) - return ML_LARGE_DATA_ALLTOALL; - if (!strcasecmp(name,"ML_NUM_ALLTOALL_FUNCTIONS")) - return ML_NUM_ALLTOALL_FUNCTIONS; - if (!strcasecmp(name,"ML_SMALL_DATA_ALLGATHER")) - return ML_SMALL_DATA_ALLGATHER; - if (!strcasecmp(name,"ML_LARGE_DATA_ALLGATHER")) - return ML_LARGE_DATA_ALLGATHER; - if (!strcasecmp(name,"ML_NUM_ALLGATHER_FUNCTIONS")) - return ML_NUM_ALLGATHER_FUNCTIONS; - if (!strcasecmp(name,"ML_SMALL_DATA_GATHER")) - return ML_SMALL_DATA_GATHER; - if (!strcasecmp(name,"ML_LARGE_DATA_GATHER")) - return ML_LARGE_DATA_GATHER; - if (!strcasecmp(name,"ML_NUM_GATHER_FUNCTIONS")) - return ML_NUM_GATHER_FUNCTIONS; - if (!strcasecmp(name,"ML_BARRIER_DEFAULT")) - return ML_BARRIER_DEFAULT; - - /* ERROR */ - return ML_UNDEFINED; -} - -static int hierarchy_name_to_id(char *name) -{ - assert (NULL != name); - if (!strcasecmp(name, "FULL_HR")) { - return COLL_ML_HR_FULL; - } - if (!strcasecmp(name, "FULL_HR_NO_BASESOCKET")) { - return COLL_ML_HR_NBS; - } - if (!strcasecmp(name, "PTP_ONLY")) { - return COLL_ML_HR_SINGLE_PTP; - } - if (!strcasecmp(name, "IBOFFLOAD_ONLY")) { - return COLL_ML_HR_SINGLE_IBOFFLOAD; - } - /* Error */ - return ML_UNDEFINED; -} - -static int section_name_to_id(char *name) -{ - assert (NULL != name); - if (!strcasecmp(name, "SMALL")) { - return ML_SMALL_MSG; - } - - if (!strcasecmp(name, "LARGE")) { - return ML_LARGE_MSG; - } - /* Error */ - return ML_UNDEFINED; -} - -static int coll_name_to_id(char *name) -{ - assert (NULL != name); - if (!strcasecmp(name, "ALLGATHER")) { - return ML_ALLGATHER; - } - if (!strcasecmp(name, "ALLGATHERV")) { - return ML_ALLGATHERV; - } - if (!strcasecmp(name, "ALLREDUCE")) { - return ML_ALLREDUCE; - } - if (!strcasecmp(name, "ALLTOALL")) { - return ML_ALLTOALL; - } - if (!strcasecmp(name, "ALLTOALLV")) { - return ML_ALLTOALLV; - } - if (!strcasecmp(name, "ALLTOALLW")) { - return ML_ALLTOALLW; - } - if (!strcasecmp(name, "BARRIER")) { - return ML_BARRIER; - } - if (!strcasecmp(name, "BCAST")) { - return ML_BCAST; - } - if (!strcasecmp(name, "EXSCAN")) { - return ML_EXSCAN; - } - if (!strcasecmp(name, "GATHER")) { - return ML_GATHER; - } - if (!strcasecmp(name, "GATHERV")) { - return ML_GATHERV; - } - if (!strcasecmp(name, "REDUCE")) { - return ML_REDUCE; - } - if (!strcasecmp(name, "REDUCE_SCATTER")) { - return ML_REDUCE_SCATTER; - } - if (!strcasecmp(name, "SCAN")) { - return ML_SCAN; - } - if (!strcasecmp(name, "SCATTER")) { - return ML_SCATTER; - } - if (!strcasecmp(name, "SCATTERV")) { - return ML_SCATTERV; - } - - /* nonblocking functions */ - - if (!strcasecmp(name, "IALLGATHER")) { - return ML_IALLGATHER; - } - if (!strcasecmp(name, "IALLGATHERV")) { - return ML_IALLGATHERV; - } - if (!strcasecmp(name, "IALLREDUCE")) { - return ML_IALLREDUCE; - } - if (!strcasecmp(name, "IALLTOALL")) { - return ML_IALLTOALL; - } - if (!strcasecmp(name, "IALLTOALLV")) { - return ML_IALLTOALLV; - } - if (!strcasecmp(name, "IALLTOALLW")) { - return ML_IALLTOALLW; - } - if (!strcasecmp(name, "IBARRIER")) { - return ML_IBARRIER; - } - if (!strcasecmp(name, "IBCAST")) { - return ML_IBCAST; - } - if (!strcasecmp(name, "IEXSCAN")) { - return ML_IEXSCAN; - } - if (!strcasecmp(name, "IGATHER")) { - return ML_IGATHER; - } - if (!strcasecmp(name, "IGATHERV")) { - return ML_IGATHERV; - } - if (!strcasecmp(name, "IREDUCE")) { - return ML_IREDUCE; - } - if (!strcasecmp(name, "IREDUCE_SCATTER")) { - return ML_IREDUCE_SCATTER; - } - if (!strcasecmp(name, "ISCAN")) { - return ML_ISCAN; - } - if (!strcasecmp(name, "ISCATTER")) { - return ML_ISCATTER; - } - if (!strcasecmp(name, "ISCATTERV")) { - return ML_ISCATTERV; - } - - /* Error - collective name was not matched */ - return ML_UNDEFINED; -} -static int set_collective_name(coll_config_t *coll_config) -{ - int coll_id = - coll_name_to_id(coll_ml_config_yytext); - - if (ML_UNDEFINED == coll_id) { - return OMPI_ERROR; - } - -
coll_config->coll_id = coll_id; - coll_config->coll_name = strdup(coll_ml_config_yytext); - - return OMPI_SUCCESS; -} - -static int set_section_name(section_config_t *section_config) -{ - int section_id; - - section_id = section_name_to_id(coll_ml_config_yytext); - - if (ML_UNDEFINED == section_id) { - return OMPI_ERROR; - } - - section_config->section_id = section_id; - section_config->section_name = strdup(coll_ml_config_yytext); - - return OMPI_SUCCESS; -} - -void mca_coll_ml_reset_config(per_collective_configuration_t *config) -{ - config->topology_id = ML_UNDEFINED; - config->threshold = ML_UNDEFINED; - config->algorithm_id = ML_UNDEFINED; - config->fragmentation_enabled = ML_UNDEFINED; -} - -static void reset_section(section_config_t *section_cf) -{ - if (section_cf->section_name) { - free (section_cf->section_name); - section_cf->section_name = NULL; - } - - section_cf->section_id = ML_UNDEFINED; - mca_coll_ml_reset_config(&section_cf->config); -} - -static void reset_collective(coll_config_t *coll_cf) -{ - if (coll_cf->coll_name) { - free (coll_cf->coll_name); - coll_cf->coll_name = NULL; - } - - coll_cf->coll_id = ML_UNDEFINED; - reset_section(&coll_cf->section); -} - -/* - * String to integer; - */ -static int string_to_int(char *str) -{ - while (isspace(*str)) { - ++str; - } - - /* Nope -- just decimal, so use atoi() */ - return atoi(str); -} - -static int parse_algorithm_key(section_config_t *section, char *value) -{ - int ret; - ret = algorithm_name_to_id(value); - if (ML_UNDEFINED == ret) { - return OMPI_ERROR; - } else { - section->config.algorithm_id = ret; - } - - return OMPI_SUCCESS; -} - -static int parse_threshold_key(section_config_t *section, char *value) -{ - assert (NULL != value); - - if(!strcasecmp(value, "unlimited")) { - section->config.threshold = -1; - } else { - section->config.threshold = string_to_int(value); - } - - return OMPI_SUCCESS; -} - -static int parse_hierarchy_key(section_config_t *section, char *value) -{ - int ret; - - ret = hierarchy_name_to_id(value); - if (ML_UNDEFINED == ret) { - return OMPI_ERROR; - } - - section->config.topology_id = ret; - - return OMPI_SUCCESS; -} - -static int parse_fragmentation_key(section_config_t *section, char *value) -{ - assert (NULL != value); - - if(!strcasecmp(value, "enable")) { - section->config.fragmentation_enabled = 1; - } else if (!strcasecmp(value, "disable")) { - section->config.fragmentation_enabled = 0; - } else { - ML_ERROR(("Line %d, unexpected fragmentation value %s. Legal values are: enable/disable", - coll_ml_config_yynewlines, value)); - return OMPI_ERROR; - } - return OMPI_SUCCESS; -} - -/* Save the configuration that has been collected so far */ -static int save_settings(coll_config_t *coll_config) -{ - per_collective_configuration_t *cf; - - if (ML_UNDEFINED == coll_config->coll_id || ML_UNDEFINED == coll_config->section.section_id) { - return OMPI_ERROR; - } - - cf = &mca_coll_ml_component.coll_config[coll_config->coll_id][coll_config->section.section_id]; - - cf->topology_id = coll_config->section.config.topology_id; - cf->threshold = coll_config->section.config.threshold; - cf->algorithm_id = coll_config->section.config.algorithm_id; - cf->fragmentation_enabled = coll_config->section.config.fragmentation_enabled; - - return OMPI_SUCCESS; -} - -/* - * Parse a single line - */ -static int parse_line(section_config_t *section) -{ - int val, ret = OMPI_SUCCESS; - char *value = NULL; - - /* Save the key name */ - if (key_buffer_len < strlen(coll_ml_config_yytext) + 1) { - char *tmp; - key_buffer_len = strlen(coll_ml_config_yytext) + 1; - tmp = (char *) realloc(key_buffer, key_buffer_len); - if (NULL == tmp) { - free(key_buffer); - key_buffer_len = 0; - key_buffer = NULL; - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - key_buffer = tmp; - } - strncpy(key_buffer, coll_ml_config_yytext, key_buffer_len); - - /* The first thing we have to see is an "=" */ - val = coll_ml_config_yylex(); - if (coll_ml_config_parse_done || COLL_ML_CONFIG_PARSE_EQUAL != val) { - ML_ERROR(("Line %d, expected = before key: %s", - coll_ml_config_yynewlines, - key_buffer)); - return OMPI_ERROR; - } - - /* Next we get the value */ - val = coll_ml_config_yylex(); - if (COLL_ML_CONFIG_PARSE_SINGLE_WORD == val || - COLL_ML_CONFIG_PARSE_VALUE == val) { - value = strdup(coll_ml_config_yytext); - if (NULL == value) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* Now we need to see the newline */ - val = coll_ml_config_yylex(); - if (COLL_ML_CONFIG_PARSE_NEWLINE != val && - COLL_ML_CONFIG_PARSE_DONE != val) { - ML_ERROR(("Line %d, expected new line after %s", - coll_ml_config_yynewlines, - key_buffer)); - free(value); - return OMPI_ERROR; - } - } - - /* If we did not get EOL or EOF, something is wrong */ - else if (COLL_ML_CONFIG_PARSE_DONE != val && - COLL_ML_CONFIG_PARSE_NEWLINE != val) { - ML_ERROR(("Line %d, expected new line or end of line", - coll_ml_config_yynewlines)); - return OMPI_ERROR; - } else { - ML_ERROR(("Line %d malformed", coll_ml_config_yynewlines)); - return OMPI_ERROR; - } - - /* Line parsing is done, read the values */ - if (!strcasecmp(key_buffer, "algorithm")) { - ret = parse_algorithm_key(section, value); - } else if (!strcasecmp(key_buffer, "threshold")) { - ret = parse_threshold_key(section, value); - } else if (!strcasecmp(key_buffer, "hierarchy")) { - ret = parse_hierarchy_key(section, value); - } else if (!strcasecmp(key_buffer, "fragmentation")) { - ret = parse_fragmentation_key(section, value); - /* Failed to parse the key */ - } else { - ML_ERROR(("Line %d, unknown key %s", - coll_ml_config_yynewlines, key_buffer)); - } - - /* All done */ - free(value); - - return ret; -} - -/**************************************************************************/ - -/* - * Parse a single file - */ -static int parse_file(char *filename) -{ - int val; - int ret = OMPI_SUCCESS; - bool first_section = true, first_coll = true; - coll_config_t coll_config; - - memset (&coll_config, 0, sizeof (coll_config)); - reset_collective(&coll_config); - - /* Open the file */ -
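parse_line above accepts exactly one key = value assignment per input line: a key token, an equals sign, a value (or bare word), then end of line or end of file. A compact model of that acceptor, with invented token codes standing in for the lexer's COLL_ML_CONFIG_PARSE_* constants:

enum { T_EQUAL, T_VALUE, T_WORD, T_NEWLINE, T_DONE, T_OTHER };

/* next_tok stands in for coll_ml_config_yylex(); the key token itself is
 * assumed to have been consumed by the caller, as in parse_line. */
static int accept_kv_line(int (*next_tok)(void))
{
    int t;

    if (T_EQUAL != next_tok()) {
        return -1;                      /* '=' must follow the key */
    }
    t = next_tok();
    if (T_VALUE != t && T_WORD != t) {
        return -1;                      /* then a value token */
    }
    t = next_tok();
    if (T_NEWLINE != t && T_DONE != t) {
        return -1;                      /* then EOL or EOF */
    }
    return 0;                           /* well-formed key = value line */
}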
coll_ml_config_yyin = fopen(filename, "r"); - if (NULL == coll_ml_config_yyin) { - ML_ERROR(("Failed to open config file %s", filename)); - ret = OMPI_ERR_NOT_FOUND; - goto cleanup; - } - - /* Do the parsing */ - coll_ml_config_parse_done = false; - coll_ml_config_yynewlines = 1; - coll_ml_config_init_buffer(coll_ml_config_yyin); - while (!coll_ml_config_parse_done) { - val = coll_ml_config_yylex(); - switch (val) { - case COLL_ML_CONFIG_PARSE_DONE: - case COLL_ML_CONFIG_PARSE_NEWLINE: - break; - case COLL_ML_CONFIG_PARSE_COLLECTIVE: - /* dump all the information to last section that was defined */ - if (!first_coll) { - ret = save_settings(&coll_config); - - if (OMPI_SUCCESS != ret) { - ML_ERROR(("Error in syntax for collective %s", coll_config.coll_name)); - goto cleanup; - } - } - - /* reset collective config */ - reset_collective(&coll_config); - - first_coll = false; - first_section = true; - - ret = set_collective_name(&coll_config); - if (OMPI_SUCCESS != ret) { - goto cleanup; - } - break; - case COLL_ML_CONFIG_PARSE_SECTION: - if (ML_UNDEFINED == coll_config.coll_id) { - ML_ERROR(("Collective section wasn't defined !")); - ret = OMPI_ERROR; - goto cleanup; - } - - if (!first_section) { - /* dump all the information to last section that was defined */ - ret = save_settings(&coll_config); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("Error in syntax for collective %s section %s", coll_config.coll_name, - coll_config.section.section_name)); - goto cleanup; - } - } - - first_section = false; - - /* reset all section values */ - reset_section(&coll_config.section); - - /* set new section name */ - ret = set_section_name(&coll_config.section); - if (OMPI_SUCCESS != ret) { - goto cleanup; - } - break; - case COLL_ML_CONFIG_PARSE_SINGLE_WORD: - if (ML_UNDEFINED == coll_config.coll_id || - ML_UNDEFINED == coll_config.section.section_id) { - ML_ERROR(("Collective section or sub-section was not defined !")); - ret = OMPI_ERROR; - goto cleanup; - } else { - parse_line(&coll_config.section); - } - break; - - default: - /* anything else is an error */ - ML_ERROR(("Unexpected token!")); - ret = OMPI_ERROR; - goto cleanup; - break; - } - } - - save_settings(&coll_config); - fclose(coll_ml_config_yyin); - coll_ml_config_yylex_destroy (); - ret = OMPI_SUCCESS; - -cleanup: - reset_collective(&coll_config); - if (NULL != key_buffer) { - free(key_buffer); - key_buffer = NULL; - key_buffer_len = 0; - } - return ret; -} - -int mca_coll_ml_config_file_init(void) -{ - return parse_file(mca_coll_ml_component.config_file_name); -} - diff --git a/ompi/mca/coll/ml/coll_ml_config.h b/ompi/mca/coll/ml/coll_ml_config.h deleted file mode 100644 index 15ad7dff2a..0000000000 --- a/ompi/mca/coll/ml/coll_ml_config.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef COLL_ML_CONFIG_H_ -#define COLL_ML_CONFIG_H_ - -#include "opal_config.h" -#include - -BEGIN_C_DECLS - -#define ML_UNDEFINED -1 - -struct per_collective_configuration_t { - int topology_id; - int threshold; - int algorithm_id; - int fragmentation_enabled; -}; -typedef struct per_collective_configuration_t per_collective_configuration_t; - -void mca_coll_ml_reset_config(per_collective_configuration_t *config); -int mca_coll_ml_config_file_init(void); - -END_C_DECLS -#endif diff --git a/ompi/mca/coll/ml/coll_ml_copy_fns.c b/ompi/mca/coll/ml/coll_ml_copy_fns.c deleted file mode 100644 index a3d41b06c0..0000000000 --- a/ompi/mca/coll/ml/coll_ml_copy_fns.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" -#include "opal/sys/atomic.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" -#include "coll_ml_colls.h" -#include -#include - - - -/* This routine re-orders and packs user data. The assumption is that - * there is per-process data, the amount of data is the same for all - * ranks, and the user data is contigous. - */ -int mca_coll_ml_pack_reorder_contiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int i, rank; - void *user_buf, *library_buf; - size_t bytes_per_proc; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) - coll_op->coll_module; - mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info; - ptrdiff_t ptr_dif; - - /* get the offset into each processes data. The assumption is that - * we are manipulating the same amount of data for each process. - */ - - /* figure out how much data per-proc to copy */ - bytes_per_proc=coll_op->fragment_data.per_rank_fragment_size; - - /* loop over all the ranks in the communicator */ - for( i=0 ; i < ompi_comm_size(ml_module->comm) ; i++ ) { - - /* look up the rank of the i'th element in the sorted list */ - rank = topo_info->sort_list[i]; - - /* get the pointer to user data */ - user_buf=(void *)coll_op->full_message.src_user_addr; - /* compute offset into the user buffer */ - - /* offset for data already processed */ - ptr_dif=rank*coll_op->full_message.n_bytes_per_proc_total+ - coll_op->fragment_data.offset_into_user_buffer_per_proc; - user_buf=(void *) ((char *)user_buf+ptr_dif); - /* - rank*coll_op->full_message.n_bytes_per_proc_total+ - coll_op->fragment_data.offset_into_user_buffer_per_proc); - */ - - /* get the pointer to the ML buffer */ - library_buf= (void *) - ((char *)coll_op->variable_fn_params.src_desc->data_addr+i*bytes_per_proc); - - /* copy the data */ - memcpy(library_buf, user_buf, bytes_per_proc); - - } - - return OMPI_SUCCESS; -} - -/* This routine re-orders and packs user data. The assumption is that - * there is per-process data, the amount of data is the same for all - * ranks, and the user data is contigous. - */ -int mca_coll_ml_pack_reorder_noncontiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int i, rank; - void *user_buf, *library_buf; - size_t bytes_per_proc; - ptrdiff_t ptr_dif; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) - coll_op->coll_module; - mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info; - - /* get the offset into each processes data. The assumption is that - * we are manipulating the same amount of data for each process. 
- */ - - /* figure out how much data per-proc to copy */ - bytes_per_proc = coll_op->fragment_data.per_rank_fragment_size; - - /* loop over all the ranks in the communicator */ - for(i = 0; i < ompi_comm_size(ml_module->comm); i++ ) { - - /* look up the rank of the i'th element in the sorted list */ - rank = topo_info->sort_list[i]; - - /* get the pointer to user data */ - user_buf=(void *)coll_op->full_message.src_user_addr; - /* compute offset into the user buffer */ - - /* offset for data already processed */ - ptr_dif=rank*coll_op->full_message.send_count* - coll_op->full_message.send_extent+ - coll_op->fragment_data.offset_into_user_buffer_per_proc; - user_buf=(void *) ((char *)user_buf+ptr_dif); - - /* get the pointer to the ML buffer */ - library_buf= (void *) - ((char *)coll_op->variable_fn_params.src_desc->data_addr+i*bytes_per_proc); - - /* copy the data */ - memcpy(library_buf, user_buf, bytes_per_proc); - - } - - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/coll/ml/coll_ml_custom_utils.c b/ompi/mca/coll/ml/coll_ml_custom_utils.c deleted file mode 100644 index c00c4a5439..0000000000 --- a/ompi/mca/coll/ml/coll_ml_custom_utils.c +++ /dev/null @@ -1,139 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include -#include -#include - -#include "opal/util/output.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_object.h" -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/threads/mutex.h" -#include "opal/sys/atomic.h" - -#include "ompi/op/op.h" -#include "ompi/constants.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/coll/base/base.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/patterns/comm/coll_ops.h" - -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/communicator/communicator.h" - -#include "ompi/mca/bcol/base/base.h" -#include "coll_ml_custom_utils.h" - -/* - * Local types - */ - -struct avail_coll_t { - opal_list_item_t super; - int ac_priority; - mca_coll_base_module_2_1_0_t *ac_module; -}; -typedef struct avail_coll_t avail_coll_t; - -/* - * Stuff for the OBJ interface - * If topo_index == COLL_ML_TOPO_MAX it looks over all possilbe topologies, otherwhise it looks - * in the topology that was specified. 
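The one-topology-or-all choice described here comes down to picking half-open loop bounds from a sentinel value. A small stand-alone illustration, with TABLE_MAX as a placeholder for COLL_ML_TOPO_MAX:

/* Sketch: scan one slot, or every slot when the ALL sentinel is passed. */
#define TABLE_MAX 4 /* placeholder for COLL_ML_TOPO_MAX */

static int table_contains(const int *table, int index, int wanted)
{
    int tp, max_tp;

    if (TABLE_MAX == index) { /* sentinel: search all slots */
        tp = 0;
        max_tp = TABLE_MAX;
    } else {                  /* search only the requested slot */
        tp = index;
        max_tp = index + 1;
    }

    for (; tp < max_tp; tp++) {
        if (wanted == table[tp]) {
            return 1;
        }
    }

    return 0;
}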
- */ - -int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module, - int topo_index) -{ - int i, rc, hier, *ranks_in_comm, - is_used = 0, - comm_size = ompi_comm_size(ml_module->comm); - int n_hier, tp , max_tp; - const mca_coll_ml_topology_t *topo_info; - - ranks_in_comm = (int *) malloc(comm_size * sizeof(int)); - if (OPAL_UNLIKELY(NULL == ranks_in_comm)) { - ML_ERROR(("Memory allocation failed.")); - ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM); - /* not reached but causes a clang warning to not return here */ - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for (i = 0; i < comm_size; ++i) { - ranks_in_comm[i] = i; - } - - if (COLL_ML_TOPO_MAX == topo_index) { - tp = 0; - max_tp = COLL_ML_TOPO_MAX; - } else { - tp = topo_index; - max_tp = topo_index + 1; - } - - for (; tp < max_tp; tp++) { - topo_info = &ml_module->topo_list[tp]; - n_hier = topo_info->n_levels; - for (hier = 0; hier < n_hier; ++hier) { - hierarchy_pairs *pair = &topo_info->component_pairs[hier]; - mca_bcol_base_component_t *b_cm = pair->bcol_component; - if(0 == strcmp(bcol_name, - b_cm->bcol_version.mca_component_name)) { - is_used = 1; - break; - } - } - } - - rc = comm_allreduce_pml(&is_used, &is_used, 1, MPI_INT, - ompi_comm_rank(ml_module->comm), MPI_MAX, - comm_size, ranks_in_comm, ml_module->comm); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("comm_allreduce_pml failed.")); - ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_OP); - } - - free(ranks_in_comm); - - return is_used; -} - -/* The function is very different from the above function */ -int mca_coll_ml_check_if_bcol_is_requested(const char *component_name) -{ - mca_base_component_list_item_t *bcol_comp; - - ML_VERBOSE(10, ("Loop over bcol components")); - OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) { - if(0 == strcmp(component_name, - ((mca_bcol_base_component_2_0_0_t *) - bcol_comp->cli_component)->bcol_version.mca_component_name)) { - return true; - } - } - - /* the component was not resquested */ - return false; -} diff --git a/ompi/mca/coll/ml/coll_ml_custom_utils.h b/ompi/mca/coll/ml/coll_ml_custom_utils.h deleted file mode 100644 index 7d6a8feb00..0000000000 --- a/ompi/mca/coll/ml/coll_ml_custom_utils.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#ifndef MCA_COLL_ML_CUSTOM_UTILS_H -#define MCA_COLL_ML_CUSTOM_UTILS_H - -#include "ompi_config.h" - -#include "coll_ml.h" - -/* the function is used to check if the bcol name is used in this ml module */ -int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module, - int topo_index); - -/* The function is used to check if the bcol component was REQUESTED by user */ -int mca_coll_ml_check_if_bcol_is_requested(const char *component_name); - -END_C_DECLS - -#endif /* MCA_COLL_ML_ML_H */ diff --git a/ompi/mca/coll/ml/coll_ml_descriptors.c b/ompi/mca/coll/ml/coll_ml_descriptors.c deleted file mode 100644 index 4060c27ed7..0000000000 --- a/ompi/mca/coll/ml/coll_ml_descriptors.c +++ /dev/null @@ -1,60 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "ompi_config.h" -#include "coll_ml.h" -#include "coll_ml_inlines.h" - - -static inline void mca_coll_ml_fragment_constructor(mca_coll_ml_fragment_t *frag) -{ - frag->fn_args = NULL; -} - -static inline void mca_coll_ml_fragment_destructor(mca_coll_ml_fragment_t *frag) -{ - if (frag->fn_args) { - free(frag->fn_args); - frag->fn_args = NULL; - } -} - -static inline void mca_coll_ml_descriptor_constructor(mca_coll_ml_descriptor_t *descriptor) -{ - - OBJ_CONSTRUCT(&(descriptor->fragment),mca_coll_ml_fragment_t); - - /* this fragment is alway associated with this message descriptor */ - descriptor->fragment.full_msg_descriptor=descriptor; - -} - - -static inline void mca_coll_ml_descriptor_destructor(mca_coll_ml_descriptor_t *descriptor) -{ - OBJ_DESTRUCT(&(descriptor->fragment)); -} - -OBJ_CLASS_INSTANCE( - mca_coll_ml_fragment_t, - opal_list_item_t, - mca_coll_ml_fragment_constructor, - mca_coll_ml_fragment_destructor); - -OBJ_CLASS_INSTANCE( - mca_coll_ml_descriptor_t, - ompi_request_t, - mca_coll_ml_descriptor_constructor, - mca_coll_ml_descriptor_destructor); - diff --git a/ompi/mca/coll/ml/coll_ml_functions.h b/ompi/mca/coll/ml/coll_ml_functions.h deleted file mode 100644 index 5d0d0d7b1a..0000000000 --- a/ompi/mca/coll/ml/coll_ml_functions.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#ifndef MCA_COLL_ML_FUNCTIONS_H -#define MCA_COLL_ML_FUNCTIONS_H - -#include "ompi_config.h" - -BEGIN_C_DECLS - -#define ML_MEMSYNC -100 - -enum { - ML_BARRIER_DEFAULT -}; - /* small data algorithm */ -/* broadcast functions */ -enum { - /* small data algorithm */ - ML_BCAST_SMALL_DATA_KNOWN, - /* small data - dynamic decision making supported */ - ML_BCAST_SMALL_DATA_UNKNOWN, - /* Sequential algorithm */ - ML_BCAST_SMALL_DATA_SEQUENTIAL, - - ML_BCAST_LARGE_DATA_KNOWN, - - ML_BCAST_LARGE_DATA_UNKNOWN, - - ML_BCAST_LARGE_DATA_SEQUENTIAL, - - /* marker - all routines about this are expected to be used in - * selection logic that is based on size of the data */ - ML_N_DATASIZE_BINS, - - /* number of functions - also counts some markers, but ... */ - ML_NUM_BCAST_FUNCTIONS -}; - - -/* scatter functions */ -enum { - /* small data algorithm */ - ML_SCATTER_SMALL_DATA_KNOWN, - - /* marker - all routines about this are expected to be used in - * selection logic that is based on size of the data */ - ML_SCATTER_N_DATASIZE_BINS, - - /* small data - dynamic decision making supported */ - ML_SCATTER_SMALL_DATA_UNKNOWN, - - /* Sequential algorithm */ - ML_SCATTER_SMALL_DATA_SEQUENTIAL, - - /* number of functions - also counts some markers, but ... 
*/ - ML_NUM_SCATTER_FUNCTIONS -}; - - -/* Allreduce functions */ -enum { - /* small data algorithm */ - ML_SMALL_DATA_ALLREDUCE, - - /* Large data algorithm */ - ML_LARGE_DATA_ALLREDUCE, - - /* If some of bcols doesn't support - all possibles types, use these extra algthms */ - /* small data algorithm */ - ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE, - - /* large data algorithm */ - ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE, - - /* number of functions */ - ML_NUM_ALLREDUCE_FUNCTIONS -}; - -/* Reduce functions */ -enum { - /* small data algorithm */ - ML_SMALL_DATA_REDUCE, - - /* Large data algorithm */ - ML_LARGE_DATA_REDUCE, - - /* number of functions */ - ML_NUM_REDUCE_FUNCTIONS -}; -/* Alltoall functions */ -enum { - /* small data algorithm */ - ML_SMALL_DATA_ALLTOALL, - /* large all to all */ - ML_LARGE_DATA_ALLTOALL, - /* number of functions */ - ML_NUM_ALLTOALL_FUNCTIONS -}; - -/* Allgather functions */ -enum { - /* small data */ - ML_SMALL_DATA_ALLGATHER, - /* large data */ - ML_LARGE_DATA_ALLGATHER, - /* number of functions */ - ML_NUM_ALLGATHER_FUNCTIONS -}; - -/* gather functions */ -enum { - /* small data */ - ML_SMALL_DATA_GATHER, - /* large data */ - ML_LARGE_DATA_GATHER, - /* number of functions */ - ML_NUM_GATHER_FUNCTIONS -}; - -END_C_DECLS - -#endif /* MCA_COLL_ML_FUNCTIONS_H */ diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c deleted file mode 100644 index f50d040f61..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithm_memsync_setup.c +++ /dev/null @@ -1,195 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" - -static int mca_coll_ml_build_memsync_schedule( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc) -{ - int i_hier, rc, i_fn, n_fcns, i, - n_hiers = topo_info->n_levels; - - bool call_for_top_func; - mca_bcol_base_module_t *bcol_module; - - mca_coll_ml_compound_functions_t *comp_fn; - mca_coll_ml_collective_operation_description_t *schedule; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - - schedule = *coll_desc; - if (OPAL_UNLIKELY(NULL == schedule)) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - if (topo_info->global_highest_hier_group_index == - topo_info->component_pairs[n_hiers - 1].bcol_index) { - /* The process that is member of highest level subgroup - should call for top algorithms in addition to fan-in/out steps */ - call_for_top_func = true; - n_fcns = 2 * n_hiers - 1; /* Up + Top + Down */ - } else { - /* The process is not member of highest level subgroup, - as result it does not call for top algorithm, - but it calls for all fan-in/out steps */ - call_for_top_func = false; - n_fcns = 2 * n_hiers; - } - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_fcns; - schedule->topo_info = topo_info; - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); - - if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { - ML_ERROR(("Can't allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Barrier_Setup_Error; - } - - for (i_fn = 0; i_fn < n_fcns; ++i_fn) { - i_hier = (i_fn < n_hiers ? i_fn : n_fcns - i_fn - 1); - comp_fn = &schedule->component_functions[i_fn]; - - /* The hierarchial level */ - comp_fn->h_level = i_hier; - bcol_module = GET_BCOL(topo_info, i_hier); - - /* The UP direction */ - if (1 + i_fn < n_hiers || (1 + i_fn == n_hiers && !call_for_top_func)) { - /* Pasha: We do not have memory syncronization FANIN function, instead I use barrier. 
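A worked example may help with the up/top/down sizing at the top of this routine: with n_hiers = 3 and call_for_top_func true, n_fcns = 2*3 - 1 = 5, i.e. fan-in at levels 0 and 1, one top-level step at level 2, then fan-out at levels 1 and 0. A sketch of just the sizing and level mapping, assuming nothing beyond those two inputs:

#include <stdbool.h>
#include <stdio.h>

/* Sketch: schedule length and per-step hierarchy level, mirroring
 * n_fcns and the i_hier = (i_fn < n_hiers ? i_fn : n_fcns - i_fn - 1)
 * mapping used in the loop below. */
static void print_schedule_shape(int n_hiers, bool call_for_top_func)
{
    /* up steps + optional top step + down steps */
    int n_fcns = call_for_top_func ? 2 * n_hiers - 1 : 2 * n_hiers;

    for (int i_fn = 0; i_fn < n_fcns; i_fn++) {
        int i_hier = (i_fn < n_hiers) ? i_fn : n_fcns - i_fn - 1;
        printf("step %d runs at hierarchy level %d\n", i_fn, i_hier);
    }
}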
- * In future we have to replace it with memsync fan-in function - * comp_fn->bcol_function = - * bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANIN][1][0][0]; - */ - comp_fn->bcol_function = GET_BCOL_SYNC_FN(bcol_module); - - - assert(NULL != comp_fn->bcol_function); - - /* Each function call with index K is depended of all K-1 previous indices - - in simple words we will do sequential Fan-In calls */ - comp_fn->num_dependencies = i_fn; - comp_fn->num_dependent_tasks = n_fcns - i_fn - 1; - - /* Init component function */ - strcpy(comp_fn->fn_name, "MEMSYNC-FANIN"); - - /* On the highest level */ - } else if ((1 + i_fn == n_hiers && call_for_top_func)) { - comp_fn->bcol_function = GET_BCOL_SYNC_FN(bcol_module); - - /* Each function call with index K is depended of all K-1 previous indices - - in simple words we do sequential calls */ - comp_fn->num_dependencies = n_hiers - 1; /* All Fan-Ins */ - comp_fn->num_dependent_tasks = n_fcns - n_hiers; /* All Fan-Outs */ - - /* Init component function */ - strcpy(comp_fn->fn_name, "MEMSYNC-BARRIER"); - - assert(NULL != comp_fn->bcol_function); - ML_VERBOSE(10, ("func indx %d set to BARRIER %p", i_fn, comp_fn->bcol_function)); - - /* The DOWN direction */ - } else { - /* Pasha: We do not have memory syncronization FANOUT function, instead I use barrier. - * In future we have to replace it with memsync fan-out function - * comp_fn->bcol_function = - * bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANOUT][1][0][0]; - */ - comp_fn->bcol_function = GET_BCOL_SYNC_FN(bcol_module); - - /* Each function call with index K is depended of all UP and TOP algths */ - comp_fn->num_dependencies = n_hiers; - comp_fn->num_dependent_tasks = 0; - - /* Init component function */ - strcpy(comp_fn->fn_name, "MEMSYNC-FANOUT"); - } - - assert(NULL != comp_fn->bcol_function); - ML_VERBOSE(10, ("func indx %d set to %p", i_fn, comp_fn->bcol_function)); - - if (comp_fn->num_dependent_tasks > 0) { - comp_fn->dependent_task_indices = (int *) calloc(comp_fn->num_dependent_tasks, sizeof(int)); - if (OPAL_UNLIKELY(NULL == comp_fn->dependent_task_indices)) { - ML_ERROR(("Can't allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Barrier_Setup_Error; - } - - /* All indexes follow after this one */ - for (i = 0; i < comp_fn->num_dependent_tasks; ++i) { - comp_fn->dependent_task_indices[i] = i_fn + i + 1; - } - } - - /* No need completion func for Barrier */ - comp_fn->task_comp_fn = NULL; - - ML_VERBOSE(10, ("Setting collective [Barrier] fn_idx %d, n_of_this_type_in_a_row %d, " - "index_in_consecutive_same_bcol_calls %d.", - i_fn, comp_fn->constant_group_data.n_of_this_type_in_a_row, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); - } - - rc = ml_coll_barrier_constant_group_data_setup(topo_info, schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("Failed to init const group data.")); - goto Barrier_Setup_Error; - } - - schedule->progress_type = 0; - - return OMPI_SUCCESS; - -Barrier_Setup_Error: - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - schedule->component_functions = NULL; - } - - free (schedule); - *coll_desc = NULL; - - return rc; -} - -int ml_coll_memsync_setup(mca_coll_ml_module_t *ml_module) -{ - int ret; - /* For barrier syncronization we use barrier topology */ - mca_coll_ml_topology_t *topo_info = - &ml_module->topo_list[ml_module->collectives_topology_map[ML_BARRIER][ML_SMALL_MSG]]; - - ret = mca_coll_ml_build_memsync_schedule(topo_info, - 
&ml_module->coll_ml_memsync_function); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static bcast")); - return ret; - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms.c deleted file mode 100644 index 179557dafb..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms.c +++ /dev/null @@ -1,188 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" - -/* collective managment descriptor initialization - called right after - * the constructor by opal_free_list code - */ -static int mca_coll_ml_collective_operation_progress_init - (opal_free_list_item_t* item, void* ctx) -{ - int i; - int max_dag_size = ((struct coll_desc_init *)ctx)->max_dag_size; - size_t max_n_bytes_per_proc_total = - ((struct coll_desc_init *)ctx)->max_n_bytes_per_proc_total; - mca_coll_ml_collective_operation_progress_t *coll_op = - (mca_coll_ml_collective_operation_progress_t *) item; - - coll_op->dag_description.status_array = - (mca_coll_ml_task_status_t *) - calloc(max_dag_size, sizeof(mca_coll_ml_task_status_t)); - assert(coll_op->dag_description.status_array); - - /* initialize individual elements */ - for (i = 0; i < max_dag_size; i++ ) { - /* Pasha: We assume here index syncronization between - task indexes and indexes in component_function array - (mca_coll_ml_collective_operation_description) - */ - coll_op->dag_description.status_array[i]. - my_index_in_coll_schedule = i; - coll_op->dag_description.status_array[i]. 
- ml_coll_operation = coll_op; - - OBJ_CONSTRUCT(&coll_op->dag_description.status_array[i].item, opal_list_item_t); - } - - /* set the size per proc of the ML buffer */ - coll_op->full_message.max_n_bytes_per_proc_total= - max_n_bytes_per_proc_total; - - /* set the pointer to the bcol module */ - coll_op->coll_module = - ((struct coll_desc_init *)ctx)->bcol_base_module; - - return OPAL_SUCCESS; -} - -int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module) -{ - /* local variables */ - int ret = OMPI_SUCCESS, comm_size; - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - size_t ml_per_proc_buffer_size; - - /* Barrier */ - ret = ml_coll_hier_barrier_setup(ml_module); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* Broadcast */ - ret = ml_coll_hier_bcast_setup(ml_module); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* Allreduce */ - if (!mca_coll_ml_component.use_knomial_allreduce) { - ret = ml_coll_hier_allreduce_setup(ml_module); - } else { - ret = ml_coll_hier_allreduce_setup_new(ml_module); - } - - if( OMPI_SUCCESS != ret ) { - return ret; - } - - - /* Alltoall */ - /* - ret = ml_coll_hier_alltoall_setup_new(ml_module); - - if( OMPI_SUCCESS != ret ) { - return ret; - } - */ - - /* Allgather */ - ret = ml_coll_hier_allgather_setup(ml_module); - - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* Gather */ - /* - ret = ml_coll_hier_gather_setup(ml_module); - - if( OMPI_SUCCESS != ret ) { - return ret; - } - */ - - /* Reduce */ - ret = ml_coll_hier_reduce_setup(ml_module); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* Scatter */ - /* - ret = ml_coll_hier_scatter_setup(ml_module); - if( OMPI_SUCCESS != ret ) { - return ret; - } - */ - - ret = ml_coll_memsync_setup(ml_module); - if( OMPI_SUCCESS != ret ) { - return ret; - } - - /* nonblocking Reduce */ - - /* Alltoall */ - - /* nonblocking alltoall */ - - /* max_dag_size will be set here, so initialize it */ - - /* Pasha: Do we have to keep the max_dag_size ? - In most generic case, it will be equal to max_fn_calls */ - ml_module->max_dag_size = ml_module->max_fn_calls; - - assert(ml_module->max_dag_size > 0); - - /* initialize the mca_coll_ml_collective_operation_progress_t free list */ - /* NOTE: as part of initialization each routine needs to make sure that - * the module element max_dag_size is set large enough - space for - * tracking collective progress is allocated based on this value. 
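The free-list idiom used just below — allocate a pool of fixed-size items and run a one-time initializer, with a context pointer, on each — can be shown without any OPAL types. In this sketch init_fn and ctx play the roles of mca_coll_ml_collective_operation_progress_init and coll_desc_init_data; it illustrates the pattern, not the opal_free_list_init API:

#include <stdlib.h>

typedef int (*item_init_fn_t)(void *item, void *ctx);

/* Sketch: pool allocation with a per-item init callback. */
static void *alloc_initialized_pool(size_t n_items, size_t item_size,
                                    item_init_fn_t init_fn, void *ctx)
{
    char *pool = calloc(n_items, item_size);
    if (NULL == pool) {
        return NULL;
    }

    for (size_t i = 0; i < n_items; i++) {
        if (0 != init_fn(pool + i * item_size, ctx)) {
            free(pool);  /* an initializer failed: release the whole pool */
            return NULL;
        }
    }

    return pool;
}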
*/ - - /* figure out what the size of the ml buffer is */ - ml_per_proc_buffer_size=ml_module->payload_block->size_buffer; - comm_size=ompi_comm_size(ml_module->comm); - ml_per_proc_buffer_size/=comm_size; - ml_module->coll_desc_init_data.max_dag_size=ml_module->max_dag_size; - ml_module->coll_desc_init_data.max_n_bytes_per_proc_total=ml_per_proc_buffer_size; - ml_module->coll_desc_init_data.bcol_base_module=(mca_coll_base_module_t *) - ml_module; - - ret = opal_free_list_init ( - &(ml_module->coll_ml_collective_descriptors), - sizeof(mca_coll_ml_collective_operation_progress_t), - /* no special alignment needed */ - 8, - OBJ_CLASS(mca_coll_ml_collective_operation_progress_t), - /* no payload data */ - 0, 0, - /* NOTE: hack - need to parametrize this */ - cm->free_list_init_size, - cm->free_list_max_size, - cm->free_list_grow_size, - /* No Mpool */ - NULL, 0, NULL, - mca_coll_ml_collective_operation_progress_init, - (void *)&(ml_module->coll_desc_init_data) - ); - if (OMPI_SUCCESS != ret) { - return ret; - } - - /* done */ - return ret; -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c deleted file mode 100644 index cd964d41dd..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c +++ /dev/null @@ -1,240 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" -#include "ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h" -#include "ompi/patterns/net/netpatterns_knomial_tree.h" - -#define SMALL_MSG_RANGE 1 -#define LARGE_MSG_RANGE 5 - -static int mca_coll_ml_build_allgather_schedule(mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) -{ - int ret; /* exit code in case of error */ - int nfn = 0; - int i; - int *scratch_indx = NULL, - *scratch_num = NULL; - - mca_coll_ml_collective_operation_description_t *schedule = NULL; - mca_coll_ml_compound_functions_t *comp_fn; - mca_coll_ml_schedule_hier_info_t h_info; - - ML_VERBOSE(9, ("Setting hierarchy, inputs : n_levels %d, hiest %d ", - topo_info->n_levels, topo_info->global_highest_hier_group_index)); - MCA_COLL_ML_INIT_HIER_INFO(h_info, topo_info->n_levels, - topo_info->global_highest_hier_group_index, topo_info); - - ret = mca_coll_ml_schedule_init_scratch(topo_info, &h_info, - &scratch_indx, &scratch_num); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("Can't mca_coll_ml_schedule_init_scratch.")); - goto Error; - } - assert(NULL != scratch_indx); - assert(NULL != scratch_num); - - schedule = *coll_desc = - mca_coll_ml_schedule_alloc(&h_info); - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - /* Setting topology information */ - schedule->topo_info = topo_info; - - /* Set dependencies equal to number of hierarchies */ - for (i = 0; i < h_info.num_up_levels; i++) { - int 
query_conf[MCA_COLL_ML_QUERY_SIZE]; - MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, BLOCKING, BCOL_GATHER, bcol_func_index, 0, 0); - comp_fn = &schedule->component_functions[i]; - MCA_COLL_ML_SET_COMP_FN(comp_fn, i, topo_info, - i, scratch_indx, scratch_num, query_conf, "GATHER_DATA"); - } - - nfn = i; - if (h_info.call_for_top_function) { - int query_conf[MCA_COLL_ML_QUERY_SIZE]; - MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, NON_BLOCKING, BCOL_ALLGATHER, bcol_func_index, 0, 0); - comp_fn = &schedule->component_functions[nfn]; - MCA_COLL_ML_SET_COMP_FN(comp_fn, nfn, topo_info, - nfn, scratch_indx, scratch_num, query_conf, "ALLGATHER_DATA"); - ++nfn; - } - - /* coming down the hierarchy */ - for (i = h_info.num_up_levels - 1; i >= 0; i--, nfn++) { - int query_conf[MCA_COLL_ML_QUERY_SIZE]; - MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, NON_BLOCKING, BCOL_BCAST, bcol_func_index, 0, 0); - comp_fn = &schedule->component_functions[nfn]; - MCA_COLL_ML_SET_COMP_FN(comp_fn, i, topo_info, - nfn, scratch_indx, scratch_num, query_conf, "BCAST_DATA"); - } - - /* Fill the rest of constant data */ - mca_coll_ml_call_types(&h_info, schedule); - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - - Error: - if (NULL != scratch_indx) { - free(scratch_indx); - } - if (NULL != scratch_num) { - free(scratch_num); - } - - return ret; -} - -int ml_coll_hier_allgather_setup(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int ret, topo_index, alg; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - ML_VERBOSE(10,("entering allgather setup")); - -#if 0 - /* used to validate the recursive k - ing allgather tree */ - { - /* debug print */ - int ii, jj; - netpatterns_k_exchange_node_t exchange_node; - - ret = netpatterns_setup_recursive_knomial_allgather_tree_node(8, 3, 3, &exchange_node); - fprintf(stderr,"log tree order %d tree_order %d\n", exchange_node.log_tree_order,exchange_node.tree_order); - if( EXCHANGE_NODE == exchange_node.node_type){ - if( exchange_node.n_extra_sources > 0){ - fprintf(stderr,"Receiving data from extra rank %d\n",exchange_node.rank_extra_sources_array[0]); - } - for( ii = 0; ii < exchange_node.log_tree_order; ii++){ - for( jj = 0; jj < (exchange_node.tree_order-1); jj++) { - if( exchange_node.rank_exchanges[ii][jj] >= 0){ - fprintf(stderr,"level %d I send %d bytes to %d from offset %d \n",ii+1, - exchange_node.payload_info[ii][jj].s_len, - exchange_node.rank_exchanges[ii][jj], - exchange_node.payload_info[ii][jj].s_offset); - fprintf(stderr,"level %d I receive %d bytes from %d at offset %d\n",ii+1, - exchange_node.payload_info[ii][jj].r_len, - exchange_node.rank_exchanges[ii][jj], - exchange_node.payload_info[ii][jj].r_offset); - } - } - } - fprintf(stderr,"exchange_node.n_extra_sources %d\n",exchange_node.n_extra_sources); - fprintf(stderr,"exchange_node.myid_reindex %d\n",exchange_node.reindex_myid); - if( exchange_node.n_extra_sources > 0){ - fprintf(stderr,"Sending back data to extra rank %d\n",exchange_node.rank_extra_sources_array[0]); - } - } else { - fprintf(stderr,"I am an extra and send to proxy %d\n", - exchange_node.rank_extra_sources_array[0]); - } - } -#endif - - alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - 
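/* With no valid (algorithm, topology) pair there is nothing to build:
 * the slot is cleared below so cleanup code will not walk an
 * uninitialized schedule, and the setup fails as a whole. */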
topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allgather_schedule(&ml_module->topo_list[topo_index], - &ml_module->coll_ml_allgather_functions[alg], - SMALL_MSG_RANGE); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static alltoall")); - return ret; - } - - alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_LARGE_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allgather_schedule(&ml_module->topo_list[topo_index], - &ml_module->coll_ml_allgather_functions[alg], - LARGE_MSG_RANGE); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static alltoall")); - return ret; - } - - return OMPI_SUCCESS; -} - -void ml_coll_hier_allgather_cleanup(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int topo_index, alg; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; - return; - } - - if (NULL == ml_module->coll_ml_allgather_functions[alg]) { - return; - } - - if (ml_module->coll_ml_allgather_functions[alg]->component_functions) { - free(ml_module->coll_ml_allgather_functions[alg]->component_functions); - ml_module->coll_ml_allgather_functions[alg]->component_functions = NULL; - } - - if (ml_module->coll_ml_allgather_functions[alg]) { - free(ml_module->coll_ml_allgather_functions[alg]); - ml_module->coll_ml_allgather_functions[alg] = NULL; - } - - alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_LARGE_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; - return; - } - - if (ml_module->coll_ml_allgather_functions[alg]->component_functions) { - free(ml_module->coll_ml_allgather_functions[alg]->component_functions); - ml_module->coll_ml_allgather_functions[alg]->component_functions = NULL; - } - - if (ml_module->coll_ml_allgather_functions[alg]) { - free(ml_module->coll_ml_allgather_functions[alg]); - ml_module->coll_ml_allgather_functions[alg] = NULL; - } -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c deleted file mode 100644 index a371d51b7a..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c +++ /dev/null @@ -1,434 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" - -#define ALLREDUCE_SMALL 1 -#define ALLREDUCE_LARGE 5 -#define SMALL_MSG_RANGE 1 -#define LARGE_MSG_RANGE 5 - -static int mca_coll_ml_build_allreduce_schedule( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) -{ - - bool call_for_top_function, prev_is_zero; - int n_hiers = topo_info->n_levels; - int i_hier, j_hier; - int cnt, value_to_set = 0; - int ret; /* exit code in case of error */ - int nfn=0; - int *scratch_indx = NULL, - *scratch_num = NULL; - int global_high_hierarchy_index = - topo_info->global_highest_hier_group_index; - - mca_coll_ml_collective_operation_description_t *schedule; - mca_coll_ml_compound_functions_t *comp_fn; - mca_bcol_base_module_t *prev_bcol, - *bcol_module; - int num_up_levels,nbcol_functions,i; - - if (global_high_hierarchy_index == - topo_info->component_pairs[n_hiers - 1].bcol_index) { - /* The process that is member of highest level subgroup - should call for top algorithms in addition to fan-in/out steps*/ - call_for_top_function = true; - /* hier level run only top algorithm, so we deduct 1 */ - num_up_levels = n_hiers - 1; - /* Top algorithm is called only once, so we deduct 1 */ - nbcol_functions = 2 * n_hiers - 1; - } else { - /* The process is not member of highest level subgroup, - as result it does not call for top algorithm, - but it calls for all fan-in/out steps */ - call_for_top_function = false; - num_up_levels = n_hiers; - nbcol_functions = 2 * n_hiers; - } - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - schedule = *coll_desc; - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - scratch_indx = (int *) calloc(n_hiers * 2, sizeof (int)); - if (NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Allreduce_Setup_Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (n_hiers * 2)); - if (NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Allreduce_Setup_Error; - } - - prev_bcol = NULL; - - for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* top - only if the proc arrive to highest_level_is_global_highest_level */ - if (call_for_top_function) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - prev_bcol = GET_BCOL(topo_info, n_hiers - 1); - } - - ++cnt; - } - - /* going down */ - for (i = num_up_levels - 1; i >= 0; --i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - prev_bcol = GET_BCOL(topo_info, i); - } - } - - i = cnt - 1; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i]) { - prev_is_zero = true; - } - - scratch_num[i] = value_to_set; - --i; - } while(i >= 0); - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns 
= nbcol_functions; - schedule->topo_info = topo_info; - schedule->progress_type = 0; - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(nbcol_functions, sizeof(struct mca_coll_ml_compound_functions_t)); - - if (NULL == schedule->component_functions) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Allreduce_Setup_Error; - } - - for (i = 0; i < num_up_levels; i++) { - comp_fn = &schedule->component_functions[i]; - comp_fn->h_level = i; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, i); - - /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ - - comp_fn->num_dependent_tasks = 0; - comp_fn->num_dependencies = 0; - - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][bcol_func_index][0][0]; - if (NULL == comp_fn->bcol_function) { - /* if there isn't a bcol function for this then we can't continue */ - ret = OMPI_ERR_NOT_SUPPORTED; - goto Allreduce_Setup_Error; - } - - comp_fn->task_comp_fn = NULL; - - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - } - - nfn = i; - if (call_for_top_function) { - comp_fn = &schedule->component_functions[nfn]; - comp_fn->h_level = nfn; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, nfn); - - assert (NULL != bcol_module); - - /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ - - /* The allreduce should depend on the reduce */ - comp_fn->num_dependent_tasks = 0; - comp_fn->num_dependencies = 0; - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_ALLREDUCE][bcol_func_index][0][0]; - if (NULL == comp_fn->bcol_function) { - /* if there isn't a bcol function for this then we can't continue */ - ret = OMPI_ERR_NOT_SUPPORTED; - goto Allreduce_Setup_Error; - } - - comp_fn->task_comp_fn = NULL; - - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[nfn]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[nfn]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ++nfn; - } - - for (i = num_up_levels - 1; i >= 0; i--) { - comp_fn = &schedule->component_functions[nfn]; - comp_fn->h_level = i; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, i); - - assert (NULL != bcol_module); - - /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ - - comp_fn->num_dependent_tasks = 0; - comp_fn->num_dependencies = 0; - - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BCAST][bcol_func_index][0][0]; - if (NULL == comp_fn->bcol_function) { - /* if there isn't a bcol function for this then we can't continue */ - ret = OMPI_ERR_NOT_SUPPORTED; - goto Allreduce_Setup_Error; - } - - comp_fn->task_comp_fn = NULL; - - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[nfn]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[nfn]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - 
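/* These two collective-wide counters are placeholders at this point;
 * the pass right after this loop walks the finished schedule and fills
 * in how many times each bcol module appears
 * (n_of_this_type_in_collective) and at which position
 * (index_of_this_type_in_collective). */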
comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ++nfn; - } - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - schedule->component_functions[i_hier]. - constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - cnt++; - } - } - - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Allreduce_Setup_Error: - - if (NULL != scratch_indx) { - free(scratch_indx); - } - - if (NULL != scratch_num) { - free(scratch_num); - } - - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - } - *coll_desc = NULL; - free (schedule); - - return ret; -} - -int ml_coll_hier_allreduce_setup_new(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int ret; - int topo_index; - int alg; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allreduce_schedule( - &ml_module->topo_list[topo_index], - &ml_module->coll_ml_allreduce_functions[alg], - SMALL_MSG_RANGE); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup Small Message Allreduce")); - return ret; - } - - alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_LARGE_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allreduce_schedule( - &ml_module->topo_list[topo_index], - &ml_module->coll_ml_allreduce_functions[alg], - LARGE_MSG_RANGE); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup Large Message Allreduce")); - return ret; - } - - if (true == mca_coll_ml_component.need_allreduce_support) { - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE]; - if (ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allreduce_schedule( - &ml_module->topo_list[topo_index], - &ml_module->coll_ml_allreduce_functions[ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE], - SMALL_MSG_RANGE); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup Extra Small Message Allreduce")); - return ret; - } - - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE]; - if (ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_allreduce_schedule( - 
&ml_module->topo_list[topo_index], - &ml_module->coll_ml_allreduce_functions[ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE], - LARGE_MSG_RANGE); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup Extra Large Message Allreduce")); - return ret; - } - } - - return OMPI_SUCCESS; -} - -void ml_coll_hier_allreduce_cleanup_new(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int topo_index; - int alg; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return; - } - - if (NULL == ml_module->coll_ml_allreduce_functions[alg]) { - return; - } - - free(ml_module->coll_ml_allreduce_functions[alg]->component_functions); - ml_module->coll_ml_allreduce_functions[alg]->component_functions = NULL; - free(ml_module->coll_ml_allreduce_functions[alg]); - ml_module->coll_ml_allreduce_functions[alg] = NULL; - - alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_LARGE_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return; - } - - free(ml_module->coll_ml_allreduce_functions[alg]->component_functions); - ml_module->coll_ml_allreduce_functions[alg]->component_functions = NULL; - free(ml_module->coll_ml_allreduce_functions[alg]); - ml_module->coll_ml_allreduce_functions[alg] = NULL; - - if (true == mca_coll_ml_component.need_allreduce_support) { - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE]; - if (ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return; - } - - alg = ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE; - free(ml_module->coll_ml_allreduce_functions[alg]->component_functions); - ml_module->coll_ml_allreduce_functions[alg]->component_functions = NULL; - free(ml_module->coll_ml_allreduce_functions[alg]); - ml_module->coll_ml_allreduce_functions[alg] = NULL; - - topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE]; - if (ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index was defined")); - topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; - return; - } - - alg = ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE; - free(ml_module->coll_ml_allreduce_functions[alg]->component_functions); - ml_module->coll_ml_allreduce_functions[alg]->component_functions = NULL; - free(ml_module->coll_ml_allreduce_functions[alg]); - ml_module->coll_ml_allreduce_functions[alg] = NULL; - } -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c deleted file mode 100644 index 2b4a0c2a9f..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c +++ /dev/null @@ -1,206 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. 
All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" - -static int mca_coll_ml_build_barrier_schedule( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t - **coll_desc, - mca_coll_ml_module_t *ml_module) -{ - int i_hier, rc, i_fn, n_fcns, i, - n_hiers = topo_info->n_levels; - - bool call_for_top_func; - mca_bcol_base_module_t *bcol_module; - - mca_coll_ml_compound_functions_t *comp_fn; - mca_coll_ml_collective_operation_description_t *schedule; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - malloc(sizeof(mca_coll_ml_collective_operation_description_t)); - - schedule = *coll_desc; - if (OPAL_UNLIKELY(NULL == schedule)) { - ML_ERROR(("Can't allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Barrier_Setup_Error; - } - - if (topo_info->global_highest_hier_group_index == - topo_info->component_pairs[n_hiers - 1].bcol_index) { - /* The process that is member of highest level subgroup - should call for top algorithms in addition to fan-in/out steps */ - call_for_top_func = true; - n_fcns = 2 * n_hiers - 1; /* Up + Top + Down */ - } else { - /* The process is not member of highest level subgroup, - as result it does not call for top algorithm, - but it calls for all fan-in/out steps */ - call_for_top_func = false; - n_fcns = 2 * n_hiers; - } - - if( ml_module->max_fn_calls < n_fcns ) { - ml_module->max_fn_calls = n_fcns; - } - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_fcns; - schedule->topo_info = topo_info; - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); - - if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { - ML_ERROR(("Can't allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Barrier_Setup_Error; - } - for (i_fn = 0; i_fn < n_fcns; ++i_fn) { - i_hier = (i_fn < n_hiers ? i_fn : n_fcns - i_fn - 1); - comp_fn = &schedule->component_functions[i_fn]; - - /* The hierarchial level */ - comp_fn->h_level = i_hier; - bcol_module = GET_BCOL(topo_info, i_hier); - - /* The UP direction */ - if (1 + i_fn < n_hiers || (1 + i_fn == n_hiers && !call_for_top_func)) { - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANIN][1][0][0]; - - if (NULL == comp_fn->bcol_function) { - ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN")); - rc = OMPI_ERR_NOT_AVAILABLE; - goto Barrier_Setup_Error; - } - - /* Each function call with index K is depended of all K-1 previous indices - - in simple words we will do sequential Fan-In calls */ - comp_fn->num_dependencies = (0 == i_fn) ? 
0 : 1; - comp_fn->num_dependent_tasks = 1; - /* Init component function */ - strcpy(comp_fn->fn_name, "FANIN"); - /* On the highest level */ - } else if ((1 + i_fn == n_hiers && call_for_top_func)) { - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BARRIER][1][0][0]; - - if (NULL == comp_fn->bcol_function) { - ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN")); - rc = OMPI_ERR_NOT_AVAILABLE; - goto Barrier_Setup_Error; - } - - /* Each function call with index K is depended of all K-1 previous indices - - in simple words we do sequential calls */ - comp_fn->num_dependencies = (1 == n_hiers) ? 0 : 1; /* All Fan-Ins */ - comp_fn->num_dependent_tasks = n_fcns - n_hiers; /* All Fan-Outs */ - - /* Init component function */ - strcpy(comp_fn->fn_name, "BARRIER"); - - ML_VERBOSE(10, ("func indx %d set to BARRIER %p", i_fn, comp_fn->bcol_function)); - - /* The DOWN direction */ - } else { - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANOUT][1][0][0]; - - if (NULL == comp_fn->bcol_function) { - ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN")); - rc = OMPI_ERR_NOT_AVAILABLE; - goto Barrier_Setup_Error; - } - - /* Each function call with index K is depended of all UP and TOP algths */ - comp_fn->num_dependencies = 1; - comp_fn->num_dependent_tasks = call_for_top_func ? 0 : - (i_fn + 1 == n_fcns ? 0 : 1); - - /* Init component function */ - strcpy(comp_fn->fn_name, "FANOUT"); - } - - ML_VERBOSE(10, ("func indx %d set to %p", i_fn, comp_fn->bcol_function)); - - if (comp_fn->num_dependent_tasks > 0) { - comp_fn->dependent_task_indices = (int *) calloc(comp_fn->num_dependent_tasks, sizeof(int)); - if (OPAL_UNLIKELY(NULL == comp_fn->dependent_task_indices)) { - ML_ERROR(("Can't allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto Barrier_Setup_Error; - } - - /* All indexes follow after this one */ - for (i = 0; i < comp_fn->num_dependent_tasks; ++i) { - comp_fn->dependent_task_indices[i] = i_fn + i + 1; - } - } else { - comp_fn->dependent_task_indices = NULL; - } - - - /* No need completion func for Barrier */ - comp_fn->task_comp_fn = NULL; - - ML_VERBOSE(10, ("Setting collective [Barrier] fn_idx %d, n_of_this_type_in_a_row %d, " - "index_in_consecutive_same_bcol_calls %d.", - i_fn, comp_fn->constant_group_data.n_of_this_type_in_a_row, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); - } - - rc = ml_coll_barrier_constant_group_data_setup(topo_info, schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("Failed to init const group data.")); - goto Barrier_Setup_Error; - } - - schedule->progress_type = 0; - - return OMPI_SUCCESS; - -Barrier_Setup_Error: - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - schedule->component_functions = NULL; - } - - return rc; -} - -int ml_coll_hier_barrier_setup(mca_coll_ml_module_t *ml_module) -{ - int rc; - mca_coll_ml_topology_t *topo_info = - &ml_module->topo_list[ml_module->collectives_topology_map[ML_BARRIER][ML_SMALL_MSG]]; - - rc = mca_coll_ml_build_barrier_schedule(topo_info, - &ml_module->coll_ml_barrier_function, ml_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - /* Make sure to reset the barrier pointer to NULL */ - topo_info->hierarchical_algorithms[BCOL_BARRIER] = NULL; - - return rc; - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c 
b/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c deleted file mode 100644 index 314a6f4655..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c +++ /dev/null @@ -1,851 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" - -static int mca_coll_ml_task_comp_dynamic_root_small_message - (struct mca_coll_ml_task_status_t *task) { - - task->ml_coll_operation->variable_fn_params.root_flag = true; - - return OMPI_SUCCESS; -} - - -int mca_coll_ml_setup_scratch_vals(mca_coll_ml_compound_functions_t *func_list, - int *scratch_indx, int *scratch_num, int n_hiers) -{ - int i_hier, j_hier; - int cnt, value_to_set = 0; - bool prev_is_zero; - mca_coll_ml_compound_functions_t *comp_fn; - mca_bcol_base_module_t *prev_bcol = NULL, - *bcol_module; - - /* Calculate scratch numbers */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - bcol_module = func_list[i_hier].constant_group_data.bcol_module; - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, bcol_module)) { - scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; - } else { - scratch_indx[i_hier] = 0; - prev_bcol = bcol_module; - } - } - - --i_hier; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i_hier] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i_hier]) { - prev_is_zero = true; - } - - scratch_num[i_hier] = value_to_set; - --i_hier; - } while(i_hier >= 0); - - - /* Each hierarchy has one function to be implemented */ - /* this is the basic setup required of the bcol function */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - /* We want to be generic, but on this stage we support only single - * bcol per hierarchy level - */ - comp_fn = &func_list[i_hier]; - comp_fn->h_level = i_hier; /* hierarchy level */ - - /* we can change this */ - comp_fn->task_comp_fn = mca_coll_ml_task_comp_dynamic_root_small_message; - /* assert(NULL != comp_fn->bcol_function); */ - /* Constants */ - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ML_VERBOSE(10, ("Setting collective [bcast] fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - i_hier, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls, - comp_fn->constant_group_data.n_of_this_type_in_a_row)); - } - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - func_list[i_hier]. - constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - func_list[j_hier]. - constant_group_data.bcol_module) { - func_list[j_hier].constant_group_data. 
- index_of_this_type_in_collective = cnt; - - cnt++; - } - } - func_list[i_hier].constant_group_data.n_of_this_type_in_collective = cnt; - } - - return OMPI_SUCCESS; - -} - -static void mca_coll_ml_zero_dep_bcast(mca_coll_ml_task_status_t *task_status, int index, mca_coll_ml_compound_functions_t *func) -{ - /* no real dependency, set everything to zero */ - task_status->rt_num_dependencies = 0; - task_status->rt_num_dependent_tasks = 0; - task_status->rt_dependent_task_indices = NULL; -} - -/* - * Build schedule without runtime attributes - */ -static int mca_coll_ml_build_bcast_dynamic_schedule_no_attributes( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) -{ - - int n_hiers = topo_info->n_levels; - int i_hier, j_hier; - int cnt, value_to_set = 0; - int ret; /* exit code in case of error */ - bool prev_is_zero; - int *scratch_indx = NULL, - *scratch_num = NULL; - - mca_coll_ml_collective_operation_description_t *schedule; - mca_coll_ml_compound_functions_t *comp_fn; - mca_bcol_base_module_t *prev_bcol, - *bcol_module; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - schedule = *coll_desc; - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - scratch_indx = (int *) calloc(n_hiers, sizeof (int)); - if (NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); - if (NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - prev_bcol = NULL; - - /* Calculate scratch numbers */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { - scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; - } else { - scratch_indx[i_hier] = 0; - prev_bcol = GET_BCOL(topo_info, i_hier); - } - } - - --i_hier; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i_hier] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i_hier]) { - prev_is_zero = true; - } - - scratch_num[i_hier] = value_to_set; - --i_hier; - } while(i_hier >= 0); - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_hiers; - schedule->topo_info = topo_info; - schedule->progress_type = 0; /* Pasha: Not really defined, puting zero */ - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); - if (NULL == schedule->component_functions) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - /* Each hierarchy has one function to be implemented */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - /* We want to be generic, but on this stage we support only single - * bcol per hierarchy level - */ - comp_fn = &schedule->component_functions[i_hier]; - comp_fn->h_level = i_hier; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, i_hier); - /* Init component function */ - strcpy (comp_fn->fn_name, "BCAST_TEST_SMALL_DYNAMIC"); - comp_fn->num_dependent_tasks = 0; - comp_fn->num_dependencies = 0; - comp_fn->dependent_task_indices = NULL; - comp_fn->bcol_function = - 
bcol_module->filtered_fns_table[DATA_SRC_UNKNOWN][NON_BLOCKING][BCOL_BCAST][bcol_func_index][0][0]; - comp_fn->task_comp_fn = mca_coll_ml_task_comp_dynamic_root_small_message; - assert(NULL != comp_fn->bcol_function); - /* - comp_fn->bcol_function->progress_fn = - bcol_module->filtered_fns_table[BCOL_BCAST][1][0][0]; - */ - /* Constants */ - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ML_VERBOSE(10, ("Setting collective [bcast] fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - i_hier, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls, - comp_fn->constant_group_data.n_of_this_type_in_a_row)); - } - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - schedule->component_functions[i_hier]. - constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - ML_VERBOSE(10, ("Pasha: Setting collective [bcast small][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d", - cnt, i_hier, - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective)); - cnt++; - } - } - - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } - - schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_zero_dep_bcast; - schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_zero_dep_bcast; - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Bcast_Setup_Error: - - if (NULL != scratch_indx) { - free(scratch_indx); - } - - if (NULL != scratch_num) { - free(scratch_num); - } - - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - } - - return ret; -} - -static int mca_coll_ml_build_bcast_sequential_schedule_no_attributes( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) -{ - - int n_hiers = topo_info->n_levels; - int i_hier, j_hier; - int cnt, value_to_set = 0; - int ret; /* exit code in case of error */ - bool prev_is_zero; - int *scratch_indx = NULL, - *scratch_num = NULL; - - mca_coll_ml_collective_operation_description_t *schedule; - mca_coll_ml_compound_functions_t *comp_fn; - mca_coll_ml_compound_functions_t *comp_fns_temp; - mca_bcol_base_module_t *prev_bcol, - *bcol_module; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - schedule = *coll_desc; - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - scratch_indx = (int *) calloc(n_hiers, sizeof (int)); - if (NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); - if (NULL == scratch_num) { - ML_ERROR(("Can't allocate 
memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - prev_bcol = NULL; - - /* Calculate scratch numbers */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { - scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; - } else { - scratch_indx[i_hier] = 0; - prev_bcol = GET_BCOL(topo_info, i_hier); - } - } - - --i_hier; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i_hier] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i_hier]) { - prev_is_zero = true; - } - - scratch_num[i_hier] = value_to_set; - --i_hier; - } while(i_hier >= 0); - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_hiers; - schedule->topo_info = topo_info; - schedule->progress_type = 0; /* Pasha: Not really defined, puting zero - * Josh: would be nice to define it as "sequential" - * or "concurrent" - */ - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); - if (NULL == schedule->component_functions) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - /* Allocate the schedule list */ - schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **) - calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *)); - if (NULL == schedule->comp_fn_arr) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - /* Each hierarchy has one function to be implemented */ - /* this is the basic setup required of the bcol function */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - /* We want to be generic, but on this stage we support only single - * bcol per hierarchy level - */ - comp_fn = &schedule->component_functions[i_hier]; - comp_fn->h_level = i_hier; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, i_hier); - /* Init component function */ - strcpy (comp_fn->fn_name, "BCAST_TEST_SMALL_SEQUENTIAL"); - - /* should be very simple, shouldn't require any kind of fancy dependencies set*/ - - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BCAST][bcol_func_index][0][0]; - - /* initialize the coll_fn_started flag to false */ - /*comp_fn->coll_fn_started = false;*/ - /* debug print */ - - /* - if(comp_fn->coll_fn_started){ - fprintf(stderr,"this statement is true\n"); - } else { - fprintf(stderr,"done setting to false \n"); - } - */ - - comp_fn->task_comp_fn = mca_coll_ml_task_comp_dynamic_root_small_message; - /* assert(NULL != comp_fn->bcol_function); */ - /* Constants */ - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ML_VERBOSE(10, ("Setting collective [bcast] fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - i_hier, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls, - comp_fn->constant_group_data.n_of_this_type_in_a_row)); - } - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - schedule->component_functions[i_hier]. 
- constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - ML_VERBOSE(10, ("Pasha: Setting collective [bcast small][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d", - cnt, i_hier, - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective)); - cnt++; - } - } - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } - /* Now that the functions have been set-up properly, we can simple permute the ordering a bit */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - /* first one is trivial */ - comp_fns_temp = (struct mca_coll_ml_compound_functions_t *) - calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); - /* else we need to build the schedule */ - - for(j_hier = 0; j_hier < n_hiers; j_hier++) { - /* put the i_hier-th function first in the list */ - if( 0 == j_hier ) { - comp_fns_temp[j_hier] = schedule->component_functions[i_hier]; - } else if( j_hier <= i_hier ) { - comp_fns_temp[j_hier] = schedule->component_functions[j_hier-1]; - } else { - comp_fns_temp[j_hier] = schedule->component_functions[j_hier]; - } - } - /* now let's attach this list to our array of lists */ - schedule->comp_fn_arr[i_hier] = comp_fns_temp; - - } - - -#if 1 - /* I'm going to just loop over each schedule and - * set up the scratch indices, scratch numbers - * and other constant data - */ - for( i_hier = 1; i_hier < n_hiers; i_hier++) { - /* calculate the scratch indices and associated numbers */ - ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx, - scratch_num, n_hiers); - if( OMPI_SUCCESS != ret ) { - ret = OMPI_ERROR; - goto Bcast_Setup_Error; - } - - } -#endif - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Bcast_Setup_Error: - - if (NULL != scratch_indx) { - free(scratch_indx); - } - - if (NULL != scratch_num) { - free(scratch_num); - } - - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - } - - if (NULL != schedule->comp_fn_arr) { - free(schedule->comp_fn_arr); - } - free (schedule); - *coll_desc = NULL; - - return ret; -} - -static void mca_coll_ml_static_bcast_root(mca_coll_ml_task_status_t *task_status, int index, - mca_coll_ml_compound_functions_t *func) -{ - task_status->rt_num_dependencies = 0; - task_status->rt_num_dependent_tasks = 0; - task_status->rt_dependent_task_indices = 0; -} - -static void mca_coll_ml_static_bcast_non_root(mca_coll_ml_task_status_t *task_status, int index, - mca_coll_ml_compound_functions_t *func) -{ - /* Make active only the first level of hierarchy the gets the data, all the rest of levels - will be activated by dependency list */ - if (task_status->ml_coll_operation->variable_fn_params.root_route->level == index) { - task_status->rt_num_dependencies = 0; - task_status->rt_num_dependent_tasks = func->num_dependent_tasks; - task_status->rt_dependent_task_indices = func->dependent_task_indices; - task_status->ml_coll_operation->variable_fn_params.root = - task_status->ml_coll_operation->variable_fn_params.root_route->rank; - } else { - task_status->rt_num_dependencies = 1; /* wait for root */ - task_status->rt_num_dependent_tasks = 0; /* no depended task */ - 
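The per-root schedules that mca_coll_ml_build_bcast_sequential_schedule_no_attributes builds above are plain rotations: entry i_hier moves to slot 0 and the entries before it shift one slot down. A standalone sketch of that index mapping, with plain ints standing in for mca_coll_ml_compound_functions_t (names are illustrative, not from the original source):

#include <stdio.h>

/* Sketch of the comp_fn_arr[i] ordering: which original slot ends up
 * at position j when function i is promoted to run first. */
static int rotated_index(int i, int j)
{
    if (0 == j) {
        return i;       /* the i-th function runs first */
    }
    if (j <= i) {
        return j - 1;   /* entries before i shift one slot down */
    }
    return j;           /* entries after i keep their position */
}

int main(void)
{
    int n = 4, i = 2, j;

    /* expected output for i == 2: 2 0 1 3 */
    for (j = 0; j < n; j++) {
        printf("%d ", rotated_index(i, j));
    }
    printf("\n");
    return 0;
}

Keeping the mapping pure makes it easy to check that each per-root list is a permutation of the original function list.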
task_status->rt_dependent_task_indices = NULL; /* NULL */ - } -} - -static int mca_coll_ml_build_bcast_known_schedule_no_attributes( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) -{ - - int n_hiers = topo_info->n_levels; - int i_hier, j_hier; - int cnt, value_to_set = 0; - int ret; /* exit code in case of error */ - bool prev_is_zero; - int *scratch_indx = NULL, - *scratch_num = NULL; - - mca_coll_ml_collective_operation_description_t *schedule; - mca_coll_ml_compound_functions_t *comp_fn; - mca_bcol_base_module_t *prev_bcol, - *bcol_module; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - schedule = *coll_desc; - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - scratch_indx = (int *) calloc(n_hiers, sizeof (int)); - if (NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); - if (NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - prev_bcol = NULL; - - /* Calculate scratch numbers */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { - scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; - } else { - prev_bcol = GET_BCOL(topo_info, i_hier); - } - } - - --i_hier; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i_hier] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i_hier]) { - prev_is_zero = true; - } - - scratch_num[i_hier] = value_to_set; - --i_hier; - } while(i_hier >= 0); - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_hiers; - schedule->topo_info = topo_info; - schedule->progress_type = 0; /* Pasha: Not really defined, puting zero */ - - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); - if (NULL == schedule->component_functions) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Bcast_Setup_Error; - } - - /* Each hierarchy has one function to be implemented */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - int j; - /* We want to be generic, but on this stage we support only single - * bcol per hierarchy level - */ - comp_fn = &schedule->component_functions[i_hier]; - comp_fn->h_level = i_hier; /* hierarchy level */ - bcol_module = GET_BCOL(topo_info, i_hier); - /* Init component function */ - strcpy (comp_fn->fn_name, "BCAST_TEST_SMALL_STATIC"); - /* Hack for single layer of hierarchy */ - if (1 == n_hiers) { - comp_fn->num_dependent_tasks = n_hiers - 1; - comp_fn->num_dependencies = 0; - } else { - comp_fn->num_dependent_tasks = n_hiers; /* root will have n_hier - 1 depended tasks, non root zero*/ - comp_fn->num_dependencies = 0; /* root will have zero dependencies */ - } - - if (0 != comp_fn->num_dependent_tasks) { - comp_fn->dependent_task_indices = (int *)calloc(n_hiers, sizeof(int)); - for (j = 0; j < n_hiers; j++) { - comp_fn->dependent_task_indices[j] = j; /* only root will use this one */ - } - } - - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BCAST][bcol_func_index][0][0]; - - 
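The bcol_function assignments in these builders index filtered_fns_table with six coordinates; they correspond, in order, to the fields set by MCA_COLL_ML_SET_QUERY later in this patch (src_type, blocking, coll_type, index, other0, other1). A small sketch of the lookup shape, with made-up enum values and table sizes rather than the real ones:

#include <stdio.h>

/* Illustrative model of the six-dimensional function table.
 * Dimensions: [src][blocking][coll type][func index][other0][other1]. */
enum { SRC_KNOWN = 0, SRC_UNKNOWN = 1 };
enum { BLOCKING_FN = 0, NON_BLOCKING_FN = 1 };
enum { COLL_BCAST = 0, COLL_REDUCE = 1 };

typedef int (*coll_fn_t)(void);
static int dummy_bcast(void) { return 0; }

static coll_fn_t fns_table[2][2][2][2][1][1];

int main(void)
{
    /* Register one function, then look it up the way the schedule
     * builders do: table[src][blocking][coll][index][0][0]. */
    fns_table[SRC_KNOWN][NON_BLOCKING_FN][COLL_BCAST][1][0][0] = dummy_bcast;

    coll_fn_t fn = fns_table[SRC_KNOWN][NON_BLOCKING_FN][COLL_BCAST][1][0][0];
    printf("lookup %s\n", fn ? "hit" : "miss");
    return fn ? fn() : 1;
}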
comp_fn->task_comp_fn = mca_coll_ml_task_comp_dynamic_root_small_message; - /* assert(NULL != comp_fn->bcol_function); */ - /* Constants */ - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ML_VERBOSE(10, ("Setting collective [bcast] fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - i_hier, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls, - comp_fn->constant_group_data.n_of_this_type_in_a_row)); - } - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - schedule->component_functions[i_hier]. - constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - ML_VERBOSE(10, ("Pasha: Setting collective [bcast small][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d", - cnt, i_hier, - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective)); - cnt++; - } - } - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } - - schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_bcast_root; - schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_bcast_non_root; - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Bcast_Setup_Error: - - if (NULL != scratch_indx) { - free(scratch_indx); - } - - if (NULL != scratch_num) { - free(scratch_num); - } - - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - } - free (schedule); - *coll_desc = NULL; - - return ret; -} - - - -#define BCAST_SMALL 1 -#define BCAST_LARGE 5 - -int ml_coll_hier_bcast_setup(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int ret, i , size_code, alg; - int topo_index = 0; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - for (i = 0; i < ML_NUM_MSG; i++) { - - switch (i) { - case ML_SMALL_MSG: - size_code = BCAST_SMALL; - break; - case ML_LARGE_MSG: - size_code = BCAST_LARGE; - break; - default: - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return OMPI_ERROR; - } - - alg = mca_coll_ml_component.coll_config[ML_BCAST][i].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_BCAST][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return OMPI_ERROR; - } - - switch (alg) { - case ML_BCAST_SMALL_DATA_KNOWN: - case ML_BCAST_LARGE_DATA_KNOWN: - ret = mca_coll_ml_build_bcast_known_schedule_no_attributes(&topo_info[topo_index], - &ml_module->coll_ml_bcast_functions[alg], size_code); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static bcast")); - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return ret; - } - break; - case ML_BCAST_SMALL_DATA_UNKNOWN: - case ML_BCAST_LARGE_DATA_UNKNOWN: - ret = 
mca_coll_ml_build_bcast_dynamic_schedule_no_attributes(&topo_info[topo_index], - &ml_module->coll_ml_bcast_functions[alg], size_code); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup dynamic bcast")); - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return ret; - } - break; - case ML_BCAST_SMALL_DATA_SEQUENTIAL: - case ML_BCAST_LARGE_DATA_SEQUENTIAL: - ret = mca_coll_ml_build_bcast_sequential_schedule_no_attributes(&topo_info[topo_index], - &ml_module->coll_ml_bcast_functions[alg], size_code); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static bcast")); - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return ret; - } - break; - default: - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return OMPI_ERROR; - } - assert(NULL != ml_module->coll_ml_bcast_functions[alg] && - NULL != ml_module->coll_ml_bcast_functions[alg]); - } - - topo_info->hierarchical_algorithms[BCOL_BCAST] = NULL; - return ret; -} - -void ml_coll_hier_bcast_cleanup(mca_coll_ml_module_t *ml_module) -{ - /* Hierarchy Setup */ - int i, alg; - int topo_index = 0; - mca_coll_ml_topology_t *topo_info = ml_module->topo_list; - - assert (NULL != ml_module); - - for (i = 0; i < ML_NUM_MSG; i++) { - - switch (i) { - case ML_SMALL_MSG: - case ML_LARGE_MSG: - break; - default: - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return; - } - - alg = mca_coll_ml_component.coll_config[ML_BCAST][i].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_BCAST][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - return; - } - - if (NULL != ml_module->coll_ml_bcast_functions[alg]) { - if (ML_BCAST_SMALL_DATA_KNOWN <= alg && ML_BCAST_LARGE_DATA_SEQUENTIAL >= alg) { - if (ml_module->coll_ml_bcast_functions[alg]->component_functions) { - free(ml_module->coll_ml_bcast_functions[alg]->component_functions); - ml_module->coll_ml_bcast_functions[alg]->component_functions = NULL; - } - - free(ml_module->coll_ml_bcast_functions[alg]); - ml_module->coll_ml_bcast_functions[alg] = NULL; - } else { - topo_info->hierarchical_algorithms[ML_BCAST] = NULL; - } - } - } -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.c deleted file mode 100644 index 7167c7de79..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
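ml_coll_hier_bcast_setup above walks the two message-size classes, turns each into a size code (the BCAST_SMALL and BCAST_LARGE defines), and hands that code to the selected schedule builder as bcol_func_index. A sketch of that mapping under the same constants, with a hypothetical table in place of the component's config structures:

#include <stdio.h>

#define SKETCH_BCAST_SMALL 1   /* mirrors BCAST_SMALL above */
#define SKETCH_BCAST_LARGE 5   /* mirrors BCAST_LARGE above */

enum { MSG_SMALL = 0, MSG_LARGE = 1, MSG_CLASSES = 2 };

/* Hypothetical stand-in for the per-size-class config lookup. */
struct size_class_map {
    const char *name;
    int bcol_func_index;   /* passed through to the schedule builder */
};

static const struct size_class_map classes[MSG_CLASSES] = {
    { "small", SKETCH_BCAST_SMALL },
    { "large", SKETCH_BCAST_LARGE },
};

int main(void)
{
    for (int i = 0; i < MSG_CLASSES; i++) {
        printf("%s messages -> bcol_func_index %d\n",
               classes[i].name, classes[i].bcol_func_index);
    }
    return 0;
}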
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h" - -int mca_coll_ml_schedule_init_scratch(mca_coll_ml_topology_t *topo_info, - mca_coll_ml_schedule_hier_info_t *h_info, - int **out_scratch_indx, int **out_scratch_num) -{ - bool prev_is_zero; - int i, cnt; - int n_hiers = h_info->n_hiers; - int value_to_set = 0; - mca_bcol_base_module_t *prev_bcol = NULL; - int *scratch_indx, *scratch_num; - - scratch_indx = *out_scratch_indx = - (int *) calloc(n_hiers * 2, sizeof(int)); - if (NULL == *out_scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - scratch_num = *out_scratch_num = - (int *) calloc(n_hiers * 2, sizeof(int)); - if (NULL == *out_scratch_num) { - ML_ERROR(("Can't allocate memory.")); - free(out_scratch_indx); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for (i = 0, cnt = 0; i < h_info->num_up_levels; ++i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* top - only if the proc arrive to highest_level_is_global_highest_level */ - if (h_info->call_for_top_function) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, n_hiers - 1); - } - ++cnt; - } - - /* going down */ - for (i = h_info->num_up_levels - 1; i >= 0; --i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - i = cnt - 1; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i]) { - prev_is_zero = true; - } - - scratch_num[i] = value_to_set; - --i; - } while(i >= 0); - - return OMPI_SUCCESS; -} - -mca_coll_ml_collective_operation_description_t * - mca_coll_ml_schedule_alloc(mca_coll_ml_schedule_hier_info_t *h_info) -{ - mca_coll_ml_collective_operation_description_t *schedule = NULL; - - schedule = (mca_coll_ml_collective_operation_description_t *) - malloc(sizeof(mca_coll_ml_collective_operation_description_t)); - if (NULL == schedule) { - ML_ERROR(("Can't allocate memory.")); - return NULL; - } - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = h_info->nbcol_functions; - schedule->progress_type = 0; - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(h_info->nbcol_functions, sizeof(struct mca_coll_ml_compound_functions_t)); - if (NULL == schedule->component_functions) { - ML_ERROR(("Can't allocate memory.")); - free(schedule); - return NULL; - } - return schedule; -} - -void mca_coll_ml_call_types(mca_coll_ml_schedule_hier_info_t *h_info, - mca_coll_ml_collective_operation_description_t *schedule) -{ - int i_hier, j_hier, cnt; - mca_bcol_base_module_t *current_bcol = NULL; - - for (i_hier = 0; i_hier < h_info->n_hiers; i_hier++) { - current_bcol = - schedule->component_functions[i_hier]. 
- constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < h_info->n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - cnt++; - } - } - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h b/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h deleted file mode 100644 index 03cb185ec8..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_ML_COMMON_SETUP_H -#define MCA_COLL_ML_COMMON_SETUP_H - -#include "ompi_config.h" - -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/coll/ml/coll_ml.h" - -struct mca_coll_ml_schedule_hier_info_t { - int n_hiers; - int num_up_levels; - int nbcol_functions; - bool call_for_top_function; -}; -typedef struct mca_coll_ml_schedule_hier_info_t - mca_coll_ml_schedule_hier_info_t; - -#define MCA_COLL_ML_INIT_HIER_INFO(info, n_hr, g_hr, ml_module) \ -do { \ - info.n_hiers = n_hr; \ - if (g_hr == \ - ml_module->component_pairs[n_hr - 1].bcol_index) { \ - /* The process that is member of highest level subgroup \ - should call for top algorithms in addition to fan-in/out steps*/ \ - ML_VERBOSE(9, ("Setting top %d %d", n_hr, ml_module->component_pairs[g_hr - 1].bcol_index)); \ - info.call_for_top_function = true; \ - /* hier level run only top algorithm, so we deduct 1 */ \ - info.num_up_levels = n_hr - 1; \ - /* Top algorithm is called only once, so we deduct 1 */ \ - info.nbcol_functions = 2 * n_hr - 1; \ - } else { \ - ML_VERBOSE(9, ("not setting top %d %d", n_hr, ml_module->component_pairs[g_hr - 1].bcol_index)); \ - /* The process is not member of highest level subgroup, \ - as result it does not call for top algorithm, \ - but it calls for all fan-in/out steps */ \ - info.call_for_top_function = false; \ - info.num_up_levels = n_hr; \ - info.nbcol_functions = 2 * n_hr; \ - } \ -} while (0); - -#define MCA_COLL_ML_SET_COMP_FN(fn, level, module, s_level, \ - scratch_indx, scratch_num, qc, name) \ -do { \ - fn->h_level = level; /* hierarchy level */ \ - strcpy (fn->fn_name, "name"); \ - fn->num_dependent_tasks = 0; \ - fn->num_dependencies = 0; \ - fn->task_comp_fn = NULL; \ - fn->constant_group_data.bcol_module = GET_BCOL(module, level); \ - fn->constant_group_data.index_in_consecutive_same_bcol_calls = \ - scratch_indx[s_level];\ - fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[s_level]; \ - fn->constant_group_data.n_of_this_type_in_collective = 0; \ - fn->constant_group_data.index_of_this_type_in_collective = 0; \ - fn->bcol_function = fn->constant_group_data.bcol_module-> \ - filtered_fns_table[qc[0]] \ - [qc[1]] \ - [qc[2]] \ - [qc[3]] \ - [qc[4]] \ - [qc[5]]; \ -} while (0); - -#define MCA_COLL_ML_QUERY_SIZE 6 - -#define MCA_COLL_ML_SET_QUERY(query, src_type, blocking, coll_type, index, other0, other1) \ -do { \ - query[0] = src_type; \ - query[1] = blocking; \ - query[2] = coll_type; \ - query[3] = index; \ - query[4] = other0; \ - query[5] = other1; \ -} while (0); - -int mca_coll_ml_schedule_init_scratch(mca_coll_ml_topology_t 
*topo_info, - mca_coll_ml_schedule_hier_info_t *h_info, - int **out_scratch_indx, int **out_scratch_num); - -mca_coll_ml_collective_operation_description_t* -mca_coll_ml_schedule_alloc(mca_coll_ml_schedule_hier_info_t *h_info); - -void mca_coll_ml_call_types(mca_coll_ml_schedule_hier_info_t *h_info, - mca_coll_ml_collective_operation_description_t *schedule); -#endif diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c deleted file mode 100644 index 579f77d12b..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c +++ /dev/null @@ -1,371 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/include/ompi/constants.h" -#include "ompi/mca/coll/ml/coll_ml_functions.h" -static int mca_coll_ml_task_comp_static_reduce - (struct mca_coll_ml_task_status_t *task) { - - task->ml_coll_operation->variable_fn_params.root_flag = true; - - return OMPI_SUCCESS; -} - -static void mca_coll_ml_static_reduce_non_root(mca_coll_ml_task_status_t *task_status, int index, - mca_coll_ml_compound_functions_t *func) -{ - /* I am not a root rank, but someone in my group is a root*/ - if (task_status->ml_coll_operation->variable_fn_params.root_route->level == index) { - task_status->rt_num_dependencies = func->num_dependencies; - task_status->rt_num_dependent_tasks = 0; - task_status->rt_dependent_task_indices = NULL; - task_status->ml_coll_operation->variable_fn_params.root = - task_status->ml_coll_operation->variable_fn_params.root_route->rank; - } else { - task_status->rt_num_dependencies = 0; - task_status->rt_num_dependent_tasks = 1; - task_status->rt_dependent_task_indices = &task_status->ml_coll_operation->variable_fn_params.root_route->level; - } - -} - -static void mca_coll_ml_static_reduce_root(mca_coll_ml_task_status_t *task_status, int index, - mca_coll_ml_compound_functions_t *func) -{ - task_status->rt_num_dependencies = func->num_dependencies; - task_status->rt_num_dependent_tasks = 0; - task_status->rt_dependent_task_indices = NULL; -} - -/* - * Fill up the collective descriptor - * - */ -static int mca_coll_ml_build_static_reduce_schedule( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc) -{ - int i_hier, j_hier, n_fcns, - n_hiers = topo_info->n_levels; - int *scratch_indx = NULL, - *scratch_num = NULL; - int cnt, value_to_set = 0; - int ret = OMPI_SUCCESS; - bool prev_is_zero; - mca_coll_ml_compound_functions_t *comp_fns_temp; - mca_bcol_base_module_t *prev_bcol, - *bcol_module; - mca_coll_ml_compound_functions_t *comp_fn; - mca_coll_ml_collective_operation_description_t *schedule = NULL; - - *coll_desc = (mca_coll_ml_collective_operation_description_t *) - calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); - - schedule = *coll_desc; - if (OPAL_UNLIKELY(NULL == schedule)) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - scratch_indx = (int *) calloc (n_hiers, sizeof 
(int)); - if (NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); - if (NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - prev_bcol = NULL; - - /* Calculate scratch numbers */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { - scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; - } else { - scratch_indx[i_hier] = 0; - prev_bcol = GET_BCOL(topo_info, i_hier); - } - } - - --i_hier; - prev_is_zero = true; - - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i_hier] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i_hier]) { - prev_is_zero = true; - } - - scratch_num[i_hier] = value_to_set; - --i_hier; - } while(i_hier >= 0); - - /* All hierarchies call one function, unlike other collectives */ - n_fcns = n_hiers; - - /* Set dependencies equal to number of hierarchies */ - schedule->n_fns = n_fcns; - schedule->topo_info = topo_info; - schedule->progress_type = 0; - /* Allocated the component function */ - schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) - calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); - - if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - - for (i_hier = 0; i_hier < n_hiers; ++i_hier) { - comp_fn = &schedule->component_functions[i_hier]; - - /* The hierarchial level */ - comp_fn->h_level = i_hier; - bcol_module = GET_BCOL(topo_info, i_hier); - - comp_fn->bcol_function = - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][1][0][0]; - - strcpy(comp_fn->fn_name, "REDUCE"); - ML_VERBOSE(10, ("func indx %d set to %p", i_hier, comp_fn->bcol_function)); - - - ML_VERBOSE(1,("In ML_REDUCE_SETUP .. looks fine here")); - /* No need completion func for Barrier */ - comp_fn->task_comp_fn = mca_coll_ml_task_comp_static_reduce; - - /* Constants */ - comp_fn->constant_group_data.bcol_module = bcol_module; - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; - comp_fn->constant_group_data.n_of_this_type_in_collective = 0; - comp_fn->constant_group_data.index_of_this_type_in_collective = 0; - - ML_VERBOSE(10, ("Setting collective [reduce] fn_idx %d, n_of_this_type_in_a_row %d, " - "index_in_consecutive_same_bcol_calls %d.", - i_hier, comp_fn->constant_group_data.n_of_this_type_in_a_row, - comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); - } - - - /* Fill the rest of constant data */ - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - mca_bcol_base_module_t *current_bcol = - schedule->component_functions[i_hier]. - constant_group_data.bcol_module; - cnt = 0; - for (j_hier = 0; j_hier < n_hiers; j_hier++) { - if (current_bcol == - schedule->component_functions[j_hier]. - constant_group_data.bcol_module) { - schedule->component_functions[j_hier]. - constant_group_data.index_of_this_type_in_collective = cnt; - cnt++; - } - } - schedule->component_functions[i_hier]. - constant_group_data.n_of_this_type_in_collective = cnt; - } - - /* Manju: Reduction should always use the fixed schedule. 
- * The subgroups that this process is leader should be executed first, then - * it should execute the subgroups where this process is not a leader, and - * then execute the subgroup that includes the root. - */ - - /* Allocate the schedule list */ - schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **) - calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *)); - if (NULL == schedule->comp_fn_arr) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - /* Now that the functions have been set-up properly, we can simple permute the ordering a bit */ - - for (i_hier = 0; i_hier < n_hiers; i_hier++) { - /* first one is trivial */ - int leader_hierarchy = 0; - int non_leader_hierarchy = 0; - int func_index; - - comp_fns_temp = (struct mca_coll_ml_compound_functions_t *) - calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); - - leader_hierarchy = 0; - non_leader_hierarchy = n_hiers - 2; - - for(j_hier = 0; j_hier < n_hiers - 1 ; j_hier++) { - - func_index = j_hier < i_hier ? j_hier : j_hier + 1; - /* I'm a leader for this group */ - if (0 == topo_info->component_pairs->subgroup_module->my_index) { - comp_fns_temp[leader_hierarchy++] = - schedule->component_functions[func_index]; - } - else { - comp_fns_temp[non_leader_hierarchy--] = - schedule->component_functions[func_index]; - } - } - - comp_fns_temp[j_hier] = schedule->component_functions[i_hier]; - /* now let's attach this list to our array of lists */ - schedule->comp_fn_arr[i_hier] = comp_fns_temp; - } - - /* Manju: Do we need this ? */ - - /* I'm going to just loop over each schedule and - * set up the scratch indices, scratch numbers - * and other constant data - */ - /* - for( i_hier = 1; i_hier < n_hiers; i_hier++) { - ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx, - scratch_num, n_hiers); - if( OMPI_SUCCESS != ret ) { - ret = OMPI_ERROR; - goto Error; - } - - } - */ - - /* Do I need this ? */ - schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_reduce_root; - schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_reduce_non_root; - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - /* reduce does not use the component functions so we no longer need this. 
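The ordering loop below this comment realizes its rule: subgroups this process leads are placed first, subgroups it does not lead fill in from the back, and the subgroup containing the root always lands in the last slot. A standalone sketch of the placement (the deleted code tests only the first component pair's my_index; this sketch generalizes to a per-level flag purely for illustration):

#include <stdio.h>

/* Order the levels for schedule i: leaders from the front, non-leaders
 * from the back, and level i (the subgroup containing the root) last. */
static void order_levels(int n, int i, const int *is_leader, int *out)
{
    int front = 0, back = n - 2;
    int j, level;

    for (j = 0; j < n - 1; j++) {
        level = (j < i) ? j : j + 1;   /* visit every level except i */
        if (is_leader[level]) {
            out[front++] = level;      /* leader subgroups run first */
        } else {
            out[back--] = level;       /* non-leader subgroups run later */
        }
    }
    out[n - 1] = i;                    /* the root's subgroup runs last */
}

int main(void)
{
    int is_leader[4] = { 1, 0, 1, 0 };
    int out[4];

    order_levels(4, 2, is_leader, out);   /* root's subgroup is level 2 */
    for (int j = 0; j < 4; j++) {
        printf("%d ", out[j]);            /* prints: 0 3 1 2 */
    }
    printf("\n");
    return 0;
}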
see - * coll_ml_reduce.c:442 */ - free (schedule->component_functions); - schedule->component_functions = NULL; - - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Error: - if (NULL != scratch_num) { - free (scratch_num); - } - - if (NULL != scratch_indx) { - free (scratch_indx); - } - - if (NULL != schedule) { - if (NULL != schedule->component_functions) { - free(schedule->component_functions); - schedule->component_functions = NULL; - } - free (schedule); - *coll_desc = NULL; - } - - return ret; -} - - -int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) -{ - int alg, ret, topo_index=0; - mca_coll_ml_topology_t *topo_info = - &ml_module->topo_list[ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_MSG]]; - - if ( ml_module->max_fn_calls < topo_info->n_levels ) { - ml_module->max_fn_calls = topo_info->n_levels; - } - - - alg = mca_coll_ml_component.coll_config[ML_REDUCE][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_REDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_REDUCE] = NULL; - return OMPI_ERROR; - } - - ret = mca_coll_ml_build_static_reduce_schedule(&ml_module->topo_list[topo_index], - &ml_module->coll_ml_reduce_functions[alg]); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to setup static reduce")); - return ret; - } - - - return OMPI_SUCCESS; -} - -void ml_coll_hier_reduce_cleanup(mca_coll_ml_module_t *ml_module) -{ - int alg, i, topo_index=0; - mca_coll_ml_topology_t *topo_info = - &ml_module->topo_list[ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_MSG]]; - - if ( ml_module->max_fn_calls < topo_info->n_levels ) { - ml_module->max_fn_calls = topo_info->n_levels; - } - - - alg = mca_coll_ml_component.coll_config[ML_REDUCE][ML_SMALL_MSG].algorithm_id; - topo_index = ml_module->collectives_topology_map[ML_REDUCE][alg]; - if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { - ML_ERROR(("No topology index or algorithm was defined")); - topo_info->hierarchical_algorithms[ML_REDUCE] = NULL; - return; - } - - if (NULL == ml_module->coll_ml_reduce_functions[alg]) { - return; - } - - if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr) { - for (i=0; i < ml_module->topo_list[topo_index].n_levels; i++) { - if (ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]) { - free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i]); - ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr[i] = NULL; - } - } - - free(ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr); - ml_module->coll_ml_reduce_functions[alg]->comp_fn_arr = NULL; - } - - ml_module->coll_ml_reduce_functions[alg]->component_functions = NULL; - - free(ml_module->coll_ml_reduce_functions[alg]); - ml_module->coll_ml_reduce_functions[alg] = NULL; -} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c deleted file mode 100644 index 181e229a11..0000000000 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c +++ /dev/null @@ -1,521 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights - * reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/include/ompi/constants.h" - -int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module, - mca_coll_ml_topology_t *topo_info, - int up_function_idx, - int top_function_idx, - int down_function_idx, - int collective) -{ - /* local variables */ - int i, j, cnt, value_to_set = -1; - int ret = OMPI_SUCCESS, num_up_levels; - - int num_hierarchies = topo_info->n_levels; - int global_high_hierarchy_index = topo_info->global_highest_hier_group_index; - - bool call_for_top_function, prev_is_zero; - - int *scratch_indx = NULL, *scratch_num = NULL; - - coll_ml_collective_description_t *collective_alg = NULL; - mca_bcol_base_module_t *bcol_module = NULL, - *prev_bcol = NULL; - - /* RLG: one blocking barrier collective algorithm - this is really a hack, - * we need to figure out how to do this in a bit more extensible - * manner. - */ - collective_alg = (coll_ml_collective_description_t *) - malloc(sizeof(coll_ml_collective_description_t)); - if (NULL == collective_alg) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - /* am I a member of the highest level subgroup ? */ - if (global_high_hierarchy_index == - topo_info->component_pairs[num_hierarchies - 1].bcol_index) { - /* The process that is member of highest level subgroup - should call for top algorithms in addition to fan-in/out steps*/ - call_for_top_function = true; - /* hier level run only top algorithm, so we deduct 1 */ - num_up_levels = num_hierarchies - 1; - /* Top algorithm is called only once, so we deduct 1 */ - collective_alg->n_functions = 2 * num_hierarchies - 1; - } else { - /* The process is not member of highest level subgroup, - as result it does not call for top algorithm, - but it calls for all fan-in/out steps */ - call_for_top_function = false; - num_up_levels = num_hierarchies; - collective_alg->n_functions = 2 * num_hierarchies; - } - - ML_VERBOSE(10, ("high_index %d == bcol_index %d: Call top %d, num_up_levels %d, collective_alg->n_functions %d", - global_high_hierarchy_index, - topo_info->component_pairs[num_hierarchies - 1].bcol_index, - call_for_top_function, - num_up_levels, - collective_alg->n_functions )); - - /* allocate space for the functions */ - collective_alg->functions = (mca_bcol_base_function_t *) - calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t)); - if( NULL == collective_alg->functions) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - /* Algorithm Description: - * ===================== - * The algorithm used here for an N level system - * - up to level N-2, inclusive : up algorithm (fan in in barrier, reduce in Allreduce) - * - level N-1: top algorithm (barrier or allreduce) - * - level N-2, to level 0: down algorithm (fanout) - */ - - - /* Starting scratch_num and scratch_index calculations */ - /* =================================================== */ - - /* Figure out how many of the same bcols are called in a row. 
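The scratch_indx/scratch_num computation described here recurs in every builder in this patch: a forward pass gives each step its index within a run of consecutive identical bcol types, and a backward pass gives every step in a run the run's total length. A standalone sketch of both passes, with plain ints standing in for the bcol module types (the original compares module types via IS_BCOL_TYPE_IDENTICAL; simple equality is used here for illustration):

#include <stdio.h>

static void compute_scratch(const int *type, int n, int *indx, int *num)
{
    int i, value_to_set = 0;
    int prev_is_zero;

    /* forward pass: index inside each run of identical types */
    for (i = 0; i < n; i++) {
        if (i > 0 && type[i] == type[i - 1]) {
            indx[i] = indx[i - 1] + 1;
        } else {
            indx[i] = 0;
        }
    }

    /* backward pass: run length, propagated to every member of the run */
    i = n - 1;
    prev_is_zero = 1;
    do {
        if (prev_is_zero) {
            value_to_set = indx[i] + 1;   /* last index + 1 == run length */
            prev_is_zero = 0;
        }
        if (0 == indx[i]) {
            prev_is_zero = 1;             /* run starts here; the next run ends at i-1 */
        }
        num[i] = value_to_set;
        --i;
    } while (i >= 0);
}

int main(void)
{
    /* e.g. a shared-memory bcol on two levels, a network bcol, then shm again */
    int type[4] = { 0, 0, 1, 0 };
    int indx[4], num[4], i;

    compute_scratch(type, 4, indx, num);
    for (i = 0; i < 4; i++) {
        printf("step %d: scratch_indx %d scratch_num %d\n", i, indx[i], num[i]);
    }
    /* prints indx = {0,1,0,0}, num = {2,2,1,1} */
    return 0;
}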
- * The index of the bcol in row we store in scratch_indx and - * the total number of bcols in the row we store in scratch_num */ - scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int)); - if(NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies)); - if(NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Error; - } - - /* We go through all stages of algorithm (up, top, down) - * and calculate bcol index. If previous bcol is the same type as current - * one the counter index is increased, other way the index is zero */ - prev_bcol = NULL; - /* going up */ - for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* top - only if the proc arrive to highest_level_is_global_highest_level */ - if (call_for_top_function) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1); - } - - ++cnt; - } - - /* going down */ - for (i = num_up_levels - 1; i >= 0; --i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* - * Calculate the number of the same bcols in row. - * We parse the index array, if index is zero - * it means that the row is done and we start - * to calculate next bcols row. 
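The three loops above visit the levels in the up/top/down order given in the algorithm description: fan-in over the lower levels, the top algorithm once (only for members of the highest subgroup), then fan-out in reverse. A standalone sketch that prints the visit order, which is also the order in which scratch_indx and scratch_num are filled:

#include <stdio.h>

/* Build the up/top/down level sequence; returns the number of steps. */
static int build_walk(int num_up_levels, int top_level, int call_top, int *walk)
{
    int i, cnt = 0;

    for (i = 0; i < num_up_levels; i++) {
        walk[cnt++] = i;                 /* up phase (fan-in / reduce) */
    }
    if (call_top) {
        walk[cnt++] = top_level;         /* top algorithm, run once */
    }
    for (i = num_up_levels - 1; i >= 0; i--) {
        walk[cnt++] = i;                 /* down phase (fan-out / bcast) */
    }
    return cnt;
}

int main(void)
{
    int walk[6];
    /* three hierarchies, member of the highest subgroup:
     * num_up_levels = 2, top level = 2, so the walk is 0 1 2 1 0 */
    int cnt = build_walk(2, 2, 1, walk);

    for (int i = 0; i < cnt; i++) {
        printf("%d ", walk[i]);
    }
    printf("\n");
    return 0;
}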
The maximum number - * for the row is equal to maximal bcol index in the row + 1 - */ - i = cnt - 1; - prev_is_zero = true; - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i]) { - prev_is_zero = true; - } - - scratch_num[i] = value_to_set; - --i; - } while(i >= 0); - - /* =========================================================== */ - /* We are done with scratch_num and scratch_index calculations */ - - /* Setup function call for each algorithm step */ - cnt = 0; - /* up phase */ - for (i = 0; i < num_up_levels; i++) { - bcol_module = GET_BCOL(topo_info, i); - collective_alg->functions[cnt].fn_idx = up_function_idx; - collective_alg->functions[cnt].bcol_module = bcol_module; - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; - collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; - ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - collective, cnt, collective_alg->functions[cnt].fn_idx, - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, - collective_alg->functions[cnt].n_of_this_type_in_a_row)); - ++cnt; - } - - /* top function */ - if (call_for_top_function) { - bcol_module = GET_BCOL(topo_info, num_hierarchies - 1); - collective_alg->functions[cnt].fn_idx = top_function_idx; - collective_alg->functions[cnt].bcol_module = bcol_module; - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; - collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; - ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - collective, cnt, collective_alg->functions[cnt].fn_idx, - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, - collective_alg->functions[cnt].n_of_this_type_in_a_row)); - ++cnt; - } - - /* down phase*/ - for (i = num_up_levels - 1; i >= 0; i--) { - bcol_module = GET_BCOL(topo_info, i); - collective_alg->functions[cnt].fn_idx = down_function_idx; - collective_alg->functions[cnt].bcol_module = bcol_module; - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; - collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; - ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", - collective, cnt, collective_alg->functions[cnt].fn_idx, - collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, - collective_alg->functions[cnt].n_of_this_type_in_a_row)); - ++cnt; - } - - /* figure out how many times this bcol is used in this collective call */ - for (i = 0; i < collective_alg->n_functions; i++) { - mca_bcol_base_module_t *current_bcol= - collective_alg->functions[i].bcol_module; - - cnt = 0; - for (j = 0; j < collective_alg->n_functions; ++j) { - if (current_bcol == - collective_alg->functions[j].bcol_module) { - collective_alg->functions[j].index_of_this_type_in_collective = cnt; - ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d", - collective, cnt, i, - collective_alg->functions[j].index_of_this_type_in_collective)); - cnt++; - } - } - - collective_alg->functions[i].n_of_this_type_in_collective=cnt; - ML_VERBOSE(10, ("Pasha: 
Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].n_of_this_type_in_collective %d", - collective, cnt, i, - collective_alg->functions[i].n_of_this_type_in_collective)); - } - - /* set Barrier algorithm */ - topo_info->hierarchical_algorithms[collective] = collective_alg; - /* Setup maximum number function calls, it is used for resource allocation */ - ml_module->max_fn_calls = (collective_alg->n_functions > ml_module->max_fn_calls) ? - collective_alg->n_functions : ml_module->max_fn_calls; - /* Ishai: What is this n_buffers? I did not find where it is being used*/ - topo_info->hierarchical_algorithms[collective]->n_buffers = 1; - - /* Release temporary memories */ - free(scratch_indx); - free(scratch_num); - - return OMPI_SUCCESS; - -Error: - if (NULL != collective_alg) { - free(collective_alg->functions); - } - - free(collective_alg); - free(scratch_indx); - free(scratch_num); - - return ret; -} - -int ml_coll_hier_allreduce_setup(mca_coll_ml_module_t *ml_module) -{ - int topo_index = - ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_ALLREDUCE]; - int ret = ml_coll_up_and_down_hier_setup(ml_module, - &ml_module->topo_list[topo_index], - BCOL_REDUCE, - BCOL_ALLREDUCE, - BCOL_BCAST, - BCOL_ALLREDUCE); - - if (OMPI_SUCCESS == ret) { - return ret; - } - - /* Make sure to reset the allreduce pointer to NULL */ - ml_module->topo_list[topo_index].hierarchical_algorithms[BCOL_ALLREDUCE] = NULL; - return ret; -} - -#if 0 -/* - * Manju: New setup function in coll_ml_hier_algorithms_reduce_setup.c - */ -/* Ishai: Reduce is not an hier algorithm (it is rooted) - it needs a different ML algorithm */ -/* Need to rewrite */ -int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) -{ - int topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_GATHER]; - /* Hierarchy Setup */ - int ret = ml_coll_up_and_down_hier_setup(ml_module, - &ml_module->topo_list[topo_index], - BCOL_REDUCE, /*NULL,*/ - BCOL_REDUCE, - BCOL_REDUCE, /*NULL,*/ - BCOL_REDUCE); - if (OMPI_SUCCESS == ret) { - return ret; - } - /* Make sure to reset the bcast pointer to NULL */ - ml_module->topo_list[topo_index].hierarchical_algorithms[BCOL_BCAST] = NULL; - return ret; -} -#endif - -int ml_coll_barrier_constant_group_data_setup( - mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t *schedule) -{ - /* local variables */ - int i, j, cnt, value_to_set = -1, ret = OMPI_SUCCESS, num_up_levels, - num_hierarchies = topo_info->n_levels, n_functions = schedule->n_fns, - global_high_hierarchy_index = topo_info->global_highest_hier_group_index; - - bool call_for_top_function, prev_is_zero; - mca_coll_ml_utility_data_t *constant_group_data = NULL; - - int *scratch_indx = NULL, *scratch_num = NULL; - - mca_bcol_base_module_t *prev_bcol = NULL, - *bcol_module = NULL; - - /* Am I a member of the highest level subgroup ? 
*/ - if (global_high_hierarchy_index == - topo_info->component_pairs[num_hierarchies - 1].bcol_index) { - /* The process that is member of highest level subgroup - should call for top algorithms in addition to fan-in/out steps*/ - call_for_top_function = true; - /* hier level run only top algorithm, so we deduct 1 */ - num_up_levels = num_hierarchies - 1; - } else { - /* The process is not member of highest level subgroup, - as result it does not call for top algorithm, - but it calls for all fan-in/out steps */ - call_for_top_function = false; - num_up_levels = num_hierarchies; - } - - /* Algorithm Description: - * ===================== - * The algorithm used here for an N level system - * - up to level N-2, inclusive : up algorithm (Fan-In in Barrier) - * - level N-1: top algorithm (Barrier algth) - * - level N-2, to level 0: down algorithm (Fan-out) - */ - - - /* Starting scratch_num and scratch_index calculations */ - /* =================================================== */ - - /* Figure out how many of the same bcols are called in a row. - * The index of the bcol in row we store in scratch_indx and - * the total number of bcols in the row we store in scratch_num */ - scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int)); - if(NULL == scratch_indx) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Const_Data_Setup_Error; - } - - scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies)); - if(NULL == scratch_num) { - ML_ERROR(("Can't allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto Const_Data_Setup_Error; - } - - /* We go through all stages of algorithm (up, top, down) - * and calculate bcol index. If previous bcol is the same type as current - * one the counter index is increased, other way the index is zero */ - prev_bcol = NULL; - - /* Going up */ - for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* Top - only if the proc arrive to highest_level_is_global_highest_level */ - if (call_for_top_function) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1); - } - - ++cnt; - } - - /* Going down */ - for (i = num_up_levels - 1; i >= 0; --i, ++cnt) { - if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { - scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; - } else { - scratch_indx[cnt] = 0; - prev_bcol = GET_BCOL(topo_info, i); - } - } - - /* - * Calculate the number of the same bcols in row. - * We parse the index array, if index is zero - * it means that the row is done and we start - * to calculate next bcols row. 
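The membership test above decides num_up_levels and, in ml_coll_up_and_down_hier_setup and MCA_COLL_ML_INIT_HIER_INFO, the total function count: a member of the globally highest subgroup runs n-1 fan-in steps, one top step and n-1 fan-out steps (2n-1 calls), while everyone else runs n fan-in and n fan-out steps (2n calls). A minimal sketch of that rule (struct and function names are illustrative):

#include <stdio.h>

struct hier_counts {
    int call_for_top_function;
    int num_up_levels;
    int nbcol_functions;
};

static struct hier_counts count_fns(int n_hiers, int in_top_subgroup)
{
    struct hier_counts c;

    c.call_for_top_function = in_top_subgroup;
    c.num_up_levels = in_top_subgroup ? n_hiers - 1 : n_hiers;
    c.nbcol_functions = in_top_subgroup ? 2 * n_hiers - 1 : 2 * n_hiers;
    return c;
}

int main(void)
{
    struct hier_counts a = count_fns(3, 1), b = count_fns(3, 0);

    printf("top member:  up %d, fns %d\n", a.num_up_levels, a.nbcol_functions);
    printf("non-member:  up %d, fns %d\n", b.num_up_levels, b.nbcol_functions);
    /* prints: up 2, fns 5 / up 3, fns 6 */
    return 0;
}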
The maximum number - * for the row is equal to maximal bcol index in the row + 1 - */ - i = cnt - 1; - prev_is_zero = true; - do { - if (prev_is_zero) { - value_to_set = scratch_indx[i] + 1; - prev_is_zero = false; - } - - if (0 == scratch_indx[i]) { - prev_is_zero = true; - } - - scratch_num[i] = value_to_set; - --i; - } while(i >= 0); - - /* =========================================================== */ - /* We are done with scratch_num and scratch_index calculations */ - - /* Setup function call for each algorithm step */ - cnt = 0; - - /* Up phase */ - for (i = 0; i < num_up_levels; ++i) { - bcol_module = GET_BCOL(topo_info, i); - constant_group_data = &schedule->component_functions[cnt].constant_group_data; - - constant_group_data->bcol_module = bcol_module; - constant_group_data->index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; - constant_group_data->n_of_this_type_in_a_row = scratch_num[cnt]; - - ++cnt; - } - - /* Top function */ - if (call_for_top_function) { - bcol_module = GET_BCOL(topo_info, num_hierarchies - 1); - constant_group_data = &schedule->component_functions[cnt].constant_group_data; - - constant_group_data->bcol_module = bcol_module; - constant_group_data->index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; - constant_group_data->n_of_this_type_in_a_row = scratch_num[cnt]; - - ++cnt; - } - - /* Down phase */ - for (i = num_up_levels - 1; i >= 0; --i) { - bcol_module = GET_BCOL(topo_info, i); - constant_group_data = &schedule->component_functions[cnt].constant_group_data; - - constant_group_data->bcol_module = bcol_module; - - /* All Fan-Outs will be done in parallel */ - constant_group_data->index_in_consecutive_same_bcol_calls = 0; - constant_group_data->n_of_this_type_in_a_row = 1; - - ++cnt; - } - - /* Figure out how many times this bcol is used in this collective call */ - for (i = 0; i < n_functions; ++i) { - struct mca_coll_ml_compound_functions_t *component_functions = - schedule->component_functions; - mca_bcol_base_module_t *current_bcol = - component_functions[i].constant_group_data.bcol_module; - - /* silence clang warning about possible NULL dereference of component_functions. - * this case is a developer error if it occurs */ - assert (NULL != component_functions && NULL != constant_group_data); - - cnt = 0; - for (j = 0; j < n_functions; ++j) { - if (current_bcol == - component_functions[j].constant_group_data.bcol_module) { - constant_group_data->index_of_this_type_in_collective = cnt; - - ++cnt; - } - } - - component_functions[i].constant_group_data.n_of_this_type_in_collective = cnt; - } - - MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); - - /* Release temporary memories */ - free(scratch_num); - free(scratch_indx); - - return OMPI_SUCCESS; - -Const_Data_Setup_Error: - free(scratch_indx); - free(scratch_num); - - return ret; -} diff --git a/ompi/mca/coll/ml/coll_ml_inlines.h b/ompi/mca/coll/ml/coll_ml_inlines.h deleted file mode 100644 index d54b3b37aa..0000000000 --- a/ompi/mca/coll/ml/coll_ml_inlines.h +++ /dev/null @@ -1,639 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
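Every builder in this patch closes with the same quadratic pass that fills index_of_this_type_in_collective and n_of_this_type_in_collective. A standalone sketch of that counting, following the pattern of the earlier schedule builders, with ints standing in for the bcol module pointers:

#include <stdio.h>

/* For each step, count how many steps share its bcol module and give
 * each such step its ordinal among them. */
static void count_types(const int *module, int n, int *index_in_coll, int *n_in_coll)
{
    for (int i = 0; i < n; i++) {
        int cnt = 0;
        for (int j = 0; j < n; j++) {
            if (module[j] == module[i]) {
                index_in_coll[j] = cnt;   /* ordinal of step j among its type */
                cnt++;
            }
        }
        n_in_coll[i] = cnt;               /* total steps using this module */
    }
}

int main(void)
{
    int module[5] = { 7, 7, 9, 7, 9 };    /* two distinct modules */
    int idx[5], tot[5];

    count_types(module, 5, idx, tot);
    for (int i = 0; i < 5; i++) {
        printf("step %d: index %d of %d\n", i, idx[i], tot[i]);
    }
    /* module 7 gets indices 0,1,2 of 3; module 9 gets indices 0,1 of 2 */
    return 0;
}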
- * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#ifndef MCA_COLL_ML_INLINES_H -#define MCA_COLL_ML_INLINES_H - -#include "ompi_config.h" - -BEGIN_C_DECLS - -static inline __opal_attribute_always_inline__ int ml_fls(int num) -{ - int i = 1; - int j = 0; - - if (0 == num) { - return 0; - } - - while (i < num) { - i *= 2; - j++; - } - - if (i > num) { - j--; - } - - return j; -} - -static inline __opal_attribute_always_inline__ - int mca_coll_ml_buffer_recycling(mca_coll_ml_collective_operation_progress_t *ml_request) -{ - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)ml_request->coll_module; - mca_bcol_base_memory_block_desc_t *ml_memblock = ml_module->payload_block; - uint64_t bank_index = ml_request->fragment_data.buffer_desc->bank_index; - int rc; - - opal_atomic_add(&ml_memblock->bank_release_counters[bank_index], 1); - - /* Check if the bank is ready for recycling */ - if (ml_memblock->bank_release_counters[bank_index] == - ml_memblock->num_buffers_per_bank ) { - ml_memblock->ready_for_memsync[bank_index] = true; - - ML_VERBOSE(10, ("Sync count %d, bank %d", ml_memblock->memsync_counter, bank_index)); - assert(ml_memblock->bank_is_busy); - if (ml_memblock->memsync_counter == (int)bank_index) { - while(ml_memblock->ready_for_memsync[ml_memblock->memsync_counter]) { - ML_VERBOSE(10, ("Calling for service barrier: ml_buffer_index - %d %d %d == %d.", - ml_request->fragment_data.buffer_desc->buffer_index, - ml_memblock->memsync_counter, - ml_memblock->bank_release_counters[ml_memblock->memsync_counter], - ml_memblock->num_buffers_per_bank)); - /* Setting the ready flag to 0 - unready - done */ - ml_memblock->ready_for_memsync[ml_memblock->memsync_counter] = false; - - rc = mca_coll_ml_memsync_intra(ml_module, ml_memblock->memsync_counter); - if (OMPI_SUCCESS != rc) { - ML_ERROR(("Failed to start memory sync !!!")); - return rc; - } - - opal_atomic_add(&ml_memblock->memsync_counter, 1); - if (ml_memblock->memsync_counter == (int)ml_memblock->num_banks) { - ml_memblock->memsync_counter = 0; - } - ML_VERBOSE(10, ("After service barrier.")); - } - } else { - ML_VERBOSE(10, ("Out of order %d", ml_memblock->memsync_counter)); - } - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int coll_ml_fragment_completion_processing( - mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - int ret = OMPI_SUCCESS; - size_t bytes_in_this_frag; - struct full_message_t *full_msg_desc = coll_op->fragment_data.message_descriptor; - bool ready_to_release = true, out_of_resource = false; - - ML_VERBOSE(10, ("Coll_op %p processing completion", coll_op)); - /* Call unpack/pack function */ - if (OPAL_LIKELY(NULL != coll_op->process_fn)) { - ret = coll_op->process_fn(coll_op); - switch(ret) { - case OMPI_SUCCESS: - ML_VERBOSE(10, ("unpack done")); - ready_to_release = true; - break; - case ORTE_ERR_NO_MATCH_YET: - ML_VERBOSE(10, ("unexpected packet")); - ready_to_release = false; - break; - default: - ML_ERROR(("Error, unexpected error code %d", ret)); - return ret; - } - } - - bytes_in_this_frag = coll_op->fragment_data.fragment_size; - - ML_VERBOSE(10, ("Delivered %d bytes in frag %d total %d", - full_msg_desc->n_bytes_delivered, - bytes_in_this_frag, - full_msg_desc->n_bytes_total)); - - /* check for full message completion */ - if(full_msg_desc->n_bytes_delivered + bytes_in_this_frag 
== - full_msg_desc->n_bytes_total) { - /* message complete - don't update number of bytes delivered, just - * mark the message complete - */ - full_msg_desc->n_bytes_delivered += bytes_in_this_frag; - - /* decrement the number of fragments */ - full_msg_desc->n_active--; - - ML_VERBOSE(10, ("Signaling completion")); - - /* here we need to be sure that we point to the first fragment only */ - ompi_request_complete(&(coll_op->fragment_data.message_descriptor->super), true); - coll_op->fragment_data.message_descriptor->super.req_status.MPI_ERROR = OMPI_SUCCESS; - } else { - assert(NULL != coll_op->fragment_data.buffer_desc); - /* update the number of bytes delivered */ - full_msg_desc->n_bytes_delivered += bytes_in_this_frag; - /* decrement the number of fragments */ - full_msg_desc->n_active--; - /* here we need to start the next fragment */ - ML_VERBOSE(10, ("Launch frags for %p", coll_op)); - if (full_msg_desc->n_bytes_scheduled < full_msg_desc->n_bytes_total) { - ret = coll_op->fragment_data.message_descriptor->fragment_launcher(coll_op); - if (OPAL_UNLIKELY(OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret)) { - out_of_resource = true; - } else if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Failed to launch fragment")); - return ret; - } - } - } - - if (ready_to_release) { - /* Check if we have to recycle memory. - * Note: It is safe to recycle ML buffers since the ML buffer data - * already was unpacked to user buffer - */ - if (NULL != coll_op->fragment_data.buffer_desc) { - ret = mca_coll_ml_buffer_recycling(coll_op); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - } - /* if this is not fragment 0, return fragment to the free list. - * fragment 0 will be returned in mca_ml_request_free() which - * is called from the MPI wait() and test() routines. - * We can recover the pointer to the fragement descriptor from - * the MPI level request object, wich is the first element - * in the fragment descriptor. - */ - /* I contend that this is a bug. This is not the right way to check - * for the first fragment as it assumes that the first fragment would always - * for any collective have zero as the first offset or that other subsequent - * fragments would not. It is not safe to assume this. The correct check is - * the following one - */ - - ML_VERBOSE(10, ("Master ? %p %d", coll_op, coll_op->fragment_data.offset_into_user_buffer)); - /* This check is in fact a bug. Not the correct definiton of first - * fragment. 
First fragment is the only fragment that satisfies the - * following criteria - */ - /*if (0 != coll_op->fragment_data.offset_into_user_buffer && - !out_of_resource) { - */ - if (((&coll_op->full_message != coll_op->fragment_data.message_descriptor) && - !out_of_resource) || IS_COLL_SYNCMEM(coll_op)) { - /* non-zero offset ==> this is not fragment 0 */ - CHECK_AND_RECYCLE(coll_op); - } - } - - /* return */ - return OMPI_SUCCESS; -} - -/* task completion */ -static inline __opal_attribute_always_inline__ int coll_ml_task_dependency_processing( - mca_coll_ml_task_status_t *task) -{ - /* update dependencies */ - mca_coll_ml_collective_operation_progress_t *my_schedule_instance = - task->ml_coll_operation; - int n_dependent_tasks = task->rt_num_dependent_tasks; - int dep_task; - - for (dep_task = 0; dep_task < n_dependent_tasks; dep_task++) - { - int task_index; - task_index = task->rt_dependent_task_indices[dep_task]; - my_schedule_instance->dag_description.status_array[task_index].n_dep_satisfied++; - } - - /* return */ - return OMPI_SUCCESS; -} - -/* collective task completion processing - - * "task" may be removed from list in this routine. - * Thread safety is assumed to be handled outside this routine. - */ -static inline __opal_attribute_always_inline__ int mca_coll_ml_task_completion_processing( - mca_coll_ml_task_status_t **task_status_g, opal_list_t *list) -{ - /* local variables */ - int ret = OMPI_SUCCESS; - mca_coll_ml_task_status_t *task_status = *task_status_g; - - mca_coll_ml_collective_operation_progress_t *coll_op = - task_status->ml_coll_operation; - - /* Pasha: Since all our collectives so far use the root - flag, I replacing the call for custom call back function - with setting root_flag. - If we will see that we need some custom functionality, - we will enable it later. 
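/* A toy model of the byte accounting used above to detect full-message
   completion: each finished fragment adds its size to n_bytes_delivered
   and decrements the outstanding-fragment count, and the message is done
   when the delivered total reaches n_bytes_total. Single-threaded sketch
   with made-up sizes; the struct and names are illustrative only. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct msg_state {
    size_t n_bytes_total;
    size_t n_bytes_delivered;
    int    n_active;          /* outstanding fragments */
};

/* returns true when this fragment completed the whole message */
static bool frag_complete(struct msg_state *m, size_t frag_bytes)
{
    m->n_bytes_delivered += frag_bytes;
    m->n_active--;
    return m->n_bytes_delivered == m->n_bytes_total;
}

int main(void)
{
    struct msg_state m = { .n_bytes_total = 10000, .n_bytes_delivered = 0,
                           .n_active = 3 };
    printf("%d\n", frag_complete(&m, 4096));   /* 0: 4096/10000   */
    printf("%d\n", frag_complete(&m, 4096));   /* 0: 8192/10000   */
    printf("%d\n", frag_complete(&m, 1808));   /* 1: message done */
    return 0;
}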
- */ - - task_status->ml_coll_operation->variable_fn_params.root_flag = true; - -#if 0 - /* process task completion function, - if any was defined */ - if (OPAL_LIKELY(NULL != task_status->task_comp_fn)) { - ret = task_status->task_comp_fn(task_status); - if (ret != OMPI_SUCCESS) { - return ret; - } - } -#endif - - /* update dependencies */ - ret = coll_ml_task_dependency_processing(task_status); - if (ret != OMPI_SUCCESS) { - ML_VERBOSE(3,("coll_ml_task_dependency_processing failed")); - return ret; - } - - /* process task completion function, - if any was defined */ - if (OPAL_LIKELY(NULL != task_status->task_comp_fn)) { - ret = task_status->task_comp_fn(task_status); - if (ret != OMPI_SUCCESS) { - ML_VERBOSE(3,("task_comp_fn failed")); - return ret; - } - } - - /* remove the descriptor from the incomplete list - (Pasha: if the list was provided) */ - /* No need to put this on any new list - it is associated - * with the mca_coll_ml_collective_operation_progress_t - * descriptor already - */ - - if (NULL != list) { - (*task_status_g) = (mca_coll_ml_task_status_t *) - opal_list_remove_item(list, (opal_list_item_t *)(task_status)); - } - - /* update completion counter */ - coll_op->dag_description.num_tasks_completed++; - - if(coll_op->dag_description.num_tasks_completed == - coll_op->coll_schedule->n_fns) - { - /* the actual fragment descriptor is not on any list, as - * we can get at it from the task descriptors - */ - ret = coll_ml_fragment_completion_processing(coll_op); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(3,("coll_ml_fragment_completion_processing failed")); - return ret; - } - } - - /* return */ - return ret; -} - -static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectives_append_to_queue( - mca_coll_ml_collective_operation_progress_t *op_prog, - mca_coll_ml_task_setup_fn_t task_setup) -{ - int fn_index; - mca_coll_ml_collective_operation_description_t *op_desc = - op_prog->coll_schedule; - mca_coll_ml_compound_functions_t *func = NULL; - mca_coll_ml_task_status_t *task_status = NULL; - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - ML_VERBOSE(9, ("Calling mca_coll_ml_generic_collectives_append_to_queue")); - - /* Init all tasks before we start them */ - for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) { - func = &op_desc->component_functions[fn_index]; - task_status = &op_prog->dag_description.status_array[fn_index]; - - ML_VERBOSE(9, ("Processing function index %d", fn_index)); - - assert(NULL != func); - - /* Init task status */ - task_status->n_dep_satisfied = 0; /* start from zero */ - task_status->bcol_fn = func->bcol_function; - /* set up run-time parameters */ - /* Pasha: do we need the if protection? */ - if (OPAL_LIKELY(NULL != task_setup)) { - task_setup(task_status, fn_index, func); - } - - /* the pointer to operation progress is supposed to be set during - construction time.
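/* A minimal sketch of the dependency bookkeeping performed by
   coll_ml_task_dependency_processing above: completing a task bumps a
   satisfied-dependency counter on each dependent task, and a task
   becomes startable once its counter reaches its dependency count.
   Plain-C toy scheduler; the struct and field names are illustrative. */
#include <stdio.h>

#define NTASK 3

struct task {
    int num_dependencies;     /* cf. rt_num_dependencies       */
    int n_dep_satisfied;
    int num_dependents;       /* cf. rt_num_dependent_tasks    */
    int dependents[NTASK];    /* cf. rt_dependent_task_indices */
};

static void on_task_complete(struct task *tasks, int done)
{
    for (int d = 0; d < tasks[done].num_dependents; ++d) {
        int t = tasks[done].dependents[d];
        if (++tasks[t].n_dep_satisfied == tasks[t].num_dependencies) {
            printf("task %d is now startable\n", t);
        }
    }
}

int main(void)
{
    /* 0 -> 2 and 1 -> 2: task 2 starts after both 0 and 1 complete */
    struct task tasks[NTASK] = {
        { .num_dependencies = 0, .num_dependents = 1, .dependents = { 2 } },
        { .num_dependencies = 0, .num_dependents = 1, .dependents = { 2 } },
        { .num_dependencies = 2, .num_dependents = 0 },
    };

    on_task_complete(tasks, 0);
    on_task_complete(tasks, 1);   /* prints: task 2 is now startable */
    return 0;
}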
Just want to make sure that it is ok */ - assert(task_status->ml_coll_operation == op_prog); - - /* We assume that all pointers to functions are defined and there - is no reason to check for null */ - assert(NULL != func->bcol_function->coll_fn); - - /* In order to preserve ordering on all ranks we have to add it to the tail */ - /* TBD: Need to review the way we launch fragments */ - ML_VERBOSE(9, ("The task %p dependency is %d, appending it on the pending list", - (void *)task_status, func->num_dependencies)); - OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - } - - ML_VERBOSE(9, ("Collective was launched!")); - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectives_launcher( - mca_coll_ml_collective_operation_progress_t *op_prog, - mca_coll_ml_task_setup_fn_t task_setup) -{ - int fn_index; - int rc, ret; - mca_coll_ml_collective_operation_description_t *op_desc = - op_prog->coll_schedule; - mca_coll_ml_compound_functions_t *func = NULL; - mca_coll_ml_task_status_t *task_status = NULL; - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - ML_VERBOSE(9, ("Calling mca_coll_ml_generic_collectives_launcher")); - - /* Init all tasks before we start them */ - for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) { - func = &op_desc->component_functions[fn_index]; - task_status = &op_prog->dag_description.status_array[fn_index]; - - ML_VERBOSE(9, ("Processing function index %d", fn_index)); - - assert(NULL != func); - - /* Init task status */ - task_status->n_dep_satisfied = 0; /* start from zero */ - /* task_status->my_index_in_coll_schedule = fn_index; - pasha: the value is set during init */ - task_status->bcol_fn = func->bcol_function; - /* Pasha: disabling support for custom completion functions - task_status->task_comp_fn = func->task_comp_fn; - */ - - /* set up run-time parameters */ - /* Pasha: do we need the if protection? */ - if (OPAL_LIKELY(NULL != task_setup)) { - task_setup(task_status, fn_index, func); - } - - /* the pointer to operation progress is supposed to be set during - construction time.
Just want to make sure that it is ok */ - assert(task_status->ml_coll_operation == op_prog); - /* Task status is done */ - - /* launch the task and put it on the corresponding list (if required) */ - - /* We assume that all pointers to functions are defined and there - is no reason to check for null */ - assert(NULL != func->bcol_function->coll_fn); - } - - /* try to start the startable */ - for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) { - func = &op_desc->component_functions[fn_index]; - task_status = &op_prog->dag_description.status_array[fn_index]; - /* fire the collective immediately if it has no dependencies */ - if (0 == task_status->rt_num_dependencies) { - rc = func->bcol_function->coll_fn(&op_prog->variable_fn_params, - /* Pasha: Need to update the prototype of the func, - right now it is an ugly hack for compilation */ - (struct mca_bcol_base_function_t *)&func->constant_group_data); - switch(rc) { - case BCOL_FN_NOT_STARTED: - /* put it on the pending list */ - ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_NOT_STARTED, putting the task on the pending list")); - OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - break; - case BCOL_FN_STARTED: - /* put it on the started list */ - ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_STARTED, putting the task on the active list")); - OPAL_THREAD_LOCK(&(mca_coll_ml_component.active_tasks_mutex)); - opal_list_append(&cm->active_tasks, (opal_list_item_t *)task_status); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.active_tasks_mutex)); - break; - case BCOL_FN_COMPLETE: - /* the task is done! let's start the relevant dependencies */ - ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_COMPLETE")); - /* the task does not belong to any list yet, so pass NULL */ - ret = mca_coll_ml_task_completion_processing(&task_status, NULL); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(9, ("mca_coll_ml_task_completion_processing failed")); - return ret; - } - break; - default: - ML_ERROR(("Unknown exit status %d", rc)); - return OMPI_ERROR; - } - } else { - /* the task depends on others, so let's put it on the pending list */ - ML_VERBOSE(9, ("The task %p dependency is %d, putting it on the pending list", - (void *)task_status, func->num_dependencies)); - OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex)); - } - } - ML_VERBOSE(9, ("Collective was launched!")); - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t * -mca_coll_ml_alloc_op_prog_single_frag_dag( - mca_coll_ml_module_t *ml_module, - mca_coll_ml_collective_operation_description_t *coll_schedule, - const void *src, void *dst, size_t total_bytes, - size_t offset_into_user_buffer - ) -{ - opal_free_list_item_t *item; - mca_coll_ml_collective_operation_progress_t *coll_op = NULL; - ompi_request_t *req; - - /* Blocking call on fragment allocation (maybe we want to make it non-blocking?)
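/* A sketch of the three-way launch protocol dispatched on above: a bcol
   entry point may finish inline (COMPLETE), stay in flight (STARTED), or
   refuse to start (NOT_STARTED), and the caller routes the task to the
   matching queue. Toy enum and print statements stand in for the real
   queues; names are illustrative only. */
#include <stdio.h>

enum bcol_rc { FN_NOT_STARTED, FN_STARTED, FN_COMPLETE };

static void dispatch(enum bcol_rc rc, int task_id)
{
    switch (rc) {
    case FN_NOT_STARTED:
        printf("task %d -> pending queue (retry later)\n", task_id);
        break;
    case FN_STARTED:
        printf("task %d -> active queue (poll for completion)\n", task_id);
        break;
    case FN_COMPLETE:
        printf("task %d done -> release its dependents now\n", task_id);
        break;
    }
}

int main(void)
{
    dispatch(FN_COMPLETE, 0);
    dispatch(FN_STARTED, 1);
    dispatch(FN_NOT_STARTED, 2);
    return 0;
}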
*/ - item = opal_free_list_wait (&(ml_module->coll_ml_collective_descriptors)); - - coll_op = (mca_coll_ml_collective_operation_progress_t *) item; - ML_VERBOSE(10, (">>> Allocating coll op %p", coll_op)); - assert(NULL != coll_op); - assert(coll_op->dag_description.status_array[0].item.opal_list_item_refcount == 0); - req = &(coll_op->full_message.super); - - OMPI_REQUEST_INIT(req, false); - /* Mark the request ACTIVE. It is critical for MPI_Test()*/ - req->req_state = OMPI_REQUEST_ACTIVE; - req->req_status._cancelled = 0; - req->req_status.MPI_ERROR = OMPI_SUCCESS; - - MCA_COLL_ML_OP_BASIC_SETUP(coll_op, total_bytes, - offset_into_user_buffer, src, dst, coll_schedule); - - /* We do not set sequential, since it is not sequential call */ - coll_op->dag_description.num_tasks_completed = 0; - - /* Release reference counter have to be zero */ - assert(0 == coll_op->pending); - - return coll_op; -} - -static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t * -mca_coll_ml_duplicate_op_prog_single_frag_dag( - mca_coll_ml_module_t *ml_module, - mca_coll_ml_collective_operation_progress_t *old_op) -{ - mca_coll_ml_collective_operation_progress_t *new_op = NULL; - - new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[old_op->fragment_data.current_coll_op], - old_op->fragment_data.message_descriptor->dest_user_addr, - (void *)old_op->fragment_data.message_descriptor->src_user_addr, - old_op->fragment_data.message_descriptor->n_bytes_total, - old_op->fragment_data.message_descriptor->n_bytes_scheduled); - - new_op->fragment_data.current_coll_op = old_op->fragment_data.current_coll_op; - new_op->fragment_data.message_descriptor = old_op->fragment_data.message_descriptor; - - return new_op; -} - -static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t * - mca_coll_ml_alloc_op_prog_single_frag_seq( - mca_coll_ml_module_t *ml_module, - mca_coll_ml_collective_operation_description_t *coll_schedule, - void *src, void *dst, - size_t total_bytes, - size_t offset_into_user_buffer - ) -{ - opal_free_list_item_t *item; - mca_coll_ml_collective_operation_progress_t *coll_op = NULL; - - /* Blocking call on fragment allocation (Maybe we want to make it non blocking ?) */ - item = opal_free_list_wait (&(ml_module->coll_ml_collective_descriptors)); - - coll_op = (mca_coll_ml_collective_operation_progress_t *) item; - - assert(NULL != coll_op); - - MCA_COLL_ML_OP_BASIC_SETUP(coll_op, total_bytes, - offset_into_user_buffer, src, dst, coll_schedule); - - /* set sequential data */ - /* pasha - do we have something to set ? 
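/* A sketch of the descriptor free-list pattern used by the allocation
   helpers above: descriptors are pre-created, a blocking "wait" hands
   one out (recycled, so per-use fields must be re-initialized), and
   completion returns it. Toy fixed-size pool instead of
   opal_free_list_t; names are illustrative. */
#include <stdio.h>

#define NDESC 2

struct desc { int in_use; int seq; };

static struct desc pool[NDESC];

static struct desc *desc_wait(void)
{
    /* a real free list would block and progress here instead of failing */
    for (int i = 0; i < NDESC; ++i) {
        if (!pool[i].in_use) {
            pool[i].in_use = 1;
            pool[i].seq = 0;      /* re-init recycled state */
            return &pool[i];
        }
    }
    return NULL;
}

static void desc_return(struct desc *d) { d->in_use = 0; }

int main(void)
{
    struct desc *a = desc_wait();
    struct desc *b = desc_wait();
    printf("%p %p\n", (void *)a, (void *)b);
    desc_return(a);
    struct desc *c = desc_wait();  /* recycles a's slot */
    printf("%p\n", (void *)c);
    desc_return(b); desc_return(c);
    return 0;
}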
*/ - - return coll_op; -} - -static inline __opal_attribute_always_inline__ - void mca_coll_ml_convertor_get_send_frag_size(mca_coll_ml_module_t *ml_module, - size_t *frag_size, struct full_message_t *message_descriptor) -{ - size_t fragment_size = *frag_size; - opal_convertor_t *dummy_convertor = &message_descriptor->dummy_convertor; - - /* The last frag needs special service */ - if (fragment_size > - (size_t) message_descriptor->send_converter_bytes_packed) { - *frag_size = message_descriptor->send_converter_bytes_packed; - message_descriptor->send_converter_bytes_packed = 0; - - return; - } - if( (message_descriptor->dummy_conv_position + fragment_size) > - message_descriptor->n_bytes_total ) { - message_descriptor->dummy_conv_position = (message_descriptor->dummy_conv_position + fragment_size) - - message_descriptor->n_bytes_total; - } else { - message_descriptor->dummy_conv_position += fragment_size; - } - - opal_convertor_generic_simple_position(dummy_convertor, &message_descriptor->dummy_conv_position); - *frag_size -= dummy_convertor->partial_length; - - message_descriptor->send_converter_bytes_packed -= (*frag_size); -} - -static inline __opal_attribute_always_inline__ int -mca_coll_ml_launch_sequential_collective (mca_coll_ml_collective_operation_progress_t *coll_op) -{ - mca_bcol_base_coll_fn_desc_t *bcol_func; - int ifunc, n_fn, ih, ret; - mca_coll_ml_collective_operation_description_t *sched = - coll_op->coll_schedule; - - n_fn = sched->n_fns; - ih = coll_op->sequential_routine.current_active_bcol_fn; - - /* if collectives are already pending just add this one to the list */ - if (opal_list_get_size (&mca_coll_ml_component.sequential_collectives)) { - opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *) coll_op); - - return OMPI_SUCCESS; - } - - for (ifunc = ih; ifunc < n_fn; ifunc++, coll_op->sequential_routine.current_active_bcol_fn++) { - ret = coll_op->sequential_routine.seq_task_setup(coll_op); - if (OMPI_SUCCESS != ret) { - return ret; - } - - bcol_func = (sched->component_functions[ifunc].bcol_function); - ret = bcol_func->coll_fn(&coll_op->variable_fn_params, - (struct mca_bcol_base_function_t *) &sched->component_functions[ifunc].constant_group_data); - - if (BCOL_FN_COMPLETE == ret) { - if (ifunc == n_fn - 1) { - ret = coll_ml_fragment_completion_processing(coll_op); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); - } - - return OMPI_SUCCESS; - } - } else { - if (BCOL_FN_STARTED == ret) { - coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG; - } else { - coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - } - - ML_VERBOSE(10, ("Adding pending bcol to the progress list to access by ml_progress func-id %d", ifunc)); - opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *) coll_op); - - break; - } - } - - return OMPI_SUCCESS; -} - -END_C_DECLS - -#endif diff --git a/ompi/mca/coll/ml/coll_ml_lex.h b/ompi/mca/coll/ml/coll_ml_lex.h deleted file mode 100644 index d09fe45bf9..0000000000 --- a/ompi/mca/coll/ml/coll_ml_lex.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef COLL_ML_LEX_H_ -#define COLL_ML_LEX_H_ - -#include "opal_config.h" -#include - -BEGIN_C_DECLS - -int coll_ml_config_yylex(void); -int coll_ml_config_init_buffer(FILE *file); -int coll_ml_config_yylex_destroy(void); - -extern FILE *coll_ml_config_yyin; -extern bool coll_ml_config_parse_done; -extern char *coll_ml_config_yytext; -extern int 
coll_ml_config_yynewlines; - -/* - * Make lex-generated files not issue compiler warnings - */ -#define YY_STACK_USED 0 -#define YY_ALWAYS_INTERACTIVE 0 -#define YY_NEVER_INTERACTIVE 0 -#define YY_MAIN 0 -#define YY_NO_UNPUT 1 -#define YY_SKIP_YYWRAP 1 - -enum { - COLL_ML_CONFIG_PARSE_DONE, - COLL_ML_CONFIG_PARSE_ERROR, - COLL_ML_CONFIG_PARSE_NEWLINE, - COLL_ML_CONFIG_PARSE_SECTION, - COLL_ML_CONFIG_PARSE_COLLECTIVE, - COLL_ML_CONFIG_PARSE_EQUAL, - COLL_ML_CONFIG_PARSE_SINGLE_WORD, - COLL_ML_CONFIG_PARSE_VALUE, - COLL_ML_CONFIG_PARSE_MAX -}; -END_C_DECLS -#endif diff --git a/ompi/mca/coll/ml/coll_ml_lex.l b/ompi/mca/coll/ml/coll_ml_lex.l deleted file mode 100644 index 45c1e0aefb..0000000000 --- a/ompi/mca/coll/ml/coll_ml_lex.l +++ /dev/null @@ -1,141 +0,0 @@ -%option nounput -%option noinput - -%{ /* -*- C -*- */ -#include "opal_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "coll_ml_lex.h" - -BEGIN_C_DECLS - -/* - * local functions - */ -static int coll_ml_config_yywrap(void); - -END_C_DECLS - -/* - * global variables - */ -int coll_ml_config_yynewlines = 1; -bool coll_ml_config_parse_done = false; -char *coll_ml_config_string = NULL; - -%} - -WHITE [\f\t\v ] -CHAR [A-Za-z0-9_\-\.] -NAME_CHAR [A-Za-z0-9_\-\.\\\/] - -%x comment -%x section_name -%x collective_name -%x section_end -%x collective_end -%x value - -%% - -{WHITE}*\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } -#.*\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } -"//".*\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } - -"/*" { BEGIN(comment); - return COLL_ML_CONFIG_PARSE_NEWLINE; } -[^*\n]* ; /* Eat up non '*'s */ -"*"+[^*/\n]* ; /* Eat '*'s not followed by a '/' */ -\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } -"*"+"/" { BEGIN(INITIAL); /* Done with block comment */ - return COLL_ML_CONFIG_PARSE_NEWLINE; } - -{WHITE}*\[{WHITE}* { BEGIN(collective_name); } -({NAME_CHAR}|{WHITE})*{NAME_CHAR}/{WHITE}*\] { - BEGIN(collective_end); - return COLL_ML_CONFIG_PARSE_COLLECTIVE; } -\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_ERROR; } -. { return COLL_ML_CONFIG_PARSE_ERROR; } -{WHITE}*\]{WHITE}*\n { - BEGIN(INITIAL); - ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } - -{WHITE}*\<{WHITE}* { BEGIN(section_name); } -({NAME_CHAR}|{WHITE})*{NAME_CHAR}/{WHITE}*\> { - BEGIN(section_end); - return COLL_ML_CONFIG_PARSE_SECTION; } -\n { ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_ERROR; } -. { return COLL_ML_CONFIG_PARSE_ERROR; } -{WHITE}*\>{WHITE}*\n { - BEGIN(INITIAL); - ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } - -{WHITE}*"="{WHITE}* { BEGIN(value); - return COLL_ML_CONFIG_PARSE_EQUAL; } -{WHITE}+ ; /* whitespace */ -{CHAR}+ { return COLL_ML_CONFIG_PARSE_SINGLE_WORD; } - -{WHITE}*\n { BEGIN(INITIAL); - ++coll_ml_config_yynewlines; - return COLL_ML_CONFIG_PARSE_NEWLINE; } -[^\n]*[^\t \n]/[\t ]* { - return COLL_ML_CONFIG_PARSE_VALUE; } - -. { return COLL_ML_CONFIG_PARSE_ERROR; } -%% - -/* Old flex (2.5.4a? 
and older) does not define a destroy function */ -#if !defined(YY_FLEX_SUBMINOR_VERSION) -#define YY_FLEX_SUBMINOR_VERSION 0 -#endif - -#if (YY_FLEX_MAJOR_VERSION < 2) || (YY_FLEX_MAJOR_VERSION == 2 && (YY_FLEX_MINOR_VERSION < 5 || (YY_FLEX_MINOR_VERSION == 5 && YY_FLEX_SUBMINOR_VERSION < 5))) -int coll_ml_config_yylex_destroy(void) -{ - if (NULL != YY_CURRENT_BUFFER) { - yy_delete_buffer(YY_CURRENT_BUFFER); -#if defined(YY_CURRENT_BUFFER_LVALUE) - YY_CURRENT_BUFFER_LVALUE = NULL; -#else - YY_CURRENT_BUFFER = NULL; -#endif /* YY_CURRENT_BUFFER_LVALUE */ - } - return YY_NULL; -} -#endif - -static int coll_ml_config_yywrap(void) -{ - coll_ml_config_parse_done = true; - return 1; -} - - -/* - * Ensure that we have a valid yybuffer to use. Specifically, if this - * scanner is invoked a second time, finish_parsing() (above) will - * have been executed, and the current buffer will have been freed. - * Flex doesn't recognize this fact because as far as it's concerned, - * its internal state was already initialized, so it thinks it should - * have a valid buffer. Hence, here we ensure to give it a valid - * buffer. - */ -int coll_ml_config_init_buffer(FILE *file) -{ - YY_BUFFER_STATE buf = yy_create_buffer(file, YY_BUF_SIZE); - yy_switch_to_buffer(buf); - - return 0; -} diff --git a/ompi/mca/coll/ml/coll_ml_lmngr.c b/ompi/mca/coll/ml/coll_ml_lmngr.c deleted file mode 100644 index 1be3f4afe6..0000000000 --- a/ompi/mca/coll/ml/coll_ml_lmngr.c +++ /dev/null @@ -1,330 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "opal/class/opal_list.h" -#include "opal/threads/mutex.h" -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_mca.h" -#include "coll_ml_lmngr.h" -#ifndef HAVE_POSIX_MEMALIGN -#include "opal/align.h" -#include "opal_stdint.h" -#endif -#include "opal/util/sys_limits.h" - -/* Constructor for list memory manager */ -static void construct_lmngr(mca_coll_ml_lmngr_t *lmngr) -{ - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - - ML_VERBOSE(7, ("Constructing new list manager %p", (void *)lmngr)); - - /* No real memory is allocated, only basic init. 
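/* A hypothetical configuration fragment in the shape this scanner
 * accepts: '[name]' opens a collective block, '<name>' opens a section,
 * 'key = value' pairs fill it, and '#', '//' and C-style comments are
 * all skipped. The key and value names below are invented for
 * illustration; the real keys are defined by the coll/ml config parser,
 * not by this lexer.
 *
 *     # hypothetical mca-coll-ml.config fragment
 *     [ALLREDUCE]
 *     <small_msg>          // a section inside the collective block
 *     algorithm = static
 *     threshold = 256
 */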
- The real memory will be allocated on demand, on first block allocation */ - - /* We cache the block size, alignment and list size here, - since in the future we may want to define different parameters - for lists */ - lmngr->list_block_size = cm->lmngr_block_size; - lmngr->list_alignment = cm->lmngr_alignment; - lmngr->list_size = cm->lmngr_size; - lmngr->n_resources = 0; - lmngr->base_addr = NULL; /* If the base addr is not null, the struct was initialized - and memory was allocated */ - /* Not sure that the lock is required */ - OBJ_CONSTRUCT(&lmngr->mem_lock, opal_mutex_t); - - /* Only construct the list, no memory initialization */ - OBJ_CONSTRUCT(&lmngr->blocks_list, opal_list_t); -} - -static void destruct_lmngr(mca_coll_ml_lmngr_t *lmngr) -{ - int max_nc = lmngr->n_resources; - int rc, i; - bcol_base_network_context_t *nc; - opal_list_item_t *item; - - ML_VERBOSE(6, ("Destructing list manager %p", (void *)lmngr)); - - while (NULL != (item = opal_list_remove_first(&lmngr->blocks_list))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&lmngr->blocks_list); - - if (NULL != lmngr->alloc_base) { - for( i = 0; i < max_nc; i++ ) { - nc = lmngr->net_context[i]; - rc = nc->deregister_memory_fn(nc->context_data, - lmngr->reg_desc[nc->context_id]); - if(rc != OMPI_SUCCESS) { - ML_ERROR(("Failed to unregister, lmngr %p", (void *)lmngr)); - } - } - - ML_VERBOSE(10, ("Release base addr %p", lmngr->alloc_base)); - - free(lmngr->alloc_base); - lmngr->alloc_base = NULL; - lmngr->base_addr = NULL; - } - - lmngr->list_block_size = 0; - lmngr->list_alignment = 0; - lmngr->list_size = 0; - lmngr->n_resources = 0; - - OBJ_DESTRUCT(&lmngr->mem_lock); -} - -OBJ_CLASS_INSTANCE(mca_coll_ml_lmngr_t, - opal_object_t, - construct_lmngr, - destruct_lmngr); - -int mca_coll_ml_lmngr_tune(mca_coll_ml_lmngr_t *lmngr, - size_t block_size, size_t list_size, size_t alignment) -{ - ML_VERBOSE(7, ("Tuning list manager")); - - /* base_addr is non-NULL once the pool has been allocated, so tuning - is only allowed before the first allocation */ - if (OPAL_UNLIKELY(NULL != lmngr->base_addr)) { - ML_VERBOSE(7, ("The list manager is already initialized, you can not tune it")); - return OMPI_ERROR; - } - - lmngr->list_block_size = block_size; - lmngr->list_alignment = alignment; - lmngr->list_size = list_size; - - return OMPI_SUCCESS; -} - -int mca_coll_ml_lmngr_reg(void) -{ - int tmp, ret = OMPI_SUCCESS; - - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - -#define CHECK(expr) do {\ - tmp = (expr); \ - if (0 > tmp) ret = tmp; \ - } while (0) - - ML_VERBOSE(7, ("Setting parameters for list manager")); - - cm->lmngr_size = 8; - CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - "memory_manager_list_size", "Memory manager list size", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cm->lmngr_size)); - - /* The list size can't be less than the possible max of ML modules, - i.e. the max number of communicators supported by ML */ - if (cm->lmngr_size < cm->max_comm) { - cm->lmngr_size = cm->max_comm; - } - - mca_coll_ml_component.lmngr_block_size = cm->payload_buffer_size * - cm->n_payload_buffs_per_bank * - cm->n_payload_mem_banks * - cm->lmngr_size; - - CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - "memory_manager_block_size", "Memory manager block size", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_ml_component.lmngr_block_size)); - - cm->lmngr_alignment = opal_getpagesize(); - CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - "memory_manager_alignment",
"Memory manager alignment", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_ml_component.lmngr_block_size)); - - return ret; -} - -static int lmngr_register(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc) -{ - int rc, j; - int max_nc = lmngr->n_resources; - - rc = nc->register_memory_fn(nc->context_data, - lmngr->base_addr, - lmngr->list_size * lmngr->list_block_size, - &lmngr->reg_desc[nc->context_id]); - - if(rc != OMPI_SUCCESS) { - int ret_val; - ML_VERBOSE(7, ("Failed to register [%d], unrolling the registration", rc)); - /* deregistser the successful registrations */ - for( j = 0; j < max_nc; j++ ) { - /* set the registration parameter to point to the current - * resource description */ - nc = lmngr->net_context[j]; - ret_val = nc->deregister_memory_fn(nc->context_data, - lmngr->reg_desc[nc->context_id]); - if(ret_val != OMPI_SUCCESS) { - return ret_val; - } - } - - return rc; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_lmngr_init(mca_coll_ml_lmngr_t *lmngr) -{ - int i, num_blocks; - int rc; - unsigned char *addr; - bcol_base_network_context_t *nc; - - ML_VERBOSE(7, ("List initialization")); - -#ifdef HAVE_POSIX_MEMALIGN - if((errno = posix_memalign(&lmngr->base_addr, - lmngr->list_alignment, - lmngr->list_size * lmngr->list_block_size)) != 0) { - ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno))); - return OMPI_ERROR; - } - lmngr->alloc_base = lmngr->base_addr; -#else - lmngr->alloc_base = - malloc(lmngr->list_size * lmngr->list_block_size + lmngr->list_alignment); - if(NULL == lmngr->alloc_base) { - ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno))); - return OMPI_ERROR; - } - - lmngr->base_addr = (void*)OPAL_ALIGN((uintptr_t)lmngr->alloc_base, - lmngr->list_alignment, uintptr_t); -#endif - - assert(lmngr->n_resources < MCA_COLL_ML_MAX_REG_INFO); - - for(i= 0 ;i < lmngr->n_resources ;i++) { - nc = lmngr->net_context[i]; - ML_VERBOSE(7, ("Call registration for resource index %d", i)); - rc = lmngr_register(lmngr, nc); - if (OMPI_SUCCESS != rc) { - ML_ERROR(("Failed to lmngr register: %d [%s]", errno, strerror(errno))); - return rc; - } - } - - /* slice the memory to blocks */ - addr = (unsigned char *) lmngr->base_addr; - for(num_blocks = 0; num_blocks < (int)lmngr->list_size; num_blocks++) { - mca_bcol_base_lmngr_block_t *item = OBJ_NEW(mca_bcol_base_lmngr_block_t); - item->base_addr = (void *)addr; - item->lmngr = lmngr; - /* ML_VERBOSE(10, ("Appending block # %d %p", num_blocks, (void *)addr)); */ - opal_list_append(&lmngr->blocks_list, (opal_list_item_t *)item); - /* advance the address */ - addr += lmngr->list_block_size; - } - - ML_VERBOSE(7, ("List initialization done %d", - opal_list_get_size(&lmngr->blocks_list))); - return OMPI_SUCCESS; -} - -mca_bcol_base_lmngr_block_t* mca_coll_ml_lmngr_alloc ( - mca_coll_ml_lmngr_t *lmngr) -{ - int rc; - opal_list_t *list = &lmngr->blocks_list; - - /* Check if the list manager was initialized */ - if(OPAL_UNLIKELY(NULL == lmngr->base_addr)) { - ML_VERBOSE(7 ,("Starting memory initialization")); - rc = mca_coll_ml_lmngr_init(lmngr); - if (OMPI_SUCCESS != rc) { - ML_ERROR(("Failed to init memory")); - return NULL; - } - } - - if(OPAL_UNLIKELY(opal_list_is_empty(list))) { - /* Upper layer need to handle the NULL */ - ML_VERBOSE(1, ("List manager is empty.")); - return NULL; - } - - return (mca_bcol_base_lmngr_block_t *)opal_list_remove_first(list); -} - -void mca_coll_ml_lmngr_free(mca_bcol_base_lmngr_block_t *block) 
-{ - opal_list_append(&block->lmngr->blocks_list, (opal_list_item_t *)block); -} - -int mca_coll_ml_lmngr_append_nc(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc) -{ - int i, rc; - - ML_VERBOSE(7, ("Append new network context %p to list manager %p", - nc, lmngr)); - - if (NULL == nc) { - return OMPI_ERROR; - } - - /* check if we already have the context on the list. - if we do have - do not do anything, just return success - */ - if (OPAL_UNLIKELY(MCA_COLL_ML_MAX_REG_INFO == lmngr->n_resources)) { - ML_ERROR(("MPI overflows maximum supported network contexts is %d", MCA_COLL_ML_MAX_REG_INFO)); - return OMPI_ERROR; - } - - for (i = 0; i < lmngr->n_resources; i++) { - if (lmngr->net_context[i] == nc) { - ML_VERBOSE(7, ("It is not new ")); - return OMPI_SUCCESS; - } - } - - ML_VERBOSE(7, ("Adding new context")); - - /* Setting context id */ - nc->context_id = lmngr->n_resources; - lmngr->net_context[lmngr->n_resources] = nc; - - lmngr->n_resources++; - - /* Register the memory with new context */ - if (NULL != lmngr->base_addr) { - rc = lmngr_register(lmngr, nc); - if (OMPI_SUCCESS == rc) { - return rc; - } - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_lmngr.h b/ompi/mca/coll/ml/coll_ml_lmngr.h deleted file mode 100644 index c07b3802b5..0000000000 --- a/ompi/mca/coll/ml/coll_ml_lmngr.h +++ /dev/null @@ -1,81 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_ML_LMNGR_H -#define MCA_ML_LMNGR_H - -#include "ompi_config.h" -#include "opal/class/opal_list.h" -#include "ompi/mca/bcol/bcol.h" - -#define MCA_COLL_ML_MAX_REG_INFO 32 - -/* LMNGR - List manager for registred memory */ -struct mca_coll_ml_lmngr_t { - opal_object_t super; - /* lock to control list access */ - opal_mutex_t mem_lock; - - /* list of memory chunks */ - opal_list_t blocks_list; - - /* base (allocated) address of the memory pool */ - void* base_addr; - void *alloc_base; - - /* size of memory chunks */ - size_t list_block_size; - - /* memory chunk alignment */ - size_t list_alignment; - - /* init list size */ - size_t list_size; - - /* number network context of resources - In other words, number of different registration - functions that will be used. For example in case - of iboffload for each device (PD) we will have - different entry - */ - int n_resources; - - /* registration descriptor */ - void * reg_desc[MCA_COLL_ML_MAX_REG_INFO]; - - /* bcol network context array */ - struct bcol_base_network_context_t * net_context[MCA_COLL_ML_MAX_REG_INFO]; -}; -typedef struct mca_coll_ml_lmngr_t mca_coll_ml_lmngr_t; -OBJ_CLASS_DECLARATION(mca_coll_ml_lmngr_t); - -/* read user defined parametres for list manager */ -int mca_coll_ml_lmngr_reg(void); -/* If programmer want to user other than default mca -parametres, he can use the tune function. 
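/* A sketch of the pool carve-up done by mca_coll_ml_lmngr_init above:
   allocate one region of list_size * block_size bytes, then slice it
   into equal blocks whose addresses would go on a free list. It shows
   the manual over-allocate-and-round-up trick used on the
   non-posix_memalign path. Plain C; the sizes are illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const size_t block_size = 4096, nblocks = 8, align = 4096;

    /* over-allocate, then round the base up to the alignment */
    unsigned char *alloc_base = malloc(nblocks * block_size + align);
    if (NULL == alloc_base) return 1;

    unsigned char *base = (unsigned char *)
        (((uintptr_t)alloc_base + align - 1) & ~(uintptr_t)(align - 1));

    /* slice the region into blocks; a real pool would queue these */
    for (size_t i = 0; i < nblocks; ++i) {
        unsigned char *blk = base + i * block_size;
        printf("block %zu at %p\n", i, (void *)blk);
    }

    free(alloc_base);   /* free the original pointer, not 'base' */
    return 0;
}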
The tune -function must be run before list initialization, -otherway error will be returned */ -int mca_coll_ml_lmngr_tune(mca_coll_ml_lmngr_t *lmngr, - size_t block_size, size_t list_size, size_t alignment); - -/* Append new network context to the existing list memory manager */ -int mca_coll_ml_lmngr_append_nc(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc); - -/* Allocate a block from memory list manager */ -mca_bcol_base_lmngr_block_t* mca_coll_ml_lmngr_alloc ( - mca_coll_ml_lmngr_t *lmngr); - -/* Return block to list memory manager */ -void mca_coll_ml_lmngr_free (mca_bcol_base_lmngr_block_t *block); - -#endif diff --git a/ompi/mca/coll/ml/coll_ml_mca.c b/ompi/mca/coll/ml/coll_ml_mca.c deleted file mode 100644 index dd1e0ba49c..0000000000 --- a/ompi/mca/coll/ml/coll_ml_mca.c +++ /dev/null @@ -1,300 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_mca.h" -#include "coll_ml_lmngr.h" -#include "ompi/patterns/net/netpatterns.h" -#include "opal/mca/installdirs/installdirs.h" - -/* - * Local flags - */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - REGINT_MAX = 0x88 -}; - -enum { - REGSTR_EMPTY_OK = 0x01, - REGSTR_MAX = 0x88 -}; - -/* - * Enumerators - */ -mca_base_var_enum_value_t fragmentation_enable_enum[] = { - {0, "disable"}, - {1, "enable"}, - {2, "auto"}, - {-1, NULL} -}; - -mca_base_var_enum_value_t bcast_algorithms[] = { - {COLL_ML_STATIC_BCAST, "static"}, - {COLL_ML_SEQ_BCAST, "sequential"}, - {COLL_ML_UNKNOWN_BCAST, "unknown-root"}, - {-1, NULL} -}; - -/* - * utility routine for string parameter registration - */ -static int reg_string(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - const char* default_value, char **storage, - int flags) -{ - int index; - - *storage = (char *) default_value; - index = mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "coll", "ml", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -/* - * utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); 
- if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "coll", "ml", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OMPI_SUCCESS; - } - - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -static int reg_bool(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - bool default_value, bool *storage) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, - NULL, 0, 0,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "coll", "ml", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - return OMPI_SUCCESS; -} - -static int reg_ullint(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - unsigned long long default_value, unsigned long long *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, - NULL, 0, 0,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "coll", "ml", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if ((0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_verify_params(void) -{ - int dummy; - - /* Make sure that the the number of memory banks is a power of 2 */ - mca_coll_ml_component.n_payload_mem_banks = - ompi_roundup_to_power_radix(2, mca_coll_ml_component.n_payload_mem_banks, - &dummy); - - /* Make sure that the the number of buffers is a power of 2 */ - mca_coll_ml_component.n_payload_buffs_per_bank = - ompi_roundup_to_power_radix(2, mca_coll_ml_component.n_payload_buffs_per_bank, - &dummy); - - return OMPI_SUCCESS; -} - -int mca_coll_ml_register_params(void) -{ - mca_base_var_enum_t *new_enum; - int ret, tmp; - char *str = NULL; - - ret = OMPI_SUCCESS; -#define CHECK(expr) do { \ - tmp = (expr); \ - if (OMPI_SUCCESS != tmp) ret = tmp; \ - } while (0) - - /* register openib component parameters */ - - CHECK(reg_int("priority", NULL, "ML component priority" - "(from 0(low) to 90 (high))", 0, &mca_coll_ml_component.ml_priority, 0)); - - CHECK(reg_int("verbose", NULL, "Output some verbose ML information " - "(0 = no output, nonzero = output)", 0, &mca_coll_ml_component.verbose, 0)); - - CHECK(reg_int("max_comm", NULL, "Maximum number of communicators that can use coll/ml", 24, - (int *) &mca_coll_ml_component.max_comm, 0)); - - CHECK(reg_int("min_comm_size", NULL, "Minimum size of communicator to use coll/ml", 0, - &mca_coll_ml_component.min_comm_size, 0)); - - CHECK(reg_int("n_payload_mem_banks", NULL, "Number of payload memory banks", 2, - &mca_coll_ml_component.n_payload_mem_banks, 0)); - - 
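/* A standalone sketch of the REGINT_* flag checks used by reg_int
   above: after a parameter is registered, bit flags select which values
   count as valid. Same flag semantics as the local enum; the toy
   check_int helper and its return codes are illustrative. */
#include <stdio.h>

enum {
    REGINT_NEG_ONE_OK = 0x01,
    REGINT_GE_ZERO    = 0x02,
    REGINT_GE_ONE     = 0x04,
    REGINT_NONZERO    = 0x08,
};

static int check_int(int value, int flags)
{
    if ((flags & REGINT_NEG_ONE_OK) && -1 == value) return 0; /* ok  */
    if (((flags & REGINT_GE_ZERO) && value < 0) ||
        ((flags & REGINT_GE_ONE)  && value < 1) ||
        ((flags & REGINT_NONZERO) && 0 == value)) {
        return -1;                                            /* bad */
    }
    return 0;
}

int main(void)
{
    printf("%d\n", check_int(-1, REGINT_NEG_ONE_OK | REGINT_GE_ONE)); /* 0  */
    printf("%d\n", check_int(0,  REGINT_GE_ONE));                     /* -1 */
    printf("%d\n", check_int(5,  REGINT_GE_ZERO | REGINT_NONZERO));   /* 0  */
    return 0;
}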
CHECK(reg_int("n_payload_buffs_per_bank", NULL, "Number of payload buffers per bank", 16, - &mca_coll_ml_component.n_payload_buffs_per_bank, 0)); - - /* RLG: need to handle alignment and size */ - CHECK(reg_ullint("payload_buffer_size", NULL, "Size of payload buffers", 4*1024, - &mca_coll_ml_component.payload_buffer_size, 0)); - - /* get the pipeline depth, default is 2 */ - CHECK(reg_int("pipeline_depth", NULL, "Size of fragmentation pipeline", 2, - &mca_coll_ml_component.pipeline_depth, 0)); - - CHECK(reg_int("free_list_init_size", NULL, "Initial size of free lists in coll/ml", 128, - &mca_coll_ml_component.free_list_init_size, 0)); - - CHECK(reg_int("free_list_grow_size", NULL, "Initial size of free lists in coll/ml", 64, - &mca_coll_ml_component.free_list_grow_size, 0)); - - CHECK(reg_int("free_list_max_size", NULL, "Initial size of free lists in coll/ml", -1, - &mca_coll_ml_component.free_list_max_size, 0)); - - mca_coll_ml_component.use_knomial_allreduce = 1; - - tmp = mca_base_var_enum_create ("coll_ml_bcast_algorithm", bcast_algorithms, &new_enum); - if (OPAL_SUCCESS != tmp) { - return tmp; - } - - mca_coll_ml_component.bcast_algorithm = COLL_ML_STATIC_BCAST; - tmp = mca_base_component_var_register (&mca_coll_ml_component.super.collm_version, "bcast_algorithm", - "Algorithm to use for broadcast", MCA_BASE_VAR_TYPE_INT, - new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_ml_component.bcast_algorithm); - OBJ_RELEASE(new_enum); - if (0 > tmp) { - ret = tmp; - } - - CHECK(reg_bool("disable_allgather", NULL, "Disable Allgather", false, - &mca_coll_ml_component.disable_allgather)); - - CHECK(reg_bool("disable_reduce", NULL, "Disable Reduce", false, - &mca_coll_ml_component.disable_reduce)); - - tmp = mca_base_var_enum_create ("coll_ml_enable_fragmentation_enum", fragmentation_enable_enum, &new_enum); - if (OPAL_SUCCESS != tmp) { - return tmp; - } - - /* default to auto-enable fragmentation */ - mca_coll_ml_component.enable_fragmentation = 2; - tmp = mca_base_component_var_register (&mca_coll_ml_component.super.collm_version, "enable_fragmentation", - "Disable/Enable fragmentation for large messages", MCA_BASE_VAR_TYPE_INT, - new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_ml_component.enable_fragmentation); - if (0 > tmp) { - ret = tmp; - } - OBJ_RELEASE(new_enum); - - asprintf(&str, "%s/mca-coll-ml.config", - opal_install_dirs.opaldatadir); - if (NULL == str) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - CHECK(reg_string("config_file", NULL, - "ML collectives configuration file", - str, &mca_coll_ml_component.config_file_name, - 0)); - free(str); - - /* Reading parameters for list manager */ - CHECK(mca_coll_ml_lmngr_reg()); - - /* Verify the parameters */ - CHECK(mca_coll_ml_verify_params()); - - return ret; -} diff --git a/ompi/mca/coll/ml/coll_ml_mca.h b/ompi/mca/coll/ml/coll_ml_mca.h deleted file mode 100644 index 7730bd284d..0000000000 --- a/ompi/mca/coll/ml/coll_ml_mca.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
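/* A sketch of the power-of-two rounding that mca_coll_ml_verify_params
   above relies on for the bank and buffer counts, assuming round-up
   semantics; ompi_roundup_to_power_radix itself is not reproduced here,
   and this standalone helper is illustrative only. */
#include <stdio.h>

static unsigned roundup_pow2(unsigned v)
{
    unsigned p = 1;
    while (p < v) p <<= 1;   /* smallest power of two >= v */
    return p;
}

int main(void)
{
    printf("%u\n", roundup_pow2(2));   /* 2  */
    printf("%u\n", roundup_pow2(3));   /* 4  */
    printf("%u\n", roundup_pow2(16));  /* 16 */
    printf("%u\n", roundup_pow2(17));  /* 32 */
    return 0;
}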
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - /** @file */ - -#ifndef MCA_COLL_ML_MCA_H -#define MCA_COLL_ML_MCA_H - -#include -#include "ompi_config.h" - -int mca_coll_ml_register_params(void); - -#endif diff --git a/ompi/mca/coll/ml/coll_ml_memsync.c b/ompi/mca/coll/ml/coll_ml_memsync.c deleted file mode 100644 index de0c322cfb..0000000000 --- a/ompi/mca/coll/ml/coll_ml_memsync.c +++ /dev/null @@ -1,175 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/coll/coll.h" -#include "opal/sys/atomic.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" - -static int mca_coll_ml_memsync_recycle_memory(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)coll_op->coll_module; - mca_bcol_base_memory_block_desc_t *ml_memblock = ml_module->payload_block; - mca_coll_ml_collective_operation_progress_t *pending_op = NULL; - int bank = coll_op->full_message.bank_index_to_recycle; - int rc; - bool have_resources = true; - - assert(bank >= 0 || - bank < (int)ml_memblock->num_banks || - ML_MEMSYNC == coll_op->fragment_data.current_coll_op); - - ML_VERBOSE(10,("MEMSYNC: bank %d was recycled coll_op %p", bank, coll_op)); - - /* set the bank as free */ - - ml_memblock->bank_is_busy[bank] = false; - ml_memblock->bank_release_counters[bank] = 0; - - /* Check if we have any requests that are waiting for memory */ - while(opal_list_get_size(&ml_module->waiting_for_memory_list) && have_resources) { - pending_op = (mca_coll_ml_collective_operation_progress_t *) - opal_list_get_first(&ml_module->waiting_for_memory_list); - - ML_VERBOSE(10, ("Trying to start pending %p", pending_op)); - assert(pending_op->pending & REQ_OUT_OF_MEMORY); - rc = pending_op->fragment_data.message_descriptor->fragment_launcher(pending_op); - switch (rc) { - case OMPI_SUCCESS: - ML_VERBOSE(10, ("Pending fragment was started %p", pending_op)); - pending_op->pending ^= REQ_OUT_OF_MEMORY; - opal_list_remove_item(&ml_module->waiting_for_memory_list, - (opal_list_item_t *)pending_op); - if (0 != pending_op->fragment_data.offset_into_user_buffer) { - /* non-zero offset ==> this is not fragment 0 */ - CHECK_AND_RECYCLE(pending_op); - } - break; - case OMPI_ERR_TEMP_OUT_OF_RESOURCE: - ML_VERBOSE(10, ("Already on the list %p", pending_op)); - have_resources = false; - break; - default: - ML_ERROR(("Error happened %d", rc)); - return rc; - } - } - - ML_VERBOSE(10, ("Memsync done %p", coll_op)); - return OMPI_SUCCESS; -} - -static void mca_coll_ml_barrier_task_setup( - mca_coll_ml_task_status_t *task_status, - int index, mca_coll_ml_compound_functions_t *func) -{ - task_status->rt_num_dependencies = func->num_dependencies; - task_status->rt_num_dependent_tasks = func->num_dependent_tasks; - task_status->rt_dependent_task_indices = func->dependent_task_indices; -} - -static inline __opal_attribute_always_inline__ int 
mca_coll_ml_memsync_launch(mca_coll_ml_module_t *ml_module, - ompi_request_t **req, int bank_index) -{ - mca_coll_ml_collective_operation_progress_t *coll_op; - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_memsync_function, - NULL, NULL, 0, 0); - - assert(NULL != coll_op); - - ML_VERBOSE(10, ("Get coll request %p", coll_op)); - - coll_op->fragment_data.buffer_desc = NULL; - - /* Caching bank index for future memory recycling callback */ - coll_op->full_message.bank_index_to_recycle = bank_index; - - coll_op->fragment_data.current_coll_op = ML_MEMSYNC; - /* I don't want to define one more parameter, so under root - * we pass buffer index */ - coll_op->variable_fn_params.root = bank_index; - /* As well it's little bit ugly, since it is no wait for this request, - * in order to recycle it we have to set offset to some value > 1 */ - coll_op->fragment_data.offset_into_user_buffer = 1; - coll_op->variable_fn_params.buffer_index = MCA_COLL_ML_NO_BUFFER; - coll_op->variable_fn_params.sequence_num = -1; /* It should be safe to use -1 */ - /* Pointer to a coll finalize function */ - if (OPAL_LIKELY(ml_module->initialized)) { - coll_op->process_fn = mca_coll_ml_memsync_recycle_memory; - } else { - /* No post work on first call */ - coll_op->process_fn = NULL; - } - - ML_VERBOSE(10,("Memsync start %p", &coll_op)); - - return mca_coll_ml_generic_collectives_append_to_queue(coll_op, mca_coll_ml_barrier_task_setup); -} - -/** - * Non blocking memory syncronization - */ -int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index) -{ - int rc; - ompi_request_t *req; - - ML_VERBOSE(8, ("MEMSYNC start")); - - if (OPAL_UNLIKELY(0 == opal_list_get_size(&ml_module->active_bcols_list))) { - /* Josh's change: In the case where only p2p is active, we have no way - * to reset the bank release counters to zero, I am doing that here since it - * would actually be "correct" to do it outside of this conditional, however - * I suspect that reseting the value to zero elsewhere would result in corrupted - * flow for non-contiguous data types - */ - - /* nasty hack to ensure that resources are released in the single level - * ptp case. - */ - mca_coll_ml_collective_operation_progress_t dummy_coll; - - dummy_coll.coll_module = (mca_coll_base_module_t *) ml_module; - dummy_coll.fragment_data.current_coll_op = ML_MEMSYNC; - dummy_coll.full_message.bank_index_to_recycle = bank_index; - - /* Handling special case when memory syncronization is not required */ - rc = mca_coll_ml_memsync_recycle_memory(&dummy_coll); - if(OPAL_UNLIKELY(rc != OMPI_SUCCESS)){ - ML_ERROR(("Failed to flush the list.")); - return rc; - } - } else { - /* retain the communicator until the operation is finished. the communicator - * will be released by CHECK_AND_RECYCLE */ - OBJ_RETAIN(ml_module->comm); - - rc = mca_coll_ml_memsync_launch(ml_module, &req, bank_index); - if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { - ML_ERROR(("Failed to launch a barrier.")); - return rc; - } - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/ml/coll_ml_module.c b/ompi/mca/coll/ml/coll_ml_module.c deleted file mode 100644 index 05651a6c07..0000000000 --- a/ompi/mca/coll/ml/coll_ml_module.c +++ /dev/null @@ -1,3122 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2016 Los Alamos National Security, LLC. 
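/* A toy model of the in-order bank recycling driven by the
   buffer-recycling/memsync logic above: banks may fill in any order,
   but the sync (and reuse) must proceed bank 0, 1, 2, ... in sequence,
   so the ready flags defer out-of-order completions until the cursor
   reaches them. Globals and names are illustrative only. */
#include <stdbool.h>
#include <stdio.h>

#define NBANKS 4

static bool ready[NBANKS];
static int  cursor;                 /* cf. memsync_counter */

static void bank_full(int bank)
{
    ready[bank] = true;
    if (bank != cursor) {
        printf("bank %d full out of order, deferred\n", bank);
        return;
    }
    while (ready[cursor]) {         /* drain consecutive ready banks */
        ready[cursor] = false;
        printf("memsync bank %d\n", cursor);
        cursor = (cursor + 1) % NBANKS;
    }
}

int main(void)
{
    bank_full(1);   /* deferred         */
    bank_full(0);   /* syncs banks 0, 1 */
    bank_full(2);   /* syncs bank 2     */
    return 0;
}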
All rights - * reserved. - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Most of the description of the data layout is in the - * coll_ml_module.c file. - */ - -#include "ompi_config.h" - -#include -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/coll/base/base.h" -#include "ompi/mca/sbgp/base/base.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/patterns/comm/coll_ops.h" -#include "ompi/mca/coll/ml/coll_ml.h" - -#include "opal/util/argv.h" -#include "opal/datatype/opal_datatype.h" -#include "opal/util/output.h" -#include "opal/util/arch.h" -#include "opal/align.h" - -#include "coll_ml.h" -#include "coll_ml_inlines.h" -#include "coll_ml_select.h" -#include "coll_ml_custom_utils.h" -#include "coll_ml_allocation.h" - -static int coll_ml_parse_topology (sub_group_params_t *sub_group_meta_data, size_t sub_group_count, - int *list_of_ranks_in_all_subgroups, int level_one_size); - -/* #define NEW_LEADER_SELECTION */ - -struct ranks_proxy_t { - /* number of subgroups for which the rank is a proxy */ - int number_subgroups; - /* subgrou indecies */ - int *subgroup_index; -}; -typedef struct rank_proxy_t rank_proxy_t; - -#define PROVIDE_SUFFICIENT_MEMORY(ptr, dummy_ptr, ptr_size, unit_type, in_use, \ - n_to_add,n_to_grow) \ - do { \ - if ((in_use) + (n_to_add) > (ptr_size)) { \ - (dummy_ptr) = (unit_type *) \ - realloc(ptr, sizeof(unit_type) * ((ptr_size) + (n_to_grow))); \ - if (NULL != (dummy_ptr)) { \ - (ptr) = (dummy_ptr); \ - (ptr_size) += (n_to_grow); \ - } \ - } \ - } while (0) - -/* - * Local functions - */ - -static int ml_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm); - -static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, - ompi_communicator_t *comm); - -static void -mca_coll_ml_module_construct(mca_coll_ml_module_t *module) -{ - int index_topo, coll_i, st_i; - mca_coll_ml_topology_t *topo; - - memset ((char *) module + sizeof (module->super), 0, sizeof (*module) - sizeof (module->super)); - - /* It's critical to reset data_offset to zero */ - module->data_offset = -1; - - /* If the topology support zero level and no fragmentation was requested */ - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &module->topo_list[index_topo]; - topo->global_lowest_hier_group_index = -1; - topo->global_highest_hier_group_index = -1; - topo->number_of_all_subgroups = -1; - topo->n_levels = -1; - topo->all_bcols_mode = ~(0); /* set to all bits */ - topo->status = COLL_ML_TOPO_DISABLED; /* all topologies are not used by default */ - } - - for (coll_i = 0; coll_i < ML_NUM_OF_FUNCTIONS; coll_i++) { - for (st_i = 0; st_i < MCA_COLL_MAX_NUM_SUBTYPES; st_i++) { - module->collectives_topology_map[coll_i][st_i] = ML_UNDEFINED; - } - } - - for (coll_i = 0; coll_i < BCOL_NUM_OF_FUNCTIONS; ++coll_i) { - module->small_message_thresholds[coll_i] = BCOL_THRESHOLD_UNLIMITED; - } - - OBJ_CONSTRUCT(&module->active_bcols_list, opal_list_t); - OBJ_CONSTRUCT(&module->waiting_for_memory_list, opal_list_t); - OBJ_CONSTRUCT(&module->fragment_descriptors, opal_free_list_t); - OBJ_CONSTRUCT(&module->message_descriptors, opal_free_list_t); - 
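/* A minimal sketch of the grow-on-demand pattern behind the
   PROVIDE_SUFFICIENT_MEMORY macro above: when an append would overflow
   the current capacity, realloc through a scratch pointer so the
   original array survives a failed realloc. The growth step and data
   are illustrative. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int *arr = NULL, *tmp;
    size_t cap = 0, used = 0;
    const size_t grow = 4;

    for (int v = 0; v < 10; ++v) {
        if (used + 1 > cap) {                 /* need more room */
            tmp = realloc(arr, sizeof(*arr) * (cap + grow));
            if (NULL == tmp) { free(arr); return 1; }
            arr = tmp;                        /* commit only on success */
            cap += grow;
        }
        arr[used++] = v;
    }

    printf("stored %zu ints, capacity %zu\n", used, cap);
    free(arr);
    return 0;
}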
OBJ_CONSTRUCT(&module->coll_ml_collective_descriptors, opal_free_list_t); - - memset (&module->fallback, 0, sizeof (module->fallback)); -} - -#define ML_RELEASE_FALLBACK(_coll_ml, _coll) \ - do { \ - if (_coll_ml->fallback.coll_ ## _coll ## _module) { \ - OBJ_RELEASE(_coll_ml->fallback.coll_ ## _coll ## _module); \ - _coll_ml->fallback.coll_ ## _coll ## _module = NULL; \ - } \ - } while (0); - -static void -mca_coll_ml_module_destruct(mca_coll_ml_module_t *module) -{ - int i, j, k,fnc, index_topo; - mca_coll_ml_topology_t *topo; - - ML_VERBOSE(4, ("ML module destruct")); - - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &module->topo_list[index_topo]; - if (COLL_ML_TOPO_DISABLED == topo->status) { - /* skip the topology */ - continue; - } - - if (NULL != topo->component_pairs) { - for(i = 0; i < topo->n_levels; ++i) { - for(j = 0; j < topo->component_pairs[i].num_bcol_modules; ++j) { - OBJ_RELEASE(topo->component_pairs[i].bcol_modules[j]); - } - /* free the array of bcol module */ - free(topo->component_pairs[i].bcol_modules); - - OBJ_RELEASE(topo->component_pairs[i].subgroup_module); - } - - free(topo->component_pairs); - } - - /* gvm Leak FIX Free collective algorithms structure */ - for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { - if (NULL != topo->hierarchical_algorithms[fnc]){ - free(topo->hierarchical_algorithms[fnc]); - } - } - - /* free up the route vector memory */ - if (NULL != topo->route_vector) { - free(topo->route_vector); - } - /* free resrouce description */ - if(NULL != topo->array_of_all_subgroups) { - for( k=0 ; k < topo->number_of_all_subgroups ; k++ ) { - if(0 < topo->array_of_all_subgroups[k].n_ranks) { - free(topo->array_of_all_subgroups[k].rank_data); - topo->array_of_all_subgroups[k].rank_data = NULL; - } - } - free(topo->array_of_all_subgroups); - topo->array_of_all_subgroups = NULL; - } - if (NULL != topo->hier_layout_info) { - free(topo->hier_layout_info); - topo->hier_layout_info = NULL; - } - } - - OPAL_LIST_DESTRUCT(&(module->active_bcols_list)); - OBJ_DESTRUCT(&(module->waiting_for_memory_list)); - - /* gvm Leak FIX Remove fragment free list */ - OBJ_DESTRUCT(&(module->fragment_descriptors)); - OBJ_DESTRUCT(&(module->message_descriptors)); - /* push mca_bcol_base_memory_block_desc_t back on list manager */ - mca_coll_ml_free_block(module->payload_block); - /* release the cinvertor if it was allocated */ - if (NULL != module->reference_convertor) { - OBJ_RELEASE(module->reference_convertor); - } - - OBJ_DESTRUCT(&(module->coll_ml_collective_descriptors)); - - if (NULL != module->coll_ml_barrier_function) { - if (NULL != module->coll_ml_barrier_function->component_functions) { - free(module->coll_ml_barrier_function->component_functions); - module->coll_ml_barrier_function->component_functions = NULL; - } - free(module->coll_ml_barrier_function); - module->coll_ml_barrier_function = NULL; - } - - if (module->coll_ml_memsync_function) { - if (module->coll_ml_memsync_function->component_functions) { - free(module->coll_ml_memsync_function->component_functions); - module->coll_ml_memsync_function->component_functions = NULL; - } - free(module->coll_ml_memsync_function); - module->coll_ml_memsync_function = NULL; - } - - ml_coll_hier_allreduce_cleanup_new(module); - ml_coll_hier_allgather_cleanup(module); - ml_coll_hier_bcast_cleanup(module); - ml_coll_hier_reduce_cleanup(module); - - /* release saved collectives */ - ML_RELEASE_FALLBACK(module, allreduce); - ML_RELEASE_FALLBACK(module, allgather); - ML_RELEASE_FALLBACK(module, 
reduce); - ML_RELEASE_FALLBACK(module, bcast); - ML_RELEASE_FALLBACK(module, iallreduce); - ML_RELEASE_FALLBACK(module, iallgather); - ML_RELEASE_FALLBACK(module, ireduce); - ML_RELEASE_FALLBACK(module, ibcast); -} - - -static int mca_coll_ml_request_free(ompi_request_t** request) -{ - /* local variables */ - mca_coll_ml_collective_operation_progress_t *ml_request= - (mca_coll_ml_collective_operation_progress_t *)(*request); - mca_coll_ml_module_t *ml_module = OP_ML_MODULE(ml_request); - - /* The ML memory bank recycling check is done, now we may - * return the request and signal completion */ - - /* this fragment does not hold the message data, so it is ok to return */ - assert(0 == ml_request->pending); - //assert(0 == ml_request->fragment_data.offset_into_user_buffer); - assert(&ml_request->full_message == ml_request->fragment_data.message_descriptor); - assert(ml_request->dag_description.status_array[0].item.opal_list_item_refcount == 0); - ML_VERBOSE(10, ("Releasing Master %p", ml_request)); - /* Mark the request as invalid */ - OMPI_REQUEST_FINI(&ml_request->full_message.super); - opal_free_list_return (&(ml_module->coll_ml_collective_descriptors), - (opal_free_list_item_t *)ml_request); - - /* MPI needs to return with the request object set to MPI_REQUEST_NULL - */ - *request = MPI_REQUEST_NULL; - - return OMPI_SUCCESS; -} - -/* constructor for collective management descriptor */ -static void mca_coll_ml_collective_operation_progress_construct -(mca_coll_ml_collective_operation_progress_t *desc) { - - /* initialize pointer */ - desc->dag_description.status_array = NULL; - - OBJ_CONSTRUCT(&desc->full_message.send_convertor, opal_convertor_t); - OBJ_CONSTRUCT(&desc->full_message.recv_convertor, opal_convertor_t); - - OBJ_CONSTRUCT(&desc->full_message.dummy_convertor, opal_convertor_t); - - /* initialize the request free pointer */ - desc->full_message.super.req_free = mca_coll_ml_request_free; - - /* no cancel function */ - desc->full_message.super.req_cancel = NULL; - /* Collective request type */ - desc->full_message.super.req_type = OMPI_REQUEST_COLL; - /* RLG: Do we need to set req_mpi_object ?
*/ - - /* If not null , we have to release next fragment */ - desc->next_to_process_frag = NULL; - - /* pointer to previous fragment */ - desc->prev_frag = NULL; - - /* Pasha: moreinit */ - desc->pending = 0; -} - -/* destructor for collective managment descriptor */ -static void mca_coll_ml_collective_operation_progress_destruct -(mca_coll_ml_collective_operation_progress_t *desc) { - mca_coll_ml_module_t *ml_module = - (mca_coll_ml_module_t *) desc->coll_module; - - int i, max_dag_size = ml_module->max_dag_size; - - if (NULL != desc->dag_description.status_array) { - for (i = 0; i < max_dag_size; ++i) { - OBJ_DESTRUCT(&desc->dag_description.status_array[i].item); - } - - free(desc->dag_description.status_array); - desc->dag_description.status_array = NULL; - } - - OBJ_DESTRUCT(&desc->full_message.send_convertor); - OBJ_DESTRUCT(&desc->full_message.recv_convertor); - - OBJ_DESTRUCT(&desc->full_message.dummy_convertor); -} -/* initialize the full message descriptor - can pass in module specific - * initialization data - */ -static int init_ml_fragment_desc(opal_free_list_item_t *desc , void* ctx); -static int init_ml_message_desc(opal_free_list_item_t *desc , void* ctx) -{ - mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; - mca_coll_ml_descriptor_t *msg_desc = (mca_coll_ml_descriptor_t *) desc; - - /* finish setting up the fragment descriptor */ - init_ml_fragment_desc((opal_free_list_item_t*)&(msg_desc->fragment),module); - - return OPAL_SUCCESS; -} - -/* initialize the fragment descriptor - can pass in module specific - * initialization data - */ -static int init_ml_fragment_desc(opal_free_list_item_t *desc , void* ctx) -{ - mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; - mca_coll_ml_fragment_t *frag_desc = (mca_coll_ml_fragment_t *) desc; - - /* allocated array of function arguments */ - /* RLG - we have a problem if we don't get the memory */ - /* malloc-debug does not like zero allocations */ - if (module->max_fn_calls > 0) { - frag_desc->fn_args = (bcol_function_args_t *) - malloc(sizeof(bcol_function_args_t) * module->max_fn_calls); - } - - return OPAL_SUCCESS; -} -static void mca_coll_ml_bcol_list_item_construct(mca_coll_ml_bcol_list_item_t *item) -{ - item->bcol_module = NULL; -} -OBJ_CLASS_INSTANCE(mca_coll_ml_bcol_list_item_t, - opal_list_item_t, - mca_coll_ml_bcol_list_item_construct, - NULL); - -static void generate_active_bcols_list(mca_coll_ml_module_t *ml_module) -{ - int i, j, index_topo; - mca_coll_ml_topology_t *topo; - bool bcol_was_found; - mca_coll_ml_bcol_list_item_t *bcol_item = NULL; - mca_bcol_base_module_t *bcol_module = NULL; - - ML_VERBOSE(10, ("Generating active bcol list ")); - - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &ml_module->topo_list[index_topo]; - if (COLL_ML_TOPO_DISABLED == topo->status) { - /* skip the topology */ - continue; - } - for( i = 0; i < topo->n_levels; i++) { - - for( j = 0; j < topo->component_pairs[i].num_bcol_modules; j++) { - bcol_module = topo->component_pairs[i].bcol_modules[j]; - - /* Check if the bcol provides synchronization function, if the - * function is not provided we skip this bcol, since it isn't used - * for memory synchronization (for instance - ptpcoll )*/ - if (NULL == GET_BCOL_SYNC_FN(bcol_module)) { - ML_VERBOSE(10,(" No sync function was provided by bcol %s", - bcol_module->bcol_component->bcol_version.mca_component_name)); - continue; - } - - bcol_was_found = false; - for(bcol_item = (mca_coll_ml_bcol_list_item_t 
*)opal_list_get_first(&ml_module->active_bcols_list); - !bcol_was_found && - bcol_item != (mca_coll_ml_bcol_list_item_t *)opal_list_get_end(&ml_module->active_bcols_list); - bcol_item = (mca_coll_ml_bcol_list_item_t *)opal_list_get_next((opal_list_item_t *)bcol_item)) { - if (bcol_module == bcol_item->bcol_module) { - bcol_was_found = true; - } - } - - /* append the item to the list if it was not found */ - if (!bcol_was_found) { - bcol_item = OBJ_NEW(mca_coll_ml_bcol_list_item_t); - bcol_item->bcol_module = bcol_module; - opal_list_append(&ml_module->active_bcols_list, (opal_list_item_t *)bcol_item); - } - - } - } - } -} - -static int calculate_buffer_header_size(mca_coll_ml_module_t *ml_module) -{ - mca_coll_ml_topology_t *topo; - mca_bcol_base_module_t *bcol_module; - - uint32_t offset = 0; - int i, j, *ranks_in_comm, kount = 0, - rc, data_offset = 0, index_topo, - comm_size = ompi_comm_size(ml_module->comm); - - ML_VERBOSE(10, ("Calculating offset for the ML")); - - /* probably a stupid thing to do, but we have to loop over twice */ - - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &ml_module->topo_list[index_topo]; - if (COLL_ML_TOPO_DISABLED == topo->status) { - /* skip the topology */ - continue; - } - - for (i = 0; i < topo->n_levels; i++) { - for (j = 0; j < topo->component_pairs[i].num_bcol_modules; j++) { - bcol_module = topo->component_pairs[i].bcol_modules[j]; - if (0 < bcol_module->header_size) { - /* bump the kounter */ - kount++; - /* find the largest header request */ - if (offset < bcol_module->header_size) { - offset = bcol_module->header_size; - } - } - - /* Set bcol mode bits */ - topo->all_bcols_mode &= bcol_module->supported_mode; - } - } - - offset = OPAL_ALIGN(offset, BCOL_HEAD_ALIGN, uint32_t); - /* select largest offset between multiple topologies */ - if (data_offset < (int) offset) { - data_offset = (int) offset; - } - } - - ranks_in_comm = (int *) malloc(comm_size * sizeof(int)); - if (OPAL_UNLIKELY(NULL == ranks_in_comm)) { - ML_ERROR(("Memory allocation failed.")); - return OMPI_ERROR; - } - - for (i = 0; i < comm_size; ++i) { - ranks_in_comm[i] = i; - } - - rc = comm_allreduce_pml(&data_offset, &data_offset, 1, - MPI_INT, ompi_comm_rank(ml_module->comm), - MPI_MAX, comm_size, - ranks_in_comm, ml_module->comm); - free(ranks_in_comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("comm_allreduce_pml failed.")); - return OMPI_ERROR; - } - - ml_module->data_offset = (uint32_t) data_offset; - - ML_VERBOSE(10, ("The offset is %d", ml_module->data_offset)); - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_register_bcols(mca_coll_ml_module_t *ml_module) -{ - /* local variables */ - int i, j, index_topo; - int ret = OMPI_SUCCESS; - mca_bcol_base_module_t *bcol_module; - mca_coll_ml_topology_t *topo; - - /* loop over all bcols and register the ml memory block which each */ - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &ml_module->topo_list[index_topo]; - if (COLL_ML_TOPO_DISABLED == topo->status) { - /* skip the topology */ - continue; - } - - for (i = 0; i < topo->n_levels; i++) { - for (j = 0; j < topo->component_pairs[i].num_bcol_modules; j++) { - bcol_module = topo->component_pairs[i].bcol_modules[j]; - if (NULL != bcol_module->bcol_memory_init) { - ret = bcol_module->bcol_memory_init(ml_module->payload_block, - ml_module->data_offset, - bcol_module, - (NULL != bcol_module->network_context) ? 
- bcol_module->network_context->context_data: NULL); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("Bcol registration failed on ml level!!")); - return ret; - } - } - } - } - } - - return OMPI_SUCCESS; -} - -static int ml_module_memory_initialization(mca_coll_ml_module_t *ml_module) -{ - int ret; - int nbanks, nbuffers, buf_size; - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - - ml_module->payload_block = mca_coll_ml_allocate_block(cs,ml_module->payload_block); - - if (NULL == ml_module->payload_block) { - ML_VERBOSE(1, ("mca_coll_ml_allocate_block exited with error.")); - return OMPI_ERROR; - } - - /* get memory block parameters */ - nbanks = cs->n_payload_mem_banks; - nbuffers = cs->n_payload_buffs_per_bank; - buf_size = cs->payload_buffer_size; - - ML_VERBOSE(10, ("Call for initialize block.")); - - ret = mca_coll_ml_initialize_block(ml_module->payload_block, - nbuffers, nbanks, buf_size, ml_module->data_offset, - NULL); - if (OMPI_SUCCESS != ret) { - return ret; - } - - ML_VERBOSE(10, ("Call for register bcols.")); - - /* inititialize the memory with all of the bcols: - loop through the bcol modules and invoke the memory init */ - ret = mca_coll_ml_register_bcols(ml_module); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("mca_coll_ml_register_bcols returned an error.")); - /* goto CLEANUP; */ - return ret; - } - - return OMPI_SUCCESS; -} - -/* do some sanity checks */ -static int check_global_view_of_subgroups( int n_procs_selected, - int n_procs_in, int ll_p1, int* all_selected, - mca_sbgp_base_module_t *module ) -{ - /* local variables */ - int ret=OMPI_SUCCESS; - int i, sum; - - bool local_leader_found=false; - - /* is there a single local-leader */ - for (i = 0; i < n_procs_selected; i++) { - if( ll_p1 == -all_selected[module->group_list[i]]) { - /* found the local leader */ - if( local_leader_found ) { - /* more than one local leader - don't know how to - * handle this, so bail - */ - ML_VERBOSE(1, ("More than a single leader for a group.")); - ret=OMPI_ERROR; - goto exit_ERROR; - } else { - local_leader_found=true; - } - } - } - - /* check to make sure that all agree on the same size of - * the group - */ - sum=0; - for (i = 0; i < n_procs_in; i++) { - if(ll_p1==all_selected[i]) { - sum++; - } else if( ll_p1 == -all_selected[i]) { - sum++; - } - } - if( sum != n_procs_selected ) { - ML_VERBOSE(1, ("number of procs in the group unexpected. Expected %d Got %d",n_procs_selected,sum)); - ret=OMPI_ERROR; - goto exit_ERROR; - } - /* check to make sure that all have the same list of ranks. 
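These checks decode the +/-(root+1) encoding established during subgroup discovery: every rank publishes the root of its subgroup plus one, negated if it is the local leader, so a leader of the group rooted at rank 0 stays distinguishable from an unselected rank. A small sketch of the decode step (hypothetical helper, not OMPI API):

#include <stdio.h>
#include <stdlib.h>

static void decode_selection(int v, int *root, int *is_leader)
{
    *is_leader = (v < 0);        /* negative sign marks the local leader */
    *root = abs(v) - 1;          /* undo the +1 offset */
}

int main(void)
{
    int root, leader;
    decode_selection(-1, &root, &leader);   /* leader of the group rooted at 0 */
    printf("root=%d leader=%d\n", root, leader);
    decode_selection(4, &root, &leader);    /* plain member, root is rank 3 */
    printf("root=%d leader=%d\n", root, leader);
    return 0;
}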
- */ - for (i = 0; i < n_procs_selected; i++) { - if(ll_p1!=all_selected[module->group_list[i]] && - ll_p1!=-all_selected[module->group_list[i]] ) { - ret=OMPI_ERROR; - ML_VERBOSE(1, ("Mismatch in rank list - element #%d - %d ",i,all_selected[module->group_list[i]])); - goto exit_ERROR; - } - } - - /* return */ - return ret; - - exit_ERROR: - /* return */ - return ret; -} - -static int ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list) -{ - int *list_n_connected; - int group_size, rank, i, j, knt, offset, k, my_sbgp = 0; - int my_root, level_one_knt; - sub_group_params_t *array_of_all_subgroup_ranks = topo-> - array_of_all_subgroups; - int num_total_subgroups = topo->number_of_all_subgroups; - int n_hier = topo->n_levels; - - hierarchy_pairs *pair = NULL; - mca_coll_ml_leader_offset_info_t *loc_leader = (mca_coll_ml_leader_offset_info_t *) - malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1)); - - if (NULL == loc_leader) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* first thing I want to know is where does the first level end */ - level_one_knt = 0; - - while (level_one_knt < num_total_subgroups && 0 == array_of_all_subgroup_ranks[level_one_knt].level_in_hierarchy) { - level_one_knt++; - } - - /* fprintf(stderr,"PPP %d %d %d ", level_one_knt, array_of_all_subgroup_ranks[0].level_in_hierarchy, num_total_subgroups); */ - - /* I want to cache this number for unpack*/ - array_of_all_subgroup_ranks->level_one_index = level_one_knt; - - /* determine whether or not ranks are contiguous */ - topo->ranks_contiguous = true; - for (i = 0, knt = 0 ; i < level_one_knt && topo->ranks_contiguous ; ++i) { - for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks ; ++j, ++knt) { - if (knt != list_of_ranks_in_all_subgroups[knt]) { - topo->ranks_contiguous = false; - break; - } - } - } - - loc_leader[0].offset = 0; - - /* now find my first level offset, and my index in level one */ - for (i = 0, loc_leader[0].level_one_index = -1 ; i < level_one_knt ; ++i) { - offset = array_of_all_subgroup_ranks[i].index_of_first_element; - for (k = 0 ; k < array_of_all_subgroup_ranks[i].n_ranks ; ++k) { - rank = list_of_ranks_in_all_subgroups[k + offset]; - if (rank == my_rank_in_list) { - loc_leader[0].offset = offset; - loc_leader[0].level_one_index = k; - i = level_one_knt; - break; - } - } - } - - /* every rank MUST appear at level 0 */ - assert (loc_leader[0].level_one_index > -1); - - for (i = 0 ; i < n_hier ; ++i) { - pair = &topo->component_pairs[i]; - /* find the size of the group */ - group_size = pair->subgroup_module->group_size; - /* malloc some memory for the new list to cache - on the bcol module - */ - list_n_connected = (int *) calloc(group_size, sizeof (int)); - if (NULL == list_n_connected) { - free (loc_leader); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* next thing to do is to find out which subgroup I'm in - * at this particular level - */ - for (j = 0, knt = 0, my_sbgp = -1 ; j < num_total_subgroups && 0 > my_sbgp ; ++j) { - offset = array_of_all_subgroup_ranks[j].index_of_first_element; - - /* in the 1-level case just skip any group of size 1 and move on - * to the real group. 
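The ranks_contiguous test earlier in this function boils down to checking that the k-th entry of the flattened level-zero rank list equals k. The same test in isolation, with toy data:

#include <stdbool.h>
#include <stdio.h>

static bool ranks_contiguous(const int *flat, int n)
{
    for (int k = 0; k < n; ++k) {
        if (flat[k] != k) {
            return false;        /* rank out of place: not contiguous */
        }
    }
    return true;
}

int main(void)
{
    int a[] = { 0, 1, 2, 3 };
    int b[] = { 0, 2, 1, 3 };
    printf("a: %d, b: %d\n", ranks_contiguous(a, 4), ranks_contiguous(b, 4));
    return 0;
}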
*/ - if (1 == n_hier && 1 == array_of_all_subgroup_ranks[j].n_ranks) { - continue; - } - - for (k = 0; k < array_of_all_subgroup_ranks[j].n_ranks; k++) { - rank = list_of_ranks_in_all_subgroups[k+offset]; - /* we can not use the level_in_topology flag to determine the - * level since not all levels may be represented so keep a count - * of the number of times this ranks shows up. when it has been - * seen the correct number of times we are done. */ - if (rank == my_rank_in_list && ++knt == (i+1)){ - my_sbgp = j; - /* tag whether I am a local leader or not at this level */ - loc_leader[i].leader = (my_rank_in_list == array_of_all_subgroup_ranks[j].root_rank_in_comm); - break; - } - } - } - - /* should have found a subgroup */ - assert (my_sbgp > -1); - - for (j = 0 ; j < group_size ; ++j) { - list_n_connected[j] = array_of_all_subgroup_ranks[my_sbgp]. - rank_data[j].num_of_ranks_represented; - } - - /* now find all sbgps that the root of this sbgp belongs to - * previous to this "my_sbgp" */ - my_root = array_of_all_subgroup_ranks[my_sbgp].root_rank_in_comm; - - for (j = 0, knt = 0 ; j < my_sbgp ; ++j) { - if (array_of_all_subgroup_ranks[j].root_rank_in_comm == my_root) { - for (k = 1; k < array_of_all_subgroup_ranks[j].n_ranks; ++k) { - knt += array_of_all_subgroup_ranks[j].rank_data[k]. - num_of_ranks_represented; - } - - } - } - - /* and then I add one for the root itself */ - list_n_connected[0] = knt + 1; - - /* now cache this on the bcol module */ - pair->bcol_modules[0]->list_n_connected = list_n_connected; - - /* I should do one more round here and figure out my offset at this level - * the calculation is simple: Am I a local leader in this level? If so, then I keep the offset - * from the previous level. Else, I find out how "far away" the local leader is from me and set - * this as the new offset. - */ - /* do this after first level */ - if (i > 0) { - /* if I'm not the local leader */ - if( !loc_leader[i].leader) { - /* then I am not a local leader at this level */ - offset = array_of_all_subgroup_ranks[my_sbgp].index_of_first_element; - for (k = 0, knt = 0 ; k < array_of_all_subgroup_ranks[my_sbgp].n_ranks ; ++k) { - rank = list_of_ranks_in_all_subgroups[k+offset]; - if (rank == my_rank_in_list) { - break; - } - - knt += list_n_connected[k]; - } - loc_leader[i].offset = loc_leader[i-1].offset - knt; - } else { - /* if I am the local leader, then keep the same offset */ - loc_leader[i].offset = loc_leader[i-1].offset; - } - } - - pair->bcol_modules[0]->hier_scather_offset = loc_leader[i].offset; - - /*setup the tree */ - pair->bcol_modules[0]->k_nomial_tree(pair->bcol_modules[0]); - } - - /* see if I am in the last subgroup, if I am, - * then I am a root for the bcast operation - */ - offset = array_of_all_subgroup_ranks[n_hier - 1].index_of_first_element; - for( i = 0; i < array_of_all_subgroup_ranks[n_hier - 1].n_ranks; i++){ - rank = list_of_ranks_in_all_subgroups[i + offset]; - if( rank == my_rank_in_list ){ - loc_leader[n_hier - 1].offset = 0; - loc_leader[n_hier - 1].leader = true; - } - } - - /* set the last offset to 0 and set the leader according to your top level position */ - loc_leader[n_hier].offset = 0; - if(loc_leader[n_hier - 1].leader){ - loc_leader[n_hier].leader = true; - } else { - loc_leader[n_hier].leader = false; - } - - /* what other goodies do I want to cache on the ml-module? 
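The offset bookkeeping above is the subtle part of this routine: a rank that is not the local leader inherits the previous level's offset minus the number of ranks represented by the subgroup members that precede it. A standalone sketch of that arithmetic, with hypothetical data:

#include <stdio.h>

static int non_leader_offset(int prev_offset, const int *list_n_connected,
                             const int *group_ranks, int group_size, int my_rank)
{
    int knt = 0;
    for (int k = 0; k < group_size; ++k) {
        if (group_ranks[k] == my_rank) {
            break;                       /* found myself, stop counting */
        }
        knt += list_n_connected[k];      /* ranks represented ahead of me */
    }
    return prev_offset - knt;
}

int main(void)
{
    int ranks[] = { 4, 0, 2 };   /* subgroup members, as communicator ranks */
    int conn[]  = { 3, 1, 1 };   /* ranks each member represents */
    /* rank 2 sits behind members representing 3 + 1 ranks */
    printf("offset=%d\n", non_leader_offset(8, conn, ranks, 3, 2));   /* 4 */
    return 0;
}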
*/ - topo->hier_layout_info = loc_leader; - - return OMPI_SUCCESS; -} - -static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, - ompi_communicator_t *comm, - int my_highest_group_index, int *map_to_comm_ranks, - int *num_total_subgroups, sub_group_params_t **array_of_all_subgroup_ranks, - int **list_of_ranks_in_all_subgroups) -{ - - int ret = OMPI_SUCCESS; - int i, in_buf, root, my_rank,sum; - int in_num_total_subgroups = *num_total_subgroups; - int *scratch_space = NULL; - - /* figure out who holds all the sub-group information - only those - * ranks in the top level know this data at this point */ - my_rank = ompi_comm_rank(comm); - if( (my_highest_group_index == topo->global_highest_hier_group_index ) - && - ( my_rank == - topo->component_pairs[topo->n_levels-1].subgroup_module->group_list[0]) - ) { - in_buf=my_rank; - } else { - /* since this will be a sum allreduce - contributing 0 will not - * change the value */ - in_buf=0; - } - ret = comm_allreduce_pml(&in_buf, &root, 1, MPI_INT, - my_rank, MPI_SUM, - ompi_comm_size(comm), map_to_comm_ranks, - comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed. root reduction")); - goto exit_ERROR; - } - - /* broadcast the number of groups */ - ret=comm_bcast_pml(num_total_subgroups, root, 1, - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks,comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_bcast_pml failed. num_total_subgroups bcast")); - goto exit_ERROR; - } - - scratch_space=(int *)malloc(4*sizeof(int)*(*num_total_subgroups)); - if (OPAL_UNLIKELY(NULL == scratch_space)) { - ML_VERBOSE(10, ("Cannot allocate memory scratch_space.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - if( my_rank == root ) { - for(i=0 ; i < (*num_total_subgroups) ; i++ ) { - scratch_space[4*i]=(*array_of_all_subgroup_ranks)[i].root_rank_in_comm; - scratch_space[4*i+1]=(*array_of_all_subgroup_ranks)[i].n_ranks; - scratch_space[4*i+2]=(*array_of_all_subgroup_ranks)[i].index_of_first_element; - scratch_space[4*i+3]=(*array_of_all_subgroup_ranks)[i].level_in_hierarchy; - } - } - ret=comm_bcast_pml(scratch_space, root, 4*(*num_total_subgroups), - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks, comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed. 
scratch_space bcast")); - goto exit_ERROR; - } - if( my_rank != root ) { - if( in_num_total_subgroups != (*num_total_subgroups) ) { - /* free old array_of_all_subgroup_ranks array - need to fill it - * with the global data - assume that if the array size is the - * same, all data is correct, and in the same order */ - free((*array_of_all_subgroup_ranks)); - (*array_of_all_subgroup_ranks)=(sub_group_params_t *) - malloc(sizeof(sub_group_params_t)*(*num_total_subgroups)); - if (OPAL_UNLIKELY(NULL == (*array_of_all_subgroup_ranks))) { - ML_VERBOSE(10, ("Cannot allocate memory array_of_all_subgroup_ranks.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - for(i=0 ; i < (*num_total_subgroups) ; i++ ) { - (*array_of_all_subgroup_ranks)[i].root_rank_in_comm=scratch_space[4*i]; - (*array_of_all_subgroup_ranks)[i].n_ranks=scratch_space[4*i+1]; - (*array_of_all_subgroup_ranks)[i].index_of_first_element=scratch_space[4*i+2]; - (*array_of_all_subgroup_ranks)[i].level_in_hierarchy=scratch_space[4*i+3]; - } - } - } - /* figure out how many entries in all the subgroups - ranks that apear - * in k subgroups appear k times in the list */ - sum=0; - for(i=0 ; i < (*num_total_subgroups) ; i++ ) { - sum+=(*array_of_all_subgroup_ranks)[i].n_ranks; - } - if( in_num_total_subgroups != (*num_total_subgroups) && sum > 0 ) { - (*list_of_ranks_in_all_subgroups)=(int *) - realloc((*list_of_ranks_in_all_subgroups),sizeof(int)*sum); - if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) { - ML_VERBOSE(10, ("Cannot allocate memory *list_of_ranks_in_all_subgroups.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - } - ret = comm_bcast_pml(*list_of_ranks_in_all_subgroups, root, sum, - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks, comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("Bcast failed for list_of_ranks_in_all_subgroups ")); - goto exit_ERROR; - } - - /* - * The data that is needed for a given rooted operation is: - * - subgroup,rank information for the source of the data. - * That is, which rank in the subgroup will recieve the - * data and distribute to the rest of the ranks. - * - the ranks that this data will be sent to. This is - * described by the ranks in the current subgroups, and - * the subroups for which each rank is a proxy for, - * recursively in the communication tree. - * - * The assumption is that data will be delived to each subgroup - * in an order, that is, all the data destined to subgroup rank 0 - * will appear 1st, then that for rank 1, etc. This implies that - * the data destined to rank 0, for example, will include the - * data for rank 0, as well as all the ranks that appear following - * it in the tree - in order. - * - * Proxies: A rank may be a proxy for more than a single subgroup. - * When a rank is proxy for more than a single subgroup, we - * maintain a fixed order of subgroups for which this is a - * proxy, with an assumption that the data for the first subgroup - * appears first in the list, then that for the second, etc. - * Since the data for the proxy (which is a member of this subgroup) - * appears only once in the data list, the assumption is that the - * proxy will be the root for this operation, and it is the first - * set of data in the data list. This means, that the data offset - * for the second ranks in each subgroup will include all the data - * for the previous subgroups, recursively. This lets us maintain - * the simple addressing scheme of contigous data per rank in - * the subcommunicator. 
- * - * The information needed for each rank in the subgroup are the - * group indices for which it is a proxy. - */ - /* - * fill in the vertecies in the hierarchichal communications graph - */ - - /* figure out how detailed connection information, so that we can - * can figure out how the data needs to be ordered for sending it - * though the tree in various collective algorithms that have per-rank - * data associated with them. - */ - - /* this function does a depth first traversal of the tree data and - * builds rank data and ensures that hierarchy level 0 is in the - * correct order for collective algorithms with per-rank data. - */ - coll_ml_parse_topology (*array_of_all_subgroup_ranks, *num_total_subgroups, - *list_of_ranks_in_all_subgroups, ompi_comm_size (comm)); - - /* The list of ranks in all subgroups is the same as the old sort list. This is the same - * order needed for both scatter and gather. */ - topo->sort_list = (*list_of_ranks_in_all_subgroups); - - /* return */ - exit_ERROR: - if (scratch_space) { - free(scratch_space); - } - - return ret; -} - -static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selected, - sub_group_params_t **sub_group_meta_data, - int *size_of_sub_group_meta_data, - int **list_of_ranks_in_all_subgroups, - int *size_of_list_of_ranks_in_all_subgroups, - int *num_ranks_in_list_of_ranks_in_all_subgroups, - int *num_total_subgroups, - int *map_to_comm_ranks, int level_in_hierarchy - ) { - - /* local data */ - int rc=OMPI_SUCCESS; - int rank_in_list,old_sg_size=(*num_total_subgroups); - int sg_index, array_id, offset, sg_id; - sub_group_params_t *dummy1 = NULL; - int32_t **dummy2 = NULL; - int32_t *dummy3 = NULL; - int32_t **temp = NULL; - int knt1 = 0, - knt2 = 0, - knt3 = 0; - - /* loop over all elements in the array of ranks selected, looking for - * newly selected ranks - these form the new subgroups */ - for(rank_in_list = 0 ; rank_in_list < size_of_all_selected ; rank_in_list++ ) { - int sg_root, current_rank_in_comm; - /* get root's rank in the communicator */ - sg_root=all_selected[rank_in_list]; - - if( 0 == sg_root ) { - /* this rank not selected - go to the next rank */ - continue; - } - - if( sg_root < 0 ) { - sg_root=-sg_root-1; - } else { - sg_root-=1; - } - - current_rank_in_comm=map_to_comm_ranks[rank_in_list]; - - /* loop over existing groups, and see if this is a member of a new group - * or if this group has already been found. 
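get_new_subgroup_data below leans heavily on the PROVIDE_SUFFICIENT_MEMORY macro defined near the top of this file. The essential pattern is a grow-on-demand realloc through a temporary pointer, so a failed allocation neither leaks nor clobbers the still-valid old buffer. A minimal sketch of the same pattern as a function:

#include <stdio.h>
#include <stdlib.h>

static int ensure_capacity(int **arr, int *capacity, int in_use,
                           int n_to_add, int n_to_grow)
{
    if (in_use + n_to_add > *capacity) {
        int *tmp = realloc(*arr, sizeof(int) * (*capacity + n_to_grow));
        if (NULL == tmp) {
            return -1;              /* old buffer is still valid and owned */
        }
        *arr = tmp;
        *capacity += n_to_grow;
    }
    return 0;
}

int main(void)
{
    int *a = NULL, cap = 0;
    for (int used = 0; used < 12; ++used) {
        if (0 != ensure_capacity(&a, &cap, used, 1, 5)) {
            return 1;
        }
        a[used] = used;
    }
    printf("capacity=%d last=%d\n", cap, a[11]);    /* capacity=15 last=11 */
    free(a);
    return 0;
}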
- */ - for (sg_index = old_sg_size, sg_id = -1 ; sg_index < (*num_total_subgroups) ; sg_index++) { - if ((*sub_group_meta_data)[sg_index].root_rank_in_comm == sg_root) { - /* add rank to the list */ - (*sub_group_meta_data)[sg_index].n_ranks++; - sg_id = sg_index; - break; - } - } - - if (-1 == sg_id) { - /* did not find existing sub-group, create new one */ - /* intialize new subgroup */ - PROVIDE_SUFFICIENT_MEMORY((*sub_group_meta_data), dummy1, - (*size_of_sub_group_meta_data), - sub_group_params_t, (*num_total_subgroups), 1, 5); - if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) { - ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - /* do this for the temporary memory slots */ - PROVIDE_SUFFICIENT_MEMORY(temp, dummy2, - knt1, int32_t *, knt2, 1, 5); - if (OPAL_UNLIKELY(NULL == temp)) { - ML_VERBOSE(10, ("Cannot allocate memory for temporary storage")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - (*sub_group_meta_data)[(*num_total_subgroups)].root_rank_in_comm = sg_root; - (*sub_group_meta_data)[(*num_total_subgroups)].n_ranks = 1; - - /* no need for this here - use a temporary ptr */ - temp[knt2]= - (int *)calloc(size_of_all_selected, sizeof(int)); - if (OPAL_UNLIKELY(NULL == temp[knt2] ) ){ - ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - sg_id = (*num_total_subgroups)++; - knt3 = ++knt2; - } else { - knt3 = sg_id - old_sg_size + 1; - } - - array_id = (*sub_group_meta_data)[sg_id].n_ranks-1; - temp[knt3-1][array_id] = current_rank_in_comm; - } - - /* linearize the data - one rank will ship this to all the other - * ranks the communicator - */ - /* make sure there is enough memory to hold the list */ - PROVIDE_SUFFICIENT_MEMORY((*list_of_ranks_in_all_subgroups),dummy3, - (*size_of_list_of_ranks_in_all_subgroups), - int, (*num_ranks_in_list_of_ranks_in_all_subgroups), - size_of_all_selected,size_of_all_selected); - if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) { - ML_VERBOSE(10, ("Cannot allocate memory for list_of_ranks_in_all_subgroups.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* loop over new subgroups */ - for( sg_id=old_sg_size ; sg_id < (*num_total_subgroups) ; sg_id++ ) { - offset=(*num_ranks_in_list_of_ranks_in_all_subgroups); - - (*sub_group_meta_data)[sg_id].index_of_first_element=offset; - - if ((*sub_group_meta_data)[sg_id].n_ranks && NULL == temp) { - return OMPI_ERROR; - } - - for( array_id=0 ; array_id < (*sub_group_meta_data)[sg_id].n_ranks ; - array_id++ ) { - (*list_of_ranks_in_all_subgroups)[offset+array_id]= - temp[sg_id-old_sg_size][array_id]; - } - (*num_ranks_in_list_of_ranks_in_all_subgroups)+= - (*sub_group_meta_data)[sg_id].n_ranks; - (*sub_group_meta_data)[sg_id].level_in_hierarchy=level_in_hierarchy; - /* this causes problems on XT5 starting at 6144 cores */ - free(temp[sg_id-old_sg_size]); - } - - /* clean up temporary storage */ - exit_ERROR: - if (NULL != temp) { - free(temp); - } - - /* return */ - return rc; -} - -static int topo_parse (sub_group_params_t *sub_group_meta_data, int index, int *dst, int *src, int *dst_offset) -{ - int src_offset = sub_group_meta_data[index].index_of_first_element; - int total_ranks_represented = 0, ranks_represented; - - if (0 == sub_group_meta_data[index].level_in_hierarchy) { - ML_VERBOSE(10, ("Copying data for index %d to %d. 
Ranks at this level: %d", index, *dst_offset, - sub_group_meta_data[index].n_ranks)); - - /* move level one subgroup data */ - memmove (dst + *dst_offset, src + src_offset, sizeof (int) * sub_group_meta_data[index].n_ranks); - - /* update the offset of this subgroup since it may have been moved */ - sub_group_meta_data[index].index_of_first_element = *dst_offset; - *dst_offset += sub_group_meta_data[index].n_ranks; - } - - ML_VERBOSE(10, ("Subgroup %d has %d ranks. level = %d", index, sub_group_meta_data[index].n_ranks, - sub_group_meta_data[index].level_in_hierarchy)); - - /* fill in subgroup ranks */ - sub_group_meta_data[index].rank_data=(rank_properties_t *) - malloc(sizeof(rank_properties_t) * sub_group_meta_data[index].n_ranks); - if (OPAL_UNLIKELY(NULL == sub_group_meta_data[index].rank_data)) { - ML_VERBOSE(10, ("Cannot allocate memory for rank_data ")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* recurse on all subgroups */ - for (int j = 0 ; j < sub_group_meta_data[index].n_ranks ; ++j) { - int rank = src[j + src_offset]; - int next_level; - - /* determine if this rank is the root of the subgroup */ - if (rank == sub_group_meta_data[index].root_rank_in_comm) { - sub_group_meta_data[index].root_index = j; - } - - sub_group_meta_data[index].rank_data[j].leaf = true; - sub_group_meta_data[index].rank_data[j].rank = rank; - - if (sub_group_meta_data[index].level_in_hierarchy) { - ML_VERBOSE(10, ("Looking for subgroup containing %d as root", rank)); - - for (next_level = index - 1 ; next_level >= 0 ; --next_level) { - if (rank == sub_group_meta_data[next_level].root_rank_in_comm) { - ML_VERBOSE(10, ("Subgroup %d has root %d", next_level, rank)); - break; - } - } - - /* all ranks are represented in the lowest level. this subgroup is not at the lowest level - * so it must be a root at a lower level */ - assert (next_level >= 0); - - /* not a leaf node */ - sub_group_meta_data[index].rank_data[j].leaf = false; - ranks_represented = topo_parse (sub_group_meta_data, next_level, dst, src, dst_offset); - if (0 > ranks_represented) { - return ranks_represented; - } - sub_group_meta_data[index].rank_data[j].num_of_ranks_represented = ranks_represented; - - total_ranks_represented += ranks_represented; - } else { - /* leaf node */ - sub_group_meta_data[index].rank_data[j].leaf = true; - sub_group_meta_data[index].rank_data[j].num_of_ranks_represented = 1; - - total_ranks_represented++; - } - - ML_VERBOSE(10, ("Group %d, level %d, index %d, rank %d represents %d ranks", index, - sub_group_meta_data[index].level_in_hierarchy, j, rank, - sub_group_meta_data[index].rank_data[j].num_of_ranks_represented)); - } - - return total_ranks_represented; -} - -/* put level one in leaf order */ -static int coll_ml_parse_topology (sub_group_params_t *sub_group_meta_data, size_t sub_group_count, - int *list_of_ranks_in_all_subgroups, int level_one_size) -{ - int *tmp_data; - int offset, rc; - - tmp_data = calloc (level_one_size, sizeof (int)); - if (NULL == tmp_data) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* do a DFS parse of the topology and ensure that level 1 is in the correct scatter/gather order */ - offset = 0; - rc = topo_parse (sub_group_meta_data, sub_group_count - 1, tmp_data, list_of_ranks_in_all_subgroups, &offset); - if (0 > rc) { - free (tmp_data); - return rc; - } - - /* all ranks in level one should be represented in the re-order buffer */ - assert (offset == level_one_size); - - /* copy re-ordered level 1 (0) */ - if (0 != offset) { - /* copy new level one data back into the list of all 
subgroups */ - memmove (list_of_ranks_in_all_subgroups, tmp_data, sizeof (int) * offset); - } - - free (tmp_data); - - return OMPI_SUCCESS; -} - -static int append_new_network_context(hierarchy_pairs *pair) -{ - int i; - int rc; - mca_coll_ml_lmngr_t *memory_manager = &mca_coll_ml_component.memory_manager; - bcol_base_network_context_t *nc = NULL; - - for (i = 0; i < pair->num_bcol_modules; i++) { - nc = pair->bcol_modules[i]->network_context; - if (NULL != nc) { - rc = mca_coll_ml_lmngr_append_nc(memory_manager, nc); - if (OMPI_SUCCESS != rc) { - return OMPI_ERROR; - } - /* caching the network context id on bcol */ - pair->bcol_modules[i]->context_index = nc->context_id; - } - } - - return OMPI_SUCCESS; -} - -static int ml_module_set_small_msg_thresholds(mca_coll_ml_module_t *ml_module) -{ - const mca_coll_ml_topology_t *topo_info; - mca_bcol_base_module_t *bcol_module; - hierarchy_pairs *pair; - - int i, j, rc, hier, *ranks_in_comm, n_hier, tp, - comm_size = ompi_comm_size(ml_module->comm); - - for (tp = 0; tp < COLL_ML_TOPO_MAX; ++tp) { - topo_info = &ml_module->topo_list[tp]; - if (COLL_ML_TOPO_DISABLED == topo_info->status) { - /* Skip the topology */ - continue; - } - - n_hier = topo_info->n_levels; - for (hier = 0; hier < n_hier; ++hier) { - pair = &topo_info->component_pairs[hier]; - - for (i = 0; i < pair->num_bcol_modules; ++i) { - bcol_module = pair->bcol_modules[i]; - - if (NULL != bcol_module->set_small_msg_thresholds) { - bcol_module->set_small_msg_thresholds(bcol_module); - } - - for (j = 0; j < BCOL_NUM_OF_FUNCTIONS; ++j) { - if (ml_module->small_message_thresholds[j] > - bcol_module->small_message_thresholds[j]) { - ml_module->small_message_thresholds[j] = - bcol_module->small_message_thresholds[j]; - } - } - } - - } - } - - ranks_in_comm = (int *) malloc(comm_size * sizeof(int)); - if (OPAL_UNLIKELY(NULL == ranks_in_comm)) { - ML_ERROR(("Memory allocation failed.")); - return OMPI_ERROR; - } - - for (i = 0; i < comm_size; ++i) { - ranks_in_comm[i] = i; - } - - rc = comm_allreduce_pml(ml_module->small_message_thresholds, - ml_module->small_message_thresholds, - BCOL_NUM_OF_FUNCTIONS, MPI_INT, - ompi_comm_rank(ml_module->comm), MPI_MIN, - comm_size, ranks_in_comm, ml_module->comm); - free(ranks_in_comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - ML_ERROR(("comm_allreduce_pml failed.")); - return OMPI_ERROR; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - int i, j, - ret = OMPI_SUCCESS; - int *ranks_map = NULL, - *bcols_in_use = NULL, - *bcols_in_use_all_ranks = NULL; - bool use_user_bufs, limit_size_user_bufs; - ssize_t length_ml_payload; - int64_t frag_size; - const mca_bcol_base_component_2_0_0_t *bcol_component = NULL; - mca_base_component_list_item_t *bcol_cli = NULL; - int bcol_index; - - /* If this assert fails, it means that you changed initialization - * order and the date offset , that is critical for this section of code, - * have not been initilized. - * DO NOT REMOVE THIS ASSERT !!! - */ - assert(ml_module->data_offset >= 0); - - /* need to figure out which bcol's are participating - * in the hierarchy across the communicator, so that we can set - * appropriate segmentation parameters. 
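The exchange that follows is effectively a bitwise OR across the communicator: each rank marks the bcols it instantiated with a 1, and a MAX reduction yields the set in use anywhere. A plain-C stand-in for what the comm_allreduce_pml call computes:

#include <stdio.h>

static void reduce_max(const int *mine, const int *theirs, int *out, int n)
{
    for (int i = 0; i < n; ++i) {
        out[i] = mine[i] > theirs[i] ? mine[i] : theirs[i];   /* OR on 0/1 flags */
    }
}

int main(void)
{
    int rank0[] = { 1, 0, 0 };   /* this rank instantiated bcol 0 only */
    int rank1[] = { 1, 0, 1 };   /* another rank also uses bcol 2      */
    int all[3];
    reduce_max(rank0, rank1, all, 3);
    printf("%d %d %d\n", all[0], all[1], all[2]);             /* 1 0 1 */
    return 0;
}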
- */ - bcols_in_use = (int *) calloc(2 * n_hierarchies, sizeof(int)); - if (OPAL_UNLIKELY(NULL == bcols_in_use)) { - ML_VERBOSE(10, ("Cannot allocate memory for bcols_in_use.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - /* setup pointers to arrays that will hold bcol parameters. Since - * given bcols are not instantiated in all processes, need to get this - * information from those ranks that have instantiated these - * parameters - */ - bcols_in_use_all_ranks = bcols_in_use+n_hierarchies; - - /* get list of bcols that I am using */ - for (j = 0; j < COLL_ML_TOPO_MAX; j++) { - mca_coll_ml_topology_t *topo_info = &ml_module->topo_list[j]; - if (COLL_ML_TOPO_DISABLED == topo_info->status) { - /* skip the topology */ - continue; - } - - for(i = 0; i < topo_info->n_levels; i++ ) { - int ind; - ind = topo_info->component_pairs[i].bcol_index; - bcols_in_use[ind] = 1; - } - } - - /* set one to one mapping */ - ranks_map = (int *) malloc(sizeof(int) * ompi_comm_size(ml_module->comm)); - if (NULL == ranks_map) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { - ranks_map[i] = i; - } - - /* reduce over all the ranks to figure out which bcols are - * participating at this level - */ - ret = comm_allreduce_pml(bcols_in_use, bcols_in_use_all_ranks, - n_hierarchies, MPI_INT, ompi_comm_rank(ml_module->comm), - MPI_MAX, ompi_comm_size(ml_module->comm), - ranks_map, ml_module->comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed. bcols_in_use reduction")); - goto exit_ERROR; - } - - /* - * figure out fragmentation parameters - */ - - /* size of ml buffer */ - length_ml_payload = mca_coll_ml_component.payload_buffer_size - ml_module->data_offset; - - /* figure out if data will be segmented for pipelining - - * for non-contiguous data will just use a fragment the size - * of the ml payload buffer */ - - /* check to see if any bcols impose a limit */ - limit_size_user_bufs = false; - use_user_bufs = true; - frag_size = length_ml_payload; - bcol_index = 0; - - OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) { - /* check to see if this bcol is being used */ - if (!bcols_in_use_all_ranks[bcol_index++]) { - /* not in use */ - continue; - } - - bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component; - - /* check to see if user buffers can be used */ - if (!bcol_component->can_use_user_buffers) { - /* need to use library buffers, so all will do this */ - use_user_bufs = false; - } - - /* figure out fragment size */ - if (bcol_component->max_frag_size != FRAG_SIZE_NO_LIMIT ){ - /* user buffers need to be limited in size */ - limit_size_user_bufs = true; - - if (0 == frag_size) { - /* nothing set yet */ - frag_size = bcol_component->max_frag_size; - } else if (frag_size < bcol_component->max_frag_size) { - /* stricter constraint on fragment size */ - frag_size = bcol_component->max_frag_size; - } - } - } - - if (!use_user_bufs || limit_size_user_bufs) { - /* we need to limit the user buffer size or use library buffers */ - ml_module->fragment_size = frag_size; - } else { - /* entire message may be processed in single chunk */ - ml_module->fragment_size = FRAG_SIZE_NO_LIMIT; - } - - /* for non-contiguous data - just use the ML buffers */ - ml_module->ml_fragment_size = length_ml_payload; - - /* set whether we can use user buffers */ - ml_module->use_user_buffers = use_user_bufs; - - ML_VERBOSE(10, ("Setting payload size to %d %d
[%d %d]", - ml_module->ml_fragment_size, length_ml_payload, - mca_coll_ml_component.payload_buffer_size, - ml_module->data_offset)); - - exit_ERROR: - if (NULL != ranks_map) { - free(ranks_map); - } - if (NULL != bcols_in_use) { - free(bcols_in_use); - } - - return ret; -} - -static int ml_discover_hierarchy(mca_coll_ml_module_t *ml_module) -{ - ompi_proc_t *my_proc = NULL; - - int n_hierarchies = 0, - i = 0, ret = OMPI_SUCCESS; - - int size_bcol_list, size_sbgp_list; - - size_bcol_list = opal_list_get_size(&mca_bcol_base_components_in_use); - size_sbgp_list = opal_list_get_size(&mca_sbgp_base_components_in_use); - - if ((size_bcol_list != size_sbgp_list) || size_sbgp_list < 1 || size_bcol_list < 1) { - ML_ERROR(("Error: (size of mca_bcol_base_components_in_use = %d)" - " != (size of mca_sbgp_base_components_in_use = %d) or zero.", - size_bcol_list, size_sbgp_list)); - return OMPI_ERROR; - } - - n_hierarchies = size_sbgp_list; - - my_proc = ompi_proc_local(); - /* create the converter, for current implementation we - support homogenius comunicators only */ - ml_module->reference_convertor = - opal_convertor_create(my_proc->super.proc_arch, 0); - - if (OPAL_UNLIKELY(NULL == ml_module->reference_convertor)) { - return OMPI_ERROR; - } - - /* Do loop over all supported hiearchies. - To Do. We would like to have mca parameter that will allow control list - of topolgies that user would like use. Right now we will run - */ - for (i = 0; i < COLL_ML_TOPO_MAX; i++) { - if (COLL_ML_TOPO_ENABLED == ml_module->topo_list[i].status) { - ret = mca_coll_ml_component.topo_discovery_fn[i](ml_module, n_hierarchies); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - } - } - - /* Local query for bcol header size */ - ret = calculate_buffer_header_size(ml_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - - /* Get BCOL tuning, like support for zero copy, fragment size, and etc. 
- * This query involves global synchronization over all processes */ - ret = mca_coll_ml_read_allbcols_settings(ml_module, n_hierarchies); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - /* Here is the safe point to call ml_module_memory_initialization; please - be very careful if you decide to move this around. */ - ret = ml_module_memory_initialization(ml_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - /* make sure to release just allocated memory */ - mca_coll_ml_free_block(ml_module->payload_block); - return ret; - } - - ret = ml_module_set_small_msg_thresholds(ml_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - /* make sure to release just allocated memory */ - mca_coll_ml_free_block(ml_module->payload_block); - return ret; - } - - { - /* Synchronization barrier to make sure that all sides finished - * registering the memory */ - int ret, i; - int *comm_ranks = NULL; - - comm_ranks = (int *)calloc(ompi_comm_size(ml_module->comm), sizeof(int)); - if (OPAL_UNLIKELY(NULL == comm_ranks)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { - comm_ranks[i] = i; - } - - ret = comm_allreduce_pml(&ret, &i, - 1, MPI_INT, ompi_comm_rank(ml_module->comm), - MPI_MIN, ompi_comm_size(ml_module->comm), comm_ranks, - ml_module->comm); - - free(comm_ranks); - - if (OMPI_SUCCESS != ret) { - ML_ERROR(("comm_allreduce - failed to collect max_comm data")); - return ret; - } - /* Barrier done */ - } - - return ret; -} - -static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - mca_coll_ml_topology_t *topo, int n_hierarchies, - const char *exclude_sbgp_name, const char *include_sbgp_name) -{ - /* local variables */ - char *ptr_output = NULL; - sbgp_base_component_keyval_t *sbgp_cli = NULL; - mca_base_component_list_item_t *bcol_cli = NULL; - hierarchy_pairs *pair = NULL; - - mca_sbgp_base_module_t *module = NULL; - ompi_proc_t **copy_procs = NULL, - *my_proc = NULL; - - const mca_sbgp_base_component_2_0_0_t *sbgp_component = NULL; - - - int i_hier = 0, n_hier = 0, ll_p1, bcol_index = 0, - n_procs_in = 0, group_index = 0, n_remain = 0, - i, j, ret = OMPI_SUCCESS, my_rank_in_list = 0, - n_procs_selected = 0, original_group_size = 0, i_am_done = 0, - local_leader, my_rank_in_subgroup, my_rank_in_remaining_list = 0, - my_rank_in_comm; - - int32_t my_lowest_group_index = -1, my_highest_group_index = -1; - - int *map_to_comm_ranks = NULL, *bcols_in_use = NULL; - - int32_t *all_selected = NULL, - *index_proc_selected = NULL; - - short all_reduce_buffer2_in[2]; - short all_reduce_buffer2_out[2]; - sub_group_params_t *array_of_all_subgroup_ranks=NULL; - /* this pointer should probably be an int32_t and not an int type */ - int32_t *list_of_ranks_in_all_subgroups=NULL; - int num_ranks_in_all_subgroups=0,num_total_subgroups=0; - int size_of_array_of_all_subgroup_ranks=0; - int size_of_list_of_ranks_in_all_subgroups=0; - int32_t in_allgather_value; - - if (NULL != exclude_sbgp_name && NULL != include_sbgp_name) { - ret = OMPI_ERROR; - goto exit_ERROR; - } - - ML_VERBOSE(10,("include %s exclude %s size %d", include_sbgp_name, exclude_sbgp_name, n_hierarchies)); - - /* allocate scratch space */ - all_selected = (int32_t *) calloc(ompi_comm_size(ml_module->comm), sizeof(int32_t)); - if (OPAL_UNLIKELY(NULL == all_selected)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - map_to_comm_ranks = (int *)
calloc(ompi_comm_size(ml_module->comm), sizeof(int)); - if (OPAL_UNLIKELY(NULL == map_to_comm_ranks)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* - ** obtain list of procs - */ - /* create private copy for manipulation */ - copy_procs = (ompi_proc_t **) calloc(ompi_comm_size(ml_module->comm), - sizeof(ompi_proc_t *)); - if (OPAL_UNLIKELY(NULL == copy_procs)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { - copy_procs[i] = ompi_comm_peer_lookup (ml_module->comm, i); - map_to_comm_ranks[i] = i; - } - - my_rank_in_comm = ompi_comm_rank (ml_module->comm); - n_procs_in = ompi_comm_size(ml_module->comm); - original_group_size = n_procs_in; - - /* setup information for all-reduce over out of band */ - index_proc_selected = (int32_t *) malloc(sizeof(int32_t) * n_procs_in); - if (OPAL_UNLIKELY(NULL == index_proc_selected)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* get my proc pointer - used to identify myself in the list */ - my_proc = ompi_proc_local(); - my_rank_in_list = ompi_comm_rank(ml_module->comm); - - topo->component_pairs = (hierarchy_pairs *) calloc(n_hierarchies, sizeof(hierarchy_pairs)); - if (OPAL_UNLIKELY(NULL == topo->component_pairs)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - n_hier = 0; - /* - * Algorithm for subgrouping: - * 1) Start with all the ranks in the communicator - * 2) iterate over all (exclusive) hierarchy selection rules - * A) Apply subgrouping function to the remaining set of ranks - * - After the call to subgrouping subgroup_module->group_list - * has the index of ranks selected, from the list or ranks - * passed in. - * - map_to_comm_ranks maintains the mapping of the remaining - * ranks, to their rank in the communicator - * B) Each rank initializes a scratch array the size of the - * remaining ranks to 0, and then fills in the entry that - * corresponds to itself only with the value -/+R. If the - * rank is the local leader for the subgroup, the value of -R - * is entered, other wise R is entered. R is the root of the - * selected subgroup plus 1, so that for rank 0, +R has a - * different value than -R. - * C) The vector is then reduced, with the results going to all - * ranks, over the list of remaining ranks. As a result, - * the ranks of a given subgroup will show up with the value R, - * for all but the local-leader, which will have the value of -R. - * This is also used for error checking. - * D) subgroup_module->group_list is changed to contain the ranks - * of each member of the group within the communicator. - * E) Local rank with the group is determined. - * F) the list or remaining ranks is compacted, removing all selected - * ranks that are not the local-leader of the group. - * map_to_comm_ranks is also compacted. - * 3) This is terminated once all ranks are selected. 
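Step 2F, the compaction of the remaining-ranks list, is what makes this iteration converge: selected non-leaders drop out, leaders and unselected ranks stay for the next level. A standalone sketch of that step using the encoding described above (leaders carry -R, members +R, unselected ranks 0):

#include <stdio.h>

static int compact(const int *all_selected, int *map_to_comm_ranks, int n,
                   int my_old_index, int *my_new_index)
{
    int n_remain = 0;
    *my_new_index = -1;
    for (int i = 0; i < n; ++i) {
        if (all_selected[i] > 0) {
            continue;                    /* selected non-leader: drop */
        }
        map_to_comm_ranks[n_remain] = map_to_comm_ranks[i];
        if (i == my_old_index) {
            *my_new_index = n_remain;    /* track my own new position */
        }
        n_remain++;
    }
    return n_remain;
}

int main(void)
{
    int sel[] = { -1, 1, 1, -4, 4 };     /* two groups, roots 0 and 3 */
    int map[] = { 0, 1, 2, 3, 4 };
    int me;
    int n = compact(sel, map, 5, 3, &me);
    printf("n_remain=%d my_index=%d keep: %d %d\n", n, me, map[0], map[1]);
    return 0;
}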
- */ - - /* loop over hierarchies */ - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_first(&mca_sbgp_base_components_in_use); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use); - - ML_VERBOSE(10, ("Loop over hierarchies.")); - - i_hier = 0; - while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){ - /* number of processes selected with this sbgp on all ranks */ - int global_n_procs_selected; - - /* silence clang warnings */ - assert (NULL != bcol_cli && NULL != sbgp_cli); - - /* - ** obtain the list of ranks in the current level - */ - - sbgp_component = (mca_sbgp_base_component_2_0_0_t *) sbgp_cli->component.cli_component; - - /* Skip excluded levels */ - if (NULL != exclude_sbgp_name) { - - ML_VERBOSE(10,("EXCLUDE compare %s to %s", include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)); - if(0 == strcmp(exclude_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)) { - /* take the next element */ - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); - continue; - } - } - - if (NULL != include_sbgp_name) { - ML_VERBOSE(10,("INCLUDE compare %s to %s", include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)); - if(0 != strcmp(include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)) { - /* take the next element */ - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); - continue; - } - } - - ML_VERBOSE(10,("Passed include %s exclude %s", include_sbgp_name, exclude_sbgp_name)); - - /* discover subgroup */ - ML_VERBOSE(10, ("Discover subgroup: hier level - %d.", i_hier)); - module = sbgp_component->select_procs(copy_procs, n_procs_in, - ml_module->comm, - sbgp_cli->key_value, &ptr_output); - if (NULL == module) { - /* no module created */ - n_procs_selected = 0; - /* We must continue and participate in the allgather. - * It's not clear that one can enter this conditional - * during "normal" execution. We need to review - * all modules. - */ - - /* THE CODE SNIPPET COMMENTED OUT BELOW IS DANGEROUS CODE THAT - * COULD RESULT IN A HANG - THE "CONTINUE" STATEMENT MAY RESULT IN - * RANKS BYPASSING THE ALLGATHER IN NON-SYMMETRIC CASES - */ - - /* - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); - continue; - */ - - /* Skipping subgroups of size one will cause these processes to be missed in list of level one - * indices. 
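The capitalized warning above deserves a concrete illustration: if any rank takes a path that bypasses a collective the other ranks enter, the job deadlocks. A deliberately broken MPI miniature of exactly that hazard; run on two or more ranks it will hang (the setup here is illustrative, not the ML code path):

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, one = 1, *all;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    all = (int *) malloc(size * sizeof(int));
    if (1 != rank) {
        /* BUG: rank 1 skips the collective, everyone else blocks forever */
        MPI_Allgather(&one, 1, MPI_INT, all, 1, MPI_INT, MPI_COMM_WORLD);
    }
    free(all);
    MPI_Finalize();
    return 0;
}

This is why the selection loop keeps unselected ranks in the allgather and has them contribute the neutral value 0 rather than skipping the call.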
*/ - } else if (NULL == module->group_list || (1 == module->group_size && i_hier)) { - /* bypass modules that have no group_list */ - n_procs_selected = 0; - OBJ_RELEASE(module); - module=NULL; - } else { - n_procs_selected = module->group_size; - } - - ML_VERBOSE(10, ("Hier level - %d; group size - %d", i_hier, n_procs_selected)); - - /* setup array indicating all procs that were selected */ - for (i = 0; i < n_procs_in; i++) { - index_proc_selected[i] = 0; - } - - /* figure out my rank in the subgroup */ - my_rank_in_subgroup=-1; - ll_p1=-1; - in_allgather_value = 0; - if (n_procs_selected) { - /* I need to contribute to the vector */ - for (group_index = 0; group_index < n_procs_selected; group_index++) { - /* set my rank within the group */ - if (map_to_comm_ranks[module->group_list[group_index]] == my_rank_in_comm) { - my_rank_in_subgroup=group_index; - module->my_index = group_index; - /* currently the indecies are still given in terms of - * the rank in the list of remaining ranks */ - my_rank_in_remaining_list=module->group_list[group_index]; - } - } - - if( -1 != my_rank_in_subgroup ) { - /* I am contributing to this subgroup */ - -#ifdef NEW_LEADER_SELECTION -#if 0 - int lleader_index; - /* Select the local leader */ - lleader_index = coll_ml_select_leader(ml_module,module, map_to_comm_ranks, - copy_procs,n_procs_selected); - - local_leader = map_to_comm_ranks[module->group_list[lleader_index]]; -#endif -#else - - /* local leader is rank within list or remaining ranks */ - local_leader = map_to_comm_ranks[module->group_list[0]]; - -#endif - ML_VERBOSE(10,("The local leader selected for hierarchy %d is rank %d ", - i_hier, local_leader)); - - ll_p1 = local_leader + 1; - if (local_leader == my_rank_in_comm) { - in_allgather_value = - index_proc_selected[my_rank_in_remaining_list] = -ll_p1; - } else { - in_allgather_value = - index_proc_selected[my_rank_in_remaining_list] = ll_p1; - } - } - } - - /* gather the information from all the other remaining ranks */ - ML_VERBOSE(10, ("Call for comm_allreduce_pml.")); - ret = comm_allgather_pml(&in_allgather_value, - all_selected, 1, MPI_INT, my_rank_in_list, - n_procs_in, map_to_comm_ranks ,ml_module->comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed.")); - goto exit_ERROR; - } - - /* do some sanity checks */ - if( -1 != my_rank_in_subgroup ) { - ret = check_global_view_of_subgroups(n_procs_selected, - n_procs_in, ll_p1, all_selected, module ); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("check_global_view_of_subgroups failed.")); - goto exit_ERROR; - } - } - - /* - ** change the list of procs stored on the module to ranks within - ** the communicator. 
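The loop below rewrites group_list in place, translating indices into the remaining-ranks list into communicator ranks through map_to_comm_ranks. The translation in isolation, with toy data:

#include <stdio.h>

int main(void)
{
    int map_to_comm_ranks[] = { 0, 3, 5, 9 };   /* remaining index -> comm rank */
    int group_list[]        = { 1, 3 };         /* selected remaining indices   */

    for (int i = 0; i < 2; ++i) {
        group_list[i] = map_to_comm_ranks[group_list[i]];   /* now comm ranks */
    }
    printf("%d %d\n", group_list[0], group_list[1]);        /* 3 9 */
    return 0;
}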
- */ - - ML_VERBOSE(10, ("Change the list of procs; hier level - %d.", i_hier)); - for (group_index = 0; group_index < n_procs_selected; group_index++) { - module->group_list[group_index] = map_to_comm_ranks[module->group_list[group_index]]; - /* set my rank within the group */ - if (module->group_list[group_index] == ompi_comm_rank(ml_module->comm)) { - module->my_index = group_index; - } - } - - /* - * accumulate data on the new subgroups created - */ - /*XXX*/ - global_n_procs_selected = num_ranks_in_all_subgroups; - ret = get_new_subgroup_data(all_selected, n_procs_in, - &array_of_all_subgroup_ranks, - &size_of_array_of_all_subgroup_ranks, - &list_of_ranks_in_all_subgroups, - &size_of_list_of_ranks_in_all_subgroups, - &num_ranks_in_all_subgroups, - &num_total_subgroups, map_to_comm_ranks,i_hier); - - if( OMPI_SUCCESS != ret ) { - ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d ",ret)); - goto exit_ERROR; - } - - /* the global number of processes selected at this level is the difference - * in the number of procs in all subgroups between this level and the - * last */ - global_n_procs_selected = num_ranks_in_all_subgroups - global_n_procs_selected; - - /* am I done ? */ - i_am_done=0; - if ( (all_selected[my_rank_in_list] == ll_p1) && - /* if I was not a member of any group, still need to continue */ - n_procs_selected ){ - i_am_done = 1; - } - /* get my rank in the list */ - n_remain = 0; - my_rank_in_list = -1; - for (i = 0; i < n_procs_in; i++) { - if (all_selected[i] > 0 ) { - /* this proc will not be used in the next hierarchy */ - continue; - } - /* reset my_rank_in_list, n_procs_in */ - copy_procs[n_remain] = copy_procs[i]; - map_to_comm_ranks[n_remain] = map_to_comm_ranks[i]; - - if (my_proc == copy_procs[n_remain]){ - my_rank_in_list = n_remain; - } - - n_remain++; - } - - /* check to make sure we did not get a size 1 group if more than - * one rank are still remaning to be grouped */ - if ((1 == n_procs_selected) && n_remain > 1) { - OBJ_RELEASE(module); - n_procs_selected = 0; - } - - if( 0 < n_procs_selected ) { - /* increment the level counter */ - pair = &topo->component_pairs[n_hier]; - - /* add this to the list of sub-group/bcol pairs in use */ - pair->subgroup_module = module; - pair->bcol_component = (mca_bcol_base_component_t *) - ((mca_base_component_list_item_t *) bcol_cli)->cli_component; - - pair->bcol_index = bcol_index; - - /* create bcol modules */ - ML_VERBOSE(10, ("Create bcol modules.")); - pair->bcol_modules = pair->bcol_component->collm_comm_query(module, &pair->num_bcol_modules); - /* failed to create a new module */ - if (OPAL_UNLIKELY(NULL == pair->bcol_modules)) { - ML_VERBOSE(10, ("Failed to create new modules.")); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - if (pair->bcol_component->need_ordering) { - topo->topo_ordering_info.num_bcols_need_ordering += pair->num_bcol_modules; - } - - /* Append new network contexts to our memory managment */ - ML_VERBOSE(10, ("Append new network contexts to our memory managment.")); - if (OPAL_UNLIKELY(OMPI_SUCCESS != append_new_network_context(pair))) { - ML_VERBOSE(10, ("Exit with error. 
- append new network context")); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - for (i = 0; i < pair->num_bcol_modules; ++i) { - /* set the starting sequence number */ - pair->bcol_modules[i]->squence_number_offset = - mca_coll_ml_component.base_sequence_number; - - /* cache the sub-group size */ - pair->bcol_modules[i]->size_of_subgroup= - module->group_size; - - /* set the bcol id */ - pair->bcol_modules[i]->bcol_id = (int16_t) bcol_index; - - /* Set bcol mode bits */ - topo->all_bcols_mode &= (( mca_bcol_base_module_t *) pair->bcol_modules[i])->supported_mode; - } - - /* - * set largest power of 2 for this group - */ - module->n_levels_pow2 = ml_fls(module->group_size); - /* silence a clang warning */ - assert (module->n_levels_pow2 > 0 && module->n_levels_pow2 < 32); - module->pow_2 = 1 << module->n_levels_pow2; - - n_hier++; - - if (-1 == my_lowest_group_index) { - my_lowest_group_index = bcol_index; - } - - my_highest_group_index = bcol_index; - } - - /* if n_remain is 1, and the communicator size is not 1, and module - ** is not NULL, I am done - */ - if ((1 == n_remain) && (1 < original_group_size) && - (NULL != module)) { - i_am_done = 1; - } - - /* am I done ? */ - if (1 == i_am_done) { - /* nothing more to do */ - goto SelectionDone; - } - - /* take the next element */ - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); - - /* if no processes were selected anywhere with this sbgp module don't bother - * incrementing the hierarchy index. this resolves issues where (for example) - * process binding is not enabled or supported. */ - if (global_n_procs_selected) { - /* The way initialization is currently written *all* ranks MUST appear - * in the first level (0) of the hierarchy. If any rank is not in the first - * level then the calculation of gather/scatter offsets will be wrong. - * NTH: DO NOT REMOVE this assert until this changes! */ - assert (i_hier || global_n_procs_selected == n_procs_in); - i_hier++; - } - - ++bcol_index; - - n_procs_in = n_remain; - } - - SelectionDone: - - if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { - for (j = 0; j < n_hier; ++j) { - pair = &topo->component_pairs[j]; - if (pair->bcol_component->need_ordering) { - for (i = 0; i < pair->num_bcol_modules; ++i) { - pair->bcol_modules[i]->next_inorder = &topo->topo_ordering_info.next_inorder; - } - } - } - } - - /* If I was not done, it means that we skipped all subgroups and no hierarchy was build */ - if (0 == i_am_done) { - - if (NULL != include_sbgp_name || NULL != exclude_sbgp_name) { - /* User explicitly asked for specific type of topology, which generates empty group */ - opal_show_help("help-mpi-coll-ml.txt", - "empty-sub-group", true, - NULL != include_sbgp_name ? include_sbgp_name : exclude_sbgp_name); - ret = OMPI_ERROR; - goto exit_ERROR; - } - - ML_VERBOSE(10, ("Constructing empty hierarchy")); - ret = OMPI_SUCCESS; - goto exit_ERROR; - } - - topo->n_levels = n_hier; - - /* Find lowest and highest index of the groups in this communicator. - ** This will be needed in deciding where in the hierarchical collective - ** sequence of calls these particular groups belong. - ** It is done with one allreduce call to save allreduce overhead. 
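The single-allreduce trick referenced in this comment folds a MIN and a MAX reduction into one MPI_MIN operation by negating the quantity whose maximum is wanted, exactly as the buffers below are loaded. Stated with the plain MPI API for clarity (the code itself goes through comm_allreduce_pml):

    /* One allreduce, two answers: out[0] is the global minimum and
     * -out[1] is the global maximum, since min(-x_i) == -max(x_i). */
    short in[2], out[2];
    in[0] = (short) my_lowest_group_index;    /* minimized directly   */
    in[1] = (short) -my_highest_group_index;  /* negated to get a max */
    MPI_Allreduce(in, out, 2, MPI_SHORT, MPI_MIN, comm);
    int global_lowest  = (int) out[0];
    int global_highest = (int) -out[1];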
- */ - all_reduce_buffer2_in[0] = (short)my_lowest_group_index; - all_reduce_buffer2_in[1] = (short)-my_highest_group_index; - /* restore map to ranks for the original communicator */ - for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { - map_to_comm_ranks[i] = i; - } - - ret = comm_allreduce_pml(all_reduce_buffer2_in, all_reduce_buffer2_out, - 2, MPI_SHORT, ompi_comm_rank(ml_module->comm), - MPI_MIN, original_group_size, - map_to_comm_ranks, ml_module->comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed. all_reduce_buffer2_in reduction")); - goto exit_ERROR; - } - - topo->global_lowest_hier_group_index = all_reduce_buffer2_out[0]; - topo->global_highest_hier_group_index = -all_reduce_buffer2_out[1]; - - ML_VERBOSE(10, ("The lowest index and highest index was successfully found.")); - - ML_VERBOSE(10, ("ml_discover_hierarchy done, n_levels %d lowest_group_index %d highest_group_index %d," - " original_group_size %d my_lowest_group_index %d my_highest_group_index %d", - topo->n_levels, topo->global_lowest_hier_group_index, - topo->global_highest_hier_group_index, - original_group_size, - my_lowest_group_index, - my_highest_group_index)); - - /* - * setup detailed subgroup information - */ - ret = ml_setup_full_tree_data(topo, ml_module->comm, my_highest_group_index, - map_to_comm_ranks,&num_total_subgroups,&array_of_all_subgroup_ranks, - &list_of_ranks_in_all_subgroups); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed: bcols_in_use reduction %d ",ret)); - goto exit_ERROR; - } - - /* cache the ML hierarchical description on the tree */ - topo->number_of_all_subgroups = num_total_subgroups; - topo->array_of_all_subgroups = array_of_all_subgroup_ranks; - - ret = ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm)); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - goto exit_ERROR; - } - - /* Set the route table if know-root type of algorithms is used */ - if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) { - ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("mca_coll_ml_fill_in_route_tab returned an error.")); - goto exit_ERROR; - } - } - - /* - ** If all ranks are selected, there will be a single rank that remains - - ** the root of the last group. Check to make sure that all ranks are - ** selected, and if not, return an error. We can't handle the collectives - ** correctly with this module. 
- */ - - exit_ERROR: - - ML_VERBOSE(10, ("Discovery done")); - - /* free temp resources */ - if (NULL != all_selected) { - free(all_selected); - all_selected = NULL; - } - - if (NULL != copy_procs) { - free(copy_procs); - copy_procs = NULL; - } - - if (NULL != map_to_comm_ranks) { - free(map_to_comm_ranks); - map_to_comm_ranks = NULL; - } - - if (NULL != index_proc_selected) { - free(index_proc_selected); - index_proc_selected = NULL; - } - - if (NULL != bcols_in_use) { - free(bcols_in_use); - bcols_in_use = NULL; - } - - if (NULL != list_of_ranks_in_all_subgroups) { - free(list_of_ranks_in_all_subgroups); - list_of_ranks_in_all_subgroups = NULL; - } - - return ret; -} - -void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module, - const mca_bcol_base_component_2_0_0_t *bcol_component) -{ - int op, dt, et; - - for (op = 0; op < OMPI_OP_NUM_OF_TYPES; ++op) { - for (dt = 0; dt < OMPI_DATATYPE_MAX_PREDEFINED; ++dt) { - for (et = 0; et < BCOL_NUM_OF_ELEM_TYPES; ++et) { - ml_module->allreduce_matrix[op][dt][et] = - bcol_component->coll_support(op, dt, et); - } - } - } -} - -int mca_coll_ml_fulltree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_FULL], - n_hierarchies, NULL, NULL); -} - -int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - mca_base_component_list_item_t *bcol_cli; - const mca_bcol_base_component_2_0_0_t *bcol_component; - - sbgp_base_component_keyval_t *sbgp_cli; - const mca_sbgp_base_component_2_0_0_t *sbgp_component; - - sbgp_cli = (sbgp_base_component_keyval_t *) - opal_list_get_first(&mca_sbgp_base_components_in_use); - - OPAL_LIST_FOREACH(bcol_cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) { - bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component; - - /* silence false-positive clang warning */ - assert (NULL != sbgp_cli); - - if (NULL != bcol_component->coll_support_all_types && - !bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) { - mca_base_component_list_item_t *bcol_cli_next; - const mca_bcol_base_component_2_0_0_t *bcol_component_next; - - bcol_cli_next = (mca_base_component_list_item_t *) - opal_list_get_next((opal_list_item_t *) bcol_cli); - - mca_coll_ml_component.need_allreduce_support = true; - mca_coll_ml_allreduce_matrix_init(ml_module, bcol_component); - - sbgp_component = (mca_sbgp_base_component_2_0_0_t *) - sbgp_cli->component.cli_component; - - ML_VERBOSE(10, ("Topology build: sbgp %s will be excluded.", - sbgp_component->sbgp_version.mca_component_name)); - - - /* If there isn't additional component supports all types => print warning */ - if (1 == opal_list_get_size(&mca_bcol_base_components_in_use) || - (opal_list_item_t *) bcol_cli_next == - opal_list_get_end(&mca_bcol_base_components_in_use)) { - opal_show_help("help-mpi-coll-ml.txt", - "allreduce-not-supported", true, - bcol_component->bcol_version.mca_component_name); - - } else { - bcol_component_next = (mca_bcol_base_component_2_0_0_t *) - bcol_cli_next->cli_component; - - if (NULL != bcol_component_next->coll_support_all_types && - !bcol_component_next->coll_support_all_types(BCOL_ALLREDUCE)) { - - opal_show_help("help-mpi-coll-ml.txt", - "allreduce-alt-nosupport", true, - bcol_component->bcol_version.mca_component_name); - - } - } - - return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_ALLREDUCE], - n_hierarchies, 
sbgp_component->sbgp_version.mca_component_name, NULL); - } - - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - } - - return OMPI_SUCCESS; -} - -int mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_NBS], - n_hierarchies, "basesmsocket", NULL); -} - -int mca_coll_ml_fulltree_ptp_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_SINGLE_PTP], - n_hierarchies, NULL, "p2p"); -} - -int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) -{ - return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_SINGLE_IBOFFLOAD], - n_hierarchies, NULL, "ibnet"); -} - -#define IS_REACHABLE 1 -#define IS_NOT_REACHABLE -1 - -static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_communicator_t *comm) -{ - int i, rc, level, comm_size = 0, - my_rank = ompi_comm_rank(comm); - - int32_t **route_table = NULL; - int32_t *all_reachable_ranks = NULL; - mca_sbgp_base_module_t *sbgp_group = NULL; - comm_size = ompi_comm_size(comm); - - all_reachable_ranks = (int32_t *) malloc(comm_size * sizeof(int32_t)); - if (NULL == all_reachable_ranks) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - for (i = 0; i < comm_size; ++i) { - all_reachable_ranks[i] = IS_NOT_REACHABLE; - } - - route_table = (int32_t **) calloc(topo->n_levels, sizeof(int32_t *)); - if (NULL == route_table) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - topo->route_vector = (mca_bcol_base_route_info_t *) - calloc(comm_size, sizeof(mca_bcol_base_route_info_t)); - if (NULL == topo->route_vector) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - all_reachable_ranks[my_rank] = IS_REACHABLE; - - for (level = 0; level < topo->n_levels; ++level) { - sbgp_group = topo->component_pairs[level].subgroup_module; - - route_table[level] = (int32_t *) malloc(comm_size * sizeof(int32_t)); - if (NULL == route_table[level]) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - rc = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - for (i = 0; i < comm_size; ++i) { - if (IS_NOT_REACHABLE != all_reachable_ranks[i]) { - all_reachable_ranks[i] = sbgp_group->my_index; - } - } - - rc = comm_allreduce_pml(all_reachable_ranks, - route_table[level], - comm_size, - MPI_INT, sbgp_group->my_index, - MPI_MAX, sbgp_group->group_size, - sbgp_group->group_list, - comm); - if (OMPI_SUCCESS != rc) { - ML_VERBOSE(10, ("comm_allreduce failed.")); - goto exit_ERROR; - } - - for (i = 0; i < comm_size; ++i) { - if (IS_NOT_REACHABLE != - route_table[level][i]) { - all_reachable_ranks[i] = IS_REACHABLE; - } - } - } - - assert(0 < level); - - /* If there are unreachable ranks => - reach them through leader of my upper layer */ - for (i = 0; i < comm_size; ++i) { - if (IS_NOT_REACHABLE == - route_table[level - 1][i]) { - route_table[level - 1][i] = 0; - } - } - - free(all_reachable_ranks); - - for (i = 0; i < comm_size; ++i) { - for (level = 0; level < topo->n_levels; ++level) { - if (IS_NOT_REACHABLE != route_table[level][i]) { - topo->route_vector[i].level = level; - 
topo->route_vector[i].rank = route_table[level][i]; - break; - } - } - } - -#if OPAL_ENABLE_DEBUG -#define COLL_ML_ROUTE_BUFF_SIZE (1024*1024) - /* Only bother creating the string if we're actually going to - print it out (i.e., if the verbose level is >= 10) */ - if (mca_coll_ml_component.verbose >= 10) { - int ii, jj; - char *buff, *output; - - output = buff = calloc(1, COLL_ML_ROUTE_BUFF_SIZE); - assert(NULL != output); - - sprintf(output, "ranks: "); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - - for(ii = 0; ii < comm_size; ++ii) { - sprintf(output, " %2d", ii); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - } - - for (ii = 0; ii < topo->n_levels; ++ii) { - sprintf(output, "\nlevel: %d ", ii); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - for(jj = 0; jj < comm_size; ++jj) { - sprintf(output, " %2d", route_table[ii][jj]); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - } - } - - sprintf(output, "\n\nThe vector is:\n============\nranks: "); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - - for(ii = 0; ii < comm_size; ++ii) { - sprintf(output, " %6d", ii); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - } - - sprintf(output, "\nlevel x rank: "); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - - for(ii = 0; ii < comm_size; ++ii) { - sprintf(output, " (%d, %d)", - topo->route_vector[ii].level, - topo->route_vector[ii].rank); - - output = buff + strlen(buff); - assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); - } - - ML_VERBOSE(10, ("\nThe table is:\n============%s", buff)); - free(buff); - } -#endif - - for (level = 0; level < topo->n_levels; ++level) { - free(route_table[level]); - } - - free(route_table); - - return OMPI_SUCCESS; - - exit_ERROR: - - ML_VERBOSE(10, ("Exit with error status - %d.", rc)); - if (NULL != route_table) { - for (level = 0; level < topo->n_levels; ++level) { - if (NULL != route_table[level]) { - free(route_table[level]); - } - } - - free(route_table); - } - - free(all_reachable_ranks); - - return rc; -} - -static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) -{ - mca_coll_base_module_2_1_0_t *coll_base = &ml_module->super; - - int iboffload_used = - mca_coll_ml_check_if_bcol_is_used("iboffload", ml_module, COLL_ML_TOPO_MAX); - - /* initialize coll component function pointers */ - coll_base->coll_module_enable = ml_module_enable; - coll_base->ft_event = NULL; - - if (mca_coll_ml_component.disable_allgather) { - coll_base->coll_allgather = NULL; - coll_base->coll_iallgather = NULL; - } else { - coll_base->coll_allgather = mca_coll_ml_allgather; - coll_base->coll_iallgather = mca_coll_ml_allgather_nb; - } - - coll_base->coll_allgatherv = NULL; - - if (mca_coll_ml_component.use_knomial_allreduce) { - if (true == mca_coll_ml_component.need_allreduce_support) { - coll_base->coll_allreduce = mca_coll_ml_allreduce_dispatch; - coll_base->coll_iallreduce = mca_coll_ml_allreduce_dispatch_nb; - } else { - coll_base->coll_allreduce = mca_coll_ml_allreduce; - coll_base->coll_iallreduce = mca_coll_ml_allreduce_nb; - } - } else { - coll_base->coll_allreduce = NULL; - } - - coll_base->coll_alltoall = NULL; - coll_base->coll_ialltoall = NULL; - - coll_base->coll_alltoallv = NULL; - coll_base->coll_alltoallw = NULL; - - coll_base->coll_barrier = mca_coll_ml_barrier_intra; - - /* Use the 
sequential broadcast */ - if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) { - coll_base->coll_bcast = mca_coll_ml_bcast_sequential_root; - } else { - coll_base->coll_bcast = mca_coll_ml_parallel_bcast; - } - - coll_base->coll_exscan = NULL; - coll_base->coll_gather = NULL; - /* - coll_base->coll_gather = mca_coll_ml_gather; - */ - /* Current iboffload/ptpcoll version have no support for gather */ - if (iboffload_used || - mca_coll_ml_check_if_bcol_is_used("ptpcoll", ml_module, COLL_ML_TOPO_MAX)) { - coll_base->coll_gather = NULL; - } - - - coll_base->coll_gatherv = NULL; - if (mca_coll_ml_component.disable_reduce) { - coll_base->coll_reduce = NULL; - } else { - coll_base->coll_reduce = mca_coll_ml_reduce; - } - coll_base->coll_reduce_scatter = NULL; - coll_base->coll_scan = NULL; - coll_base->coll_scatter = NULL; -#if 0 - coll_base->coll_scatter = mca_coll_ml_scatter_sequential; -#endif - coll_base->coll_scatterv = NULL; - - coll_base->coll_iallgatherv = NULL; - coll_base->coll_ialltoallv = NULL; - coll_base->coll_ialltoallw = NULL; - coll_base->coll_ibarrier = mca_coll_ml_ibarrier_intra; - - coll_base->coll_ibcast = mca_coll_ml_parallel_bcast_nb; - coll_base->coll_iexscan = NULL; - coll_base->coll_igather = NULL; - coll_base->coll_igatherv = NULL; - coll_base->coll_ireduce = mca_coll_ml_reduce_nb; - coll_base->coll_ireduce_scatter = NULL; - coll_base->coll_iscan = NULL; - coll_base->coll_iscatter = NULL; - coll_base->coll_iscatterv = NULL; -} - -static int init_lists(mca_coll_ml_module_t *ml_module) -{ - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - int num_elements = cs->free_list_init_size; - int max_elements = cs->free_list_max_size; - int elements_per_alloc = cs->free_list_grow_size; - size_t length_payload = 0; - size_t length; - int ret; - - /* initialize full message descriptors - moving this to the - * module, as the fragment has resrouce requirements that - * are communicator dependent */ - /* no data associated with the message descriptor */ - - length = sizeof(mca_coll_ml_descriptor_t); - ret = opal_free_list_init(&(ml_module->message_descriptors), length, - opal_cache_line_size, OBJ_CLASS(mca_coll_ml_descriptor_t), - length_payload, 0, - num_elements, max_elements, elements_per_alloc, - NULL, 0, NULL, - init_ml_message_desc, ml_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_ERROR(("opal_free_list_init exit with error")); - return ret; - } - - /* initialize fragement descriptors - always associate one fragment - * descriptr with full message descriptor, so that we can minimize - * small message latency */ - - /* create a free list of fragment descriptors */ - /*length_payload=sizeof(something);*/ - length = sizeof(mca_coll_ml_fragment_t); - ret = opal_free_list_init (&(ml_module->fragment_descriptors), length, - opal_cache_line_size, OBJ_CLASS(mca_coll_ml_fragment_t), - length_payload, 0, - num_elements, max_elements, elements_per_alloc, - NULL, 0, NULL, - init_ml_fragment_desc, ml_module); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("opal_free_list_init exit with error")); - return ret; - } - - return OMPI_SUCCESS; -} - -static int check_for_max_supported_ml_modules(struct ompi_communicator_t *comm) -{ - int i, ret; - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - int *comm_ranks = NULL; - - comm_ranks = (int *)calloc(ompi_comm_size(comm), sizeof(int)); - if (OPAL_UNLIKELY(NULL == comm_ranks)) { - ML_VERBOSE(10, ("Cannot allocate memory.")); - return OMPI_ERR_OUT_OF_RESOURCE; - } - for (i = 0; i < ompi_comm_size(comm); i++) { - 
comm_ranks[i] = i; - } - - ret = comm_allreduce_pml(&cs->max_comm, &cs->max_comm, - 1 , MPI_INT, ompi_comm_rank(comm), - MPI_MIN, ompi_comm_size(comm), comm_ranks, - comm); - free(comm_ranks); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("comm_allreduce - failed to collect max_comm data")); - return ret; - } - - if (0 >= cs->max_comm || - ompi_comm_size(comm) < cs->min_comm_size) { - return OMPI_ERROR; - } else { - --cs->max_comm; - } - - return OMPI_SUCCESS; -} - -#if OPAL_ENABLE_DEBUG -#define DEBUG_ML_COMM_QUERY() \ - do { \ - static int verbosity_level = 5; \ - static int module_num = 0; \ - ML_VERBOSE(10, ("ML module - %p num %d for comm - %p, " \ - "comm size - %d, ML component prio - %d.", \ - ml_module, ++module_num, comm, ompi_comm_size(comm), *priority)); \ - /* For now I want to always print that we enter ML - \ - at the past there was an issue that we did not enter ML and actually run with tuned. \ - Still I do not want to print it for each module - only for the first. */ \ - ML_VERBOSE(verbosity_level, ("ML module - %p was successfully created", ml_module)); \ - verbosity_level = 10; \ - } while(0) - -#else -#define DEBUG_ML_COMM_QUERY() -#endif - -static int mca_coll_ml_need_multi_topo(int bcol_collective) -{ - mca_base_component_list_item_t *bcol_cli; - const mca_bcol_base_component_2_0_0_t *bcol_component; - - for (bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_first(&mca_bcol_base_components_in_use); - (opal_list_item_t *) bcol_cli != - opal_list_get_end(&mca_bcol_base_components_in_use); - bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_next((opal_list_item_t *) bcol_cli)) { - bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component; - if (NULL != bcol_component->coll_support_all_types && - !bcol_component->coll_support_all_types(bcol_collective)) { - return true; - } - } - - return false; -} - -/* We may call this function ONLY AFTER algorithm initialization */ -static int setup_bcast_table(mca_coll_ml_module_t *module) -{ - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - bool has_zero_copy; - - /* setup bcast index table */ - if (COLL_ML_STATIC_BCAST == cm->bcast_algorithm) { - module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_KNOWN; - - has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & - module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_KNOWN]->topo_info->all_bcols_mode); - - if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) { - module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_KNOWN; - } else if (!has_zero_copy) { - - opal_show_help("help-mpi-coll-ml.txt", - "fragmentation-disabled", true); - return OMPI_ERROR; - - } else { - module->bcast_fn_index_table[1] = ML_BCAST_LARGE_DATA_KNOWN; - } - } else { - module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_UNKNOWN; - - if (NULL == module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]) { - - opal_show_help("help-mpi-coll-ml.txt", - "static-bcast-disabled", true); - - return OMPI_ERROR; - } - - has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & - module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]->topo_info->all_bcols_mode); - - if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) { - module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_UNKNOWN; - } else if (!has_zero_copy) { - - opal_show_help("help-mpi-coll-ml.txt", - "fragmentation-disabled", true); - - return OMPI_ERROR; - } else { - /* If the topology support zero level and no fragmentation was requested */ - 
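Taken together, the branches in setup_bcast_table() treat cm->enable_fragmentation as a tri-state knob: 0 never fragments, 1 always fragments, and 2 fragments only when the selected topology lacks zero-copy support. A compact restatement, as a hypothetical helper that is not in the original source:

    #include <stdbool.h>

    /* Truth table for the bcast fragmentation decision above. */
    static bool use_fragmentation(int enable_fragmentation, bool has_zero_copy)
    {
        return 1 == enable_fragmentation ||
               (2 == enable_fragmentation && !has_zero_copy);
    }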
module->bcast_fn_index_table[1] = ML_BCAST_LARGE_DATA_UNKNOWN; - } - } - - return OMPI_SUCCESS; -} - -static void ml_check_for_enabled_topologies (int map[][MCA_COLL_MAX_NUM_SUBTYPES], mca_coll_ml_topology_t *topo_list) -{ - int coll_i, st_i; - for (coll_i = 0; coll_i < MCA_COLL_MAX_NUM_COLLECTIVES; coll_i++) { - for (st_i = 0; st_i < MCA_COLL_MAX_NUM_SUBTYPES; st_i++) { - if (map[coll_i][st_i] > -1) { - /* The topology is used, so set it to enabled */ - assert(map[coll_i][st_i] <= COLL_ML_TOPO_MAX); - topo_list[map[coll_i][st_i]].status = COLL_ML_TOPO_ENABLED; - } - } - } -} - -static void setup_default_topology_map(mca_coll_ml_module_t *ml_module) -{ - int i, j; - for (i = 0; i < MCA_COLL_MAX_NUM_COLLECTIVES; i++) { - for (j = 0; j < MCA_COLL_MAX_NUM_SUBTYPES; j++) { - ml_module->collectives_topology_map[i][j] = -1; - } - } - - ml_module->collectives_topology_map[ML_BARRIER][ML_BARRIER_DEFAULT] = COLL_ML_HR_FULL; - - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_SMALL_DATA_KNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_SMALL_DATA_UNKNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_SMALL_DATA_SEQUENTIAL] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_LARGE_DATA_KNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_LARGE_DATA_UNKNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_BCAST][ML_BCAST_LARGE_DATA_UNKNOWN] = COLL_ML_HR_FULL; - - ml_module->collectives_topology_map[ML_ALLGATHER][ML_SMALL_DATA_ALLGATHER] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_ALLGATHER][ML_LARGE_DATA_ALLGATHER] = COLL_ML_HR_FULL; - - ml_module->collectives_topology_map[ML_GATHER][ML_SMALL_DATA_GATHER] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_GATHER][ML_LARGE_DATA_GATHER] = COLL_ML_HR_FULL; - - ml_module->collectives_topology_map[ML_ALLTOALL][ML_SMALL_DATA_ALLTOALL] = COLL_ML_HR_SINGLE_IBOFFLOAD; - ml_module->collectives_topology_map[ML_ALLTOALL][ML_LARGE_DATA_ALLTOALL] = COLL_ML_HR_SINGLE_IBOFFLOAD; - - ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_ALLREDUCE] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_ALLREDUCE] = COLL_ML_HR_FULL; - - if (mca_coll_ml_need_multi_topo(BCOL_ALLREDUCE)) { - ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE] = COLL_ML_HR_ALLREDUCE; - ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE] = COLL_ML_HR_ALLREDUCE; - } - - ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_DATA_REDUCE] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_REDUCE][ML_LARGE_DATA_REDUCE] = COLL_ML_HR_FULL; - - - ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_SMALL_DATA_KNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_N_DATASIZE_BINS] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_SMALL_DATA_UNKNOWN] = COLL_ML_HR_FULL; - ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_SMALL_DATA_SEQUENTIAL] = COLL_ML_HR_FULL; -} - -#define GET_CF(I, J) (&mca_coll_ml_component.coll_config[I][J]); - -static void load_cached_config(mca_coll_ml_module_t *ml_module) -{ - int c_idx, m_idx, alg; - per_collective_configuration_t *cf = NULL; - - for (c_idx = 0; c_idx < ML_NUM_OF_FUNCTIONS; c_idx++) { - for (m_idx = 0; m_idx < ML_NUM_MSG; m_idx++) { - cf = GET_CF(c_idx, m_idx); - /* load topology tunings */ - if (ML_UNDEFINED 
!= cf->topology_id && - ML_UNDEFINED != cf->algorithm_id) { - alg = - cf->algorithm_id; - ml_module->collectives_topology_map[c_idx][alg] = - cf->topology_id; - } - } - } -} - -/* Pasha: In future I would suggest to convert this configuration to some sophisticated mca parameter or - even configuration file. On this stage of project I will set it statically and later we will change it - to run time parameter */ -static void setup_topology_coll_map(mca_coll_ml_module_t *ml_module) -{ - /* Load default topology setup */ - setup_default_topology_map(ml_module); - - /* Load configuration file */ - load_cached_config(ml_module); - - ml_check_for_enabled_topologies(ml_module->collectives_topology_map, ml_module->topo_list); -} - -/* query to see if the module is available for use on the given - * communicator, and if so, what it's priority is. This is where - * the backing shared-memory file is created. - */ -mca_coll_base_module_t * -mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) -{ - /* local variables */ - int ret = OMPI_SUCCESS; - - mca_coll_ml_module_t *ml_module = NULL; - mca_coll_ml_component_t *cs = &mca_coll_ml_component; - bool iboffload_was_requested = mca_coll_ml_check_if_bcol_is_requested("iboffload"); - - ML_VERBOSE(10, ("ML comm query start.")); - - /** - * No support for inter-communicator yet. - */ - if (OMPI_COMM_IS_INTER(comm)) { - *priority = -1; - return NULL; - } - - if (MPI_THREAD_MULTIPLE == ompi_mpi_thread_provided) { - ML_VERBOSE(10, ("coll:ml: MPI_THREAD_MULTIPLE not suppported; skipping this component")); - *priority = -1; - return NULL; - } - - - /* NTH: Disabled this check until we have a better one. */ -#if 0 - if (!ompi_rte_proc_is_bound) { - /* do not enable coll/ml unless this process is bound (for now) */ - *priority = -1; - return NULL; - } -#endif - - /** - * If it is inter-communicator and size is less than 2 we have specialized modules - * to handle the intra collective communications. - */ - if (OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) < 2) { - ML_VERBOSE(10, ("It is inter-communicator and size is less than 2.")); - *priority = -1; - return NULL; - } - - /** - * In current implementation we limit number of supported ML modules in cases when - * iboffload companent was requested - */ - if (iboffload_was_requested) { - ret = check_for_max_supported_ml_modules(comm); - if (OMPI_SUCCESS != ret) { - /* We have nothing to cleanup yet, so just return NULL */ - ML_VERBOSE(10, ("check_for_max_supported_ml_modules returns ERROR, return NULL")); - *priority = -1; - return NULL; - } - } - - ML_VERBOSE(10, ("Create ML module start.")); - - /* allocate and initialize an ml module */ - ml_module = OBJ_NEW(mca_coll_ml_module_t); - if (NULL == ml_module) { - return NULL; - } - - /* Get our priority */ - *priority = cs->ml_priority; - - /** Set initial ML values **/ - ml_module->comm = comm; - /* set the starting sequence number */ - ml_module->collective_sequence_num = cs->base_sequence_number; - ml_module->no_data_collective_sequence_num = cs->base_sequence_number; - /* initialize the size of the largest collective communication description */ - ml_module->max_fn_calls = 0; - -#ifdef NEW_LEADER_SELECTION - coll_ml_construct_resource_graphs(ml_module); -#endif - - /* Set topology - function map */ - setup_topology_coll_map(ml_module); - - /** - * This is the core of the function: - * setup communicator hierarchy - the ml component is available for - * caching information about the sbgp modules selected. 
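Later in mca_coll_ml_comm_query(), the Bruck's-algorithm temporary-buffer constant is derived from ceil(log2(comm_size)) computed with a doubling loop, then multiplied by ceil(comm_size / 2). In isolation, the logarithm computation looks like this (illustrative helper, not part of the original file):

    /* ceil(log2(n)) for n >= 1, via doubling. */
    static int ceil_log2(int n)
    {
        int count = 1, log = 0;
        while (count < n) {
            count <<= 1;
            ++log;
        }
        return log;
    }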
- */ - ret = ml_discover_hierarchy(ml_module); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(1, ("ml_discover_hierarchy exited with error.")); - goto CLEANUP; - } - - /* gvm Disabled for debuggin */ - ret = mca_coll_ml_build_filtered_fn_table(ml_module); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(1, ("mca_coll_ml_build_filtered_fn_table returned an error.")); - goto CLEANUP; - } - - /* Generate active bcols list */ - generate_active_bcols_list(ml_module); - - /* setup collective schedules - note that a given bcol may have more than - one module instantiated. We may want to use the same collective cap - capabilities over more than one set of procs. Each module will store - the relevant information for a given set of procs */ - ML_VERBOSE(10, ("Call for setup schedule.")); - ret = ml_coll_schedule_setup(ml_module); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(1, ("ml_coll_schedule_setup exit with error")); - goto CLEANUP; - } - - /* Setup bcast table */ - ML_VERBOSE(10, ("Setup bcast table")); - ret = setup_bcast_table(ml_module); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(1, ("setup_bcast_table exit with error")); - goto CLEANUP; - } - - ML_VERBOSE(10, ("Setup pointer to collectives calls.")); - init_coll_func_pointers(ml_module); - - ML_VERBOSE(10, ("Setup free lists")); - ret = init_lists(ml_module); - if (OMPI_SUCCESS != ret) { - goto CLEANUP; - } - - DEBUG_ML_COMM_QUERY(); - - /* Compute the bruck's buffer constant -- temp buffer requirements */ - { - int comm_size =ompi_comm_size(comm); - int count = 1, log_comm_size = 0; - - /* compute log of comm_size */ - while (count < comm_size) { - count = count << 1; - log_comm_size++; - } - - ml_module->brucks_buffer_threshold_const = - (comm_size / 2 + comm_size % 2) * (log_comm_size) ; - - - ml_module->log_comm_size = log_comm_size; - } - - if (iboffload_was_requested) { - /* HACK: Calling memory sync barrier first time to make sure - * that iboffload create qps for service barrier in right order, - * otherwise we may have deadlock and really nasty data corruptions. - * If you plan to remove this one - please talk to me first. - * Pasha. - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - Work around for deadlock caused by connection setup - for asyc service barrier. Asyc service barrier use own set of - MQ and QP _BUT_ the exchange operation uses the MQ that is used for - primary set of collectives operations like Allgahter, Barrier,etc. - As result exchange wait operation could be pushed to primary MQ and - cause dead-lock. - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- Create connection for service barrier and memory address exchange - for ml buffers and asyc service barrier - */ - ret = mca_coll_ml_memsync_intra(ml_module, 0); - if (OMPI_SUCCESS != ret) { - goto CLEANUP; - } - opal_progress(); - } - - /* The module is ready */ - ml_module->initialized = true; - - return &(ml_module->super); - - CLEANUP: - /* Vasily: RLG: Need to cleanup free lists */ - if (NULL != ml_module) { - OBJ_RELEASE(ml_module); - } - - return NULL; -} - -/* copied slightly modified from coll/hcoll */ -#define ML_SAVE_FALLBACK(_coll_ml, _coll) \ - do { \ - _coll_ml->fallback.coll_ ## _coll = comm->c_coll.coll_ ## _coll; \ - _coll_ml->fallback.coll_ ## _coll ## _module = comm->c_coll.coll_ ## _coll ## _module; \ - if (comm->c_coll.coll_ ## _coll && comm->c_coll.coll_ ## _coll ## _module) { \ - OBJ_RETAIN(_coll_ml->fallback.coll_ ## _coll ## _module); \ - } \ - } while(0) - -static void ml_save_fallback_colls (mca_coll_ml_module_t *coll_ml, - struct ompi_communicator_t *comm) -{ - memset (&coll_ml->fallback, 0, sizeof (coll_ml->fallback)); - /* save lower-priority collectives to handle cases not yet handled - * by coll/ml */ - ML_SAVE_FALLBACK(coll_ml, allreduce); - ML_SAVE_FALLBACK(coll_ml, allgather); - ML_SAVE_FALLBACK(coll_ml, reduce); - ML_SAVE_FALLBACK(coll_ml, bcast); - ML_SAVE_FALLBACK(coll_ml, iallreduce); - ML_SAVE_FALLBACK(coll_ml, iallgather); - ML_SAVE_FALLBACK(coll_ml, ireduce); - ML_SAVE_FALLBACK(coll_ml, ibcast); -} - -/* - * Init module on the communicator - */ -static int -ml_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm) -{ - /* local variables */ - char output_buffer[2 * MPI_MAX_OBJECT_NAME]; - - ml_save_fallback_colls ((mca_coll_ml_module_t *) module, comm); - - memset(&output_buffer[0], 0, sizeof(output_buffer)); - snprintf(output_buffer, sizeof(output_buffer), "%s (cid %d)", comm->c_name, - comm->c_contextid); - - ML_VERBOSE(10, ("coll:ml:enable: new communicator: %s.", output_buffer)); - - /* All done */ - return OMPI_SUCCESS; -} - -OBJ_CLASS_INSTANCE(mca_coll_ml_module_t, - mca_coll_base_module_t, - mca_coll_ml_module_construct, - mca_coll_ml_module_destruct); - -OBJ_CLASS_INSTANCE(mca_coll_ml_collective_operation_progress_t, - ompi_request_t, - mca_coll_ml_collective_operation_progress_construct, - mca_coll_ml_collective_operation_progress_destruct); diff --git a/ompi/mca/coll/ml/coll_ml_payload_buffers.h b/ompi/mca/coll/ml/coll_ml_payload_buffers.h deleted file mode 100644 index d4ac765342..0000000000 --- a/ompi/mca/coll/ml/coll_ml_payload_buffers.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_ML_PAYLOAD_BUFFERS_H -#define MCA_ML_PAYLOAD_BUFFERS_H - -#include "ompi/include/ompi/constants.h" -#include "opal/threads/mutex.h" - -struct buffer_t { - /* payload */ - void *payload; - - /* next payload buffer - need this because of wrap around, and - * because we want to allocate several buffers at once, but only - * manipulate one entry - */ - struct buffer_t *next_buffer; -}; -typedef struct buffer_t buffer_t; - -struct ml_buffers_t { - /* fifo size */ - int fifo_size; - - /* write index - next to allocate */ - int head_index; - opal_mutex_t head_lock; - - /* read index - next to free */ - int tail_index; - - /* number available - used to detect full queue */ - int n_segments_available; - - /* mask - assumes that fifo link is a power of 2 */ - int mask; - - /* fifo */ - buffer_t *fifo; -}; - -typedef struct ml_buffers_t ml_buffers_t; - -/* Initialization function */ - -static inline int ml_fifo_init( - int fifo_size, void *memory_chunk, size_t size_of_memory_chunk, - size_t segment_alignment, - size_t segment_size, ml_buffers_t *buffer_fifo) -{ - /* local variable */ - ptrdiff_t allocation_base, memory_chunk_ptr; - size_t memory_to_allocate, allocated_fifo_size, - allocated_segment_size, seg; - - /* make sure fifo size is power of 2, and round up if not - want - * efficient addressing */ - if( 0 >= fifo_size ) { - return OMPI_ERROR; - } - allocated_fifo_size=1; - while ( allocated_fifo_size < (size_t)fifo_size ) { - allocated_fifo_size*=2; - } - - /* set buffer size to match its alignment - round size up */ - allocated_segment_size=segment_size; - if( 0 >= segment_alignment ) { - /* multiples of alignmnet */ - allocated_segment_size=( (allocated_segment_size-1)/segment_alignment)+1; - allocated_segment_size=allocated_segment_size*segment_alignment; - } - - /* adjust base pointer to segment alignment */ - memory_chunk_ptr = (ptrdiff_t )memory_chunk; - allocation_base=( ( memory_chunk_ptr-1)/segment_alignment)+1; - allocation_base=allocated_segment_size*segment_alignment; - - /* check for input consistency */ - memory_to_allocate=size_of_memory_chunk-(allocation_base-memory_chunk_ptr); - if( (allocated_segment_size * allocated_fifo_size) < memory_to_allocate ) { - return OMPI_ERROR; - } - - /* allocate the fifo array */ - buffer_fifo->fifo=(buffer_t *)malloc(sizeof(buffer_t)*allocated_fifo_size); - if( NULL == buffer_fifo->fifo) { - return OMPI_ERROR; - } - - /* Initialize structure */ - for( seg=0 ; seg < allocated_fifo_size ; seg++ ) { - buffer_fifo->fifo[seg].payload= - (void *)(allocation_base+seg*allocated_segment_size); - } - for( seg=0 ; seg < allocated_fifo_size-1 ; seg++ ) { - buffer_fifo->fifo[seg].next_buffer= - &(buffer_fifo->fifo[seg+1]); - } - buffer_fifo->fifo[allocated_fifo_size-1].next_buffer= - &(buffer_fifo->fifo[0]); - - buffer_fifo->head_index=0; - buffer_fifo->tail_index=0; - buffer_fifo->n_segments_available=allocated_fifo_size; - buffer_fifo->fifo_size=allocated_fifo_size; - buffer_fifo->mask=buffer_fifo->fifo_size-1; - OBJ_CONSTRUCT(&(buffer_fifo->head_lock), opal_mutex_t); - - /* return */ - return OMPI_SUCCESS; -} - -/* - * Allocate several buffers. Either all requested buffers are allocated, - * or none are allocated. 
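Since this header defines the whole ring-buffer API inline, a usage sketch may help; the sizes and the backing chunk are illustrative, and in the real component the chunk would be registered, pre-aligned payload memory:

    ml_buffers_t fifo;
    static char chunk[32 * 1024];   /* stand-in for a registered region */

    if (OMPI_SUCCESS == ml_fifo_init(64 /* slots */, chunk, sizeof(chunk),
                                     64 /* alignment */, 512 /* segment */,
                                     &fifo)) {
        /* all-or-nothing allocation of 4 contiguous slots */
        buffer_t *bufs = ml_fifo_alloc_n_buffers(4, &fifo);
        if (NULL != bufs) {
            /* ... stage payload via bufs->payload, walk next_buffer ... */
            ml_fifo_return_n_buffers(4, &fifo);
        }
    }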
- */
-static inline buffer_t *ml_fifo_alloc_n_buffers(int n_to_allocate,
-                                                ml_buffers_t *buffer_fifo)
-{
-    /* local variables */
-    buffer_t *ret = NULL;
-
-    /* RLG - probably want to try a few times before giving up */
-    if (!OPAL_THREAD_TRYLOCK(&(buffer_fifo->head_lock))) {
-        if (buffer_fifo->n_segments_available >= n_to_allocate) {
-            ret = &(buffer_fifo->fifo[buffer_fifo->head_index]);
-            buffer_fifo->head_index = (buffer_fifo->head_index + n_to_allocate);
-            /* wrap around */
-            buffer_fifo->head_index &= buffer_fifo->mask;
-
-            buffer_fifo->n_segments_available -= n_to_allocate;
-        }
-        OPAL_THREAD_UNLOCK(&(buffer_fifo->head_lock));
-    } /* end of allocation */
-
-    return ret;
-}
-
-/* return buffers */
-static inline void ml_fifo_return_n_buffers(int n_to_return,
-                                            ml_buffers_t *buffer_fifo)
-{
-    OPAL_THREAD_LOCK(&(buffer_fifo->head_lock));
-
-    /* move tail pointer - RLG: Do we really need the tail pointer? */
-    buffer_fifo->tail_index = (buffer_fifo->tail_index + n_to_return);
-    /* wrap around */
-    buffer_fifo->tail_index &= buffer_fifo->mask;
-
-    /* adjust number of available buffers */
-    buffer_fifo->n_segments_available += n_to_return;
-
-    OPAL_THREAD_UNLOCK(&(buffer_fifo->head_lock));
-}
-
-#endif
-
diff --git a/ompi/mca/coll/ml/coll_ml_progress.c b/ompi/mca/coll/ml/coll_ml_progress.c
deleted file mode 100644
index 602331f785..0000000000
--- a/ompi/mca/coll/ml/coll_ml_progress.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-#include "ompi/mca/coll/ml/coll_ml.h"
-
-/*
- * This routine is used to progress a series of communication
- * primitives.
- *
- * Assumptions:
- * - A message is described by a message descriptor
- * - Each message has a setup function associated with it, which is
- *   algorithm specific. When a fragment is being prepared, this
- *   setup is used to prepare the arguments that will be passed into
- *   each routine called to complete a given function. The idea here
- *   is that when the progress routine is called, the full communication
- *   pattern has already been described by the setup function, with the
- *   progress function being generic.
- * - Each fragment is described by a fragment descriptor
- * - Each message descriptor has a fragment descriptor permanently
- *   associated with it.
- * - The message will be progressed as long as the individual
- *   functions complete. When an individual function does not
- *   complete, the current state is saved for a future
- *   restart.
- * - return status
- *   OMPI_COMPLETE: function completed
- *   OMPI_INCOMPLETE: need to continue progressing the function
- *   any other return value - error condition
- */
-
-int coll_ml_progress_individual_message(mca_coll_ml_fragment_t *frag_descriptor)
-{
-    /* local variables */
-    int fn_index, ret = OMPI_SUCCESS;
-    uint32_t n_frags_complete;
-    int starting_fn_index = frag_descriptor->current_fn_index;
-    coll_ml_collective_description_t *local_comm_description =
-        frag_descriptor->full_msg_descriptor->local_comm_description;
-
-    /* loop over functions */
-    for (fn_index = starting_fn_index; fn_index < local_comm_description->n_functions;
-         fn_index++) {
-        mca_bcol_base_module_t *bcol_module =
-            local_comm_description->functions[fn_index].bcol_module;
-        ret = (bcol_module->bcol_function_table[local_comm_description->functions[fn_index].fn_idx])
-            (&(frag_descriptor->fn_args[fn_index]), &local_comm_description->functions[fn_index]);
-        if (ML_OMPI_COMPLETE != ret) {
-            /* the function is incomplete, so decide what to do */
-            if (ML_OMPI_INCOMPLETE == ret) {
-                /* need to return to this later - mark where to continue */
-                frag_descriptor->current_fn_index = fn_index;
-                /* RLG - is this really best? The only advantage is that
-                 * if we exit the loop, we can assume the message is
-                 * complete.
-                 */
-                return OMPI_SUCCESS;
-            } else {
-                /* some sort of error condition */
-                frag_descriptor->current_fn_index = fn_index;
-                return ret;
-            }
-        }
-    }
-
-    /* looks like we are done */
-    /* increment counter for number of completed fragments */
-    n_frags_complete = OPAL_THREAD_ADD_SIZE_T(
-        &(frag_descriptor->full_msg_descriptor->frags_complete), 1);
-
-    /*
-     * release resources
-     */
-
-    /* fragment resources */
-
-    /* full message resources */
-    if (n_frags_complete == frag_descriptor->full_msg_descriptor->n_fragments)
-    {
-        /* free any fragments that still need to be freed.
-         * NOTE: at this level we do not handle any resources
-         * aside from the pre-registered buffers; all of these
-         * are handled at the bcol level */
-
-        /* return the buffers to the ml free list */
-
-        /* mark as complete - so MPI can complete.
-         * The message descriptor will be freed by a call
-         * to mpi_test/mpi_wait/... as the message descriptor
-         * also holds the mpi request object */
-    }
-
-    return OMPI_SUCCESS;
-}
diff --git a/ompi/mca/coll/ml/coll_ml_reduce.c b/ompi/mca/coll/ml/coll_ml_reduce.c
deleted file mode 100644
index cfec0743a7..0000000000
--- a/ompi/mca/coll/ml/coll_ml_reduce.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2015 Research Organization for Information Science
- *                    and Technology (RIST). All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file */ - -#include "ompi_config.h" - -#include "ompi/constants.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/coll/coll.h" -#include "ompi/mca/bcol/bcol.h" -#include "opal/sys/atomic.h" -#include "ompi/mca/coll/ml/coll_ml.h" -#include "ompi/mca/coll/ml/coll_ml_allocation.h" -#include "ompi/mca/coll/ml/coll_ml_inlines.h" -#define REDUCE_SMALL_MESSAGE_THRESHOLD 2048 - -static int mca_coll_ml_reduce_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int ret; - /* need to put in more */ - int count = coll_op->variable_fn_params.count; - ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; - - void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + - (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); - void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + - (size_t)coll_op->variable_fn_params.rbuf_offset); - - ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, - (char *) src); - if (ret < 0) { - return OMPI_ERROR; - } - - if (coll_op->variable_fn_params.root_flag) { - ML_VERBOSE(1,("In reduce unpack %d", - *(int *)((unsigned char*) src))); - } - - ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, sbuf val %lf, rbuf addr %p, rbuf offset %d, rbuf val %lf.", - coll_op->variable_fn_params.sbuf, coll_op->variable_fn_params.sbuf_offset, - *(double *) ((unsigned char *) coll_op->variable_fn_params.sbuf + - (size_t) coll_op->variable_fn_params.sbuf_offset), - coll_op->variable_fn_params.rbuf, coll_op->variable_fn_params.rbuf_offset, - *(double *) ((unsigned char *) coll_op->variable_fn_params.rbuf + - (size_t) coll_op->variable_fn_params.rbuf_offset))); - - return OMPI_SUCCESS; -} - - -static int -mca_coll_ml_reduce_task_setup (mca_coll_ml_collective_operation_progress_t *coll_op) -{ - int fn_idx, h_level, next_h_level, my_index; - mca_sbgp_base_module_t *sbgp; - mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; - - fn_idx = coll_op->sequential_routine.current_active_bcol_fn; - h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; - next_h_level = (fn_idx < coll_op->coll_schedule->n_fns - 1) ? - coll_op->coll_schedule->component_functions[fn_idx+1].h_level : -1; - sbgp = topo->component_pairs[h_level].subgroup_module; - my_index = sbgp->my_index; - - if (coll_op->variable_fn_params.root_flag) { - ML_VERBOSE(1,("In task completion Data in receiver buffer %d ", - *(int *)((unsigned char*) coll_op->variable_fn_params.rbuf + - coll_op->variable_fn_params.rbuf_offset))); - } - - /* determine the root for this level of the hierarchy */ - if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == next_h_level || - coll_op->global_root == sbgp->group_list[my_index]) { - /* I am the global root or I will be talking to the global root in the next round. 
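The route vector consulted below is the one built by mca_coll_ml_fill_in_route_tab() earlier in this commit: for each communicator rank it records the lowest hierarchy level at which that rank becomes reachable, plus the subgroup rank to address at that level. With that in mind, the root choice made next reduces to three cases (shorthand names here are illustrative only):

    /* Hedged restatement of the per-level root selection below. */
    if (route[global_root].level == next_h_level || i_am_global_root) {
        root = my_index;                 /* I relay toward the root next round */
    } else if (route[global_root].level == h_level) {
        root = route[global_root].rank;  /* root is addressable at this level */
    } else {
        root = 0;                        /* funnel through the subgroup leader */
    }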
*/ - coll_op->variable_fn_params.root = my_index; - } else if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == h_level) { - /* the root is in this level of my hierarchy */ - coll_op->variable_fn_params.root = coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].rank; - } else { - coll_op->variable_fn_params.root = 0; - } - - /* Set the route vector for this root */ - coll_op->variable_fn_params.root_route = - &coll_op->coll_schedule->topo_info->route_vector[sbgp->group_list[coll_op->variable_fn_params.root]]; - - /* Am I the root of this hierarchy? */ - coll_op->variable_fn_params.root_flag = (my_index == coll_op->variable_fn_params.root); - - /* For hierarchy switch btw source and destination buffer - * No need to make this switch for the first call .. - * */ - if (0 < fn_idx) { - int tmp_offset = coll_op->variable_fn_params.sbuf_offset; - coll_op->variable_fn_params.sbuf_offset = - coll_op->variable_fn_params.rbuf_offset; - coll_op->variable_fn_params.rbuf_offset = tmp_offset; - } - - return OMPI_SUCCESS; -} - -static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) -{ - /* local variables */ - void *buf; - - size_t dt_size; - int ret, frag_len, count; - - ptrdiff_t lb, extent; - - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; - mca_coll_ml_collective_operation_progress_t *new_op; - - mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); - - ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); - if (ret < 0) { - return OMPI_ERROR; - } - - dt_size = (size_t) extent; - - /* Keep the pipeline filled with fragments */ - while (coll_op->fragment_data.message_descriptor->n_active < - coll_op->fragment_data.message_descriptor->pipeline_depth) { - /* If an active fragment happens to have completed the collective during - * a hop into the progress engine, then don't launch a new fragment, - * instead break and return. - */ - if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled - == coll_op->fragment_data.message_descriptor->n_bytes_total) { - break; - } - - /* Get an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); - if (NULL == src_buffer_desc) { - /* If there exist outstanding fragments, then break out - * and let an active fragment deal with this later, - * there are no buffers available. - */ - if (0 < coll_op->fragment_data.message_descriptor->n_active) { - return OMPI_SUCCESS; - } else { - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. 
So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ - - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (coll_op->pending & REQ_OUT_OF_MEMORY) { - ML_VERBOSE(10,("Out of resources %p", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), - (opal_list_item_t *)coll_op); - ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - } - - /* Get a new collective descriptor and initialize it */ - new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE], - coll_op->fragment_data.message_descriptor->src_user_addr, - coll_op->fragment_data.message_descriptor->dest_user_addr, - coll_op->fragment_data.message_descriptor->n_bytes_total, - coll_op->fragment_data.message_descriptor->n_bytes_scheduled); - - ML_VERBOSE(1,(" In Reduce fragment progress %d %d ", - coll_op->fragment_data.message_descriptor->n_bytes_total, - coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, - src_buffer_desc->buffer_index, src_buffer_desc); - - new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; - new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; - - /* set the task setup callback */ - new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; - /* We need this address for pointer arithmetic in memcpy */ - buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr; - /* calculate the number of data types in this packet */ - count = (coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < - ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE]/4 )? 
- (coll_op->fragment_data.message_descriptor->n_bytes_total - - coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : - (size_t) coll_op->variable_fn_params.count); - - /* calculate the fragment length */ - frag_len = count * dt_size; - - ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, - (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) - coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); - if (ret < 0) { - return OMPI_ERROR; - } - - /* if root unpack the data */ - if (ompi_comm_rank(ml_module->comm) == coll_op->global_root ) { - new_op->process_fn = mca_coll_ml_reduce_unpack; - new_op->variable_fn_params.root_flag = true; - } else { - new_op->process_fn = NULL; - new_op->variable_fn_params.root_flag = false; - } - - new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; - - /* Setup fragment specific data */ - new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; - new_op->fragment_data.buffer_desc = src_buffer_desc; - new_op->fragment_data.fragment_size = frag_len; - (new_op->fragment_data.message_descriptor->n_active)++; - - /* Set in Reduce Buffer arguments */ - ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count, - coll_op->variable_fn_params.dtype, src_buffer_desc, - 0, (ml_module->payload_block->size_buffer - - ml_module->data_offset)/2, frag_len, - src_buffer_desc->data_addr); - - new_op->variable_fn_params.buffer_size = frag_len; - new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; - new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; - new_op->variable_fn_params.root = coll_op->variable_fn_params.root; - new_op->global_root = coll_op->global_root; - new_op->variable_fn_params.op = coll_op->variable_fn_params.op; - new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; - new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); - - ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d", - new_op->variable_fn_params.buffer_size, - new_op->fragment_data.fragment_size, - new_op->fragment_data.message_descriptor->n_bytes_scheduled)); - /* initialize first coll */ - new_op->sequential_routine.seq_task_setup(new_op); - - /* append this collective !! 
*/ - OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - opal_list_append(&mca_coll_ml_component.sequential_collectives, - (opal_list_item_t *)new_op); - OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); - - } - - return OMPI_SUCCESS; -} - -static inline __opal_attribute_always_inline__ -int parallel_reduce_start (const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_ml_module_t *ml_module, - ompi_request_t **req, - int small_data_reduce, - int large_data_reduce) { - ptrdiff_t lb, extent; - size_t pack_len, dt_size; - mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; - mca_coll_ml_collective_operation_progress_t * coll_op = NULL; - bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); - mca_coll_ml_component_t *cm = &mca_coll_ml_component; - int ret, n_fragments = 1, frag_len, - pipeline_depth, n_dts_per_frag, rank; - - if (MPI_IN_PLACE == sbuf) { - sbuf = rbuf; - } - - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (ret < 0) { - return OMPI_ERROR; - } - - rank = ompi_comm_rank (comm); - - dt_size = (size_t) extent; - pack_len = count * dt_size; - - /* We use separate receive and send buffers, so only half the buffer is usable. */ - if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { - /* The length of the message cannot be larger than the ML buffer size */ - assert(pack_len <= ml_module->payload_block->size_buffer); - - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - - ML_VERBOSE(10,("Using small data reduce (threshold = %d)", - REDUCE_SMALL_MESSAGE_THRESHOLD)); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_reduce_functions[small_data_reduce], - sbuf, rbuf, pack_len, 0); - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, - src_buffer_desc->buffer_index, src_buffer_desc); - - coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; - coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; - coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; - coll_op->variable_fn_params.src_desc = src_buffer_desc; - coll_op->variable_fn_params.count = count; - - ret = ompi_datatype_copy_content_same_ddt(dtype, count, - (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); - if (ret < 0){ - return OMPI_ERROR; - } - - } else if (cm->enable_fragmentation || !contiguous) { - ML_VERBOSE(1,("Using Fragmented Reduce ")); - - /* fragment the data */ - /* reject datatypes whose single element cannot fit in one ML fragment */ - if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { - ML_ERROR(("Sorry, but we don't support datatypes that large")); - return OMPI_ERROR; - } - - /* calculate the number of data types that can fit per ml-buffer */ - n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size); - - /* calculate the number of fragments */ - n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ - - /* calculate the actual pipeline depth */ - pipeline_depth = n_fragments < cm->pipeline_depth ?
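/* A sketch with assumed numbers (none of them from the source): reducing count = 1000 elements of an 8-byte datatype with a 4096-byte BCOL_REDUCE threshold gives n_dts_per_frag = 4096 / (4 * 8) = 128, n_fragments = (1000 + 127) / 128 = 8, and, with cm->pipeline_depth = 4, the ternary below caps the depth at min(8, 4) = 4 fragments in flight. */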
n_fragments : cm->pipeline_depth; - - /* calculate the fragment size */ - frag_len = n_dts_per_frag * dt_size; - - /* allocate an ml buffer */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_reduce_functions[small_data_reduce], - sbuf,rbuf, - pack_len, - 0 /* offset for first pack */); - - MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, - src_buffer_desc->buffer_index, src_buffer_desc); - - - coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; - coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; - - coll_op->fragment_data.message_descriptor->n_active = 1; - coll_op->full_message.n_bytes_scheduled = frag_len; - coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress; - coll_op->full_message.pipeline_depth = pipeline_depth; - coll_op->fragment_data.current_coll_op = small_data_reduce; - coll_op->fragment_data.fragment_size = frag_len; - - coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ - coll_op->variable_fn_params.buffer_size = frag_len; - coll_op->variable_fn_params.src_desc = src_buffer_desc; - /* copy into the ml-buffer */ - ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, - (char *) src_buffer_desc->data_addr, (char *) sbuf); - if (ret < 0) { - return OMPI_ERROR; - } - } else { - ML_VERBOSE(1,("Using zero-copy ptp reduce")); - coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_reduce_functions[large_data_reduce], - sbuf, rbuf, pack_len, 0); - - coll_op->variable_fn_params.userbuf = - coll_op->variable_fn_params.sbuf = sbuf; - - coll_op->variable_fn_params.rbuf = rbuf; - - /* The ML buffer is used for testing. Later, when we - * switch to use knem/mmap/portals this should be replaced - * appropriately - */ - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - while (NULL == src_buffer_desc) { - opal_progress(); - src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - } - - coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; - coll_op->variable_fn_params.src_desc = src_buffer_desc; - coll_op->variable_fn_params.count = count; - } - - coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack; - - /* Set common parts */ - coll_op->fragment_data.buffer_desc = src_buffer_desc; - coll_op->variable_fn_params.dtype = dtype; - coll_op->variable_fn_params.op = op; - - /* NTH: the root, root route, and root flag are set in the task setup */ - - /* Fill in the function arguments */ - coll_op->variable_fn_params.sbuf_offset = 0; - coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer - - ml_module->data_offset)/2; - - /* Keep track of the global root of this operation */ - coll_op->global_root = root; - - coll_op->variable_fn_params.sequence_num = - OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); - coll_op->sequential_routine.current_active_bcol_fn = 0; - /* set the task setup callback */ - coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; - - /* Reduce requires the schedule to be fixed. If we use other (changing) schedule, - the operation might result in different result. 
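That is, floating-point reduction is not associative, so a combining order that changes from run to run could produce a different rounded result.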
*/ - coll_op->coll_schedule->component_functions = coll_op->coll_schedule-> - comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level]; - - /* Launch the collective */ - ret = mca_coll_ml_launch_sequential_collective (coll_op); - if (OMPI_SUCCESS != ret) { - ML_VERBOSE(10, ("Failed to launch reduce collective")); - return ret; - } - - *req = &coll_op->full_message.super; - - return OMPI_SUCCESS; -} - - -int mca_coll_ml_reduce(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) { - - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; - int ret = OMPI_SUCCESS; - ompi_request_t *req; - - if (OPAL_UNLIKELY(!ompi_op_is_commute(op) || !opal_datatype_is_contiguous_memory_layout(&dtype->super, count))) { - /* coll/ml does not handle non-commutative operations at this time. Fall back - * on another collective module */ - return ml_module->fallback.coll_reduce (sbuf, rbuf, count, dtype, op, root, comm, - ml_module->fallback.coll_reduce_module); - } - - ML_VERBOSE(10,("Calling ML Reduce")); - ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, - root, comm, (mca_coll_ml_module_t *)module, - &req, ML_SMALL_DATA_REDUCE, - ML_LARGE_DATA_REDUCE); - if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - /* Blocking reduce */ - ret = ompi_request_wait(&req, MPI_STATUS_IGNORE); - - ML_VERBOSE(10, ("Blocking Reduce is done")); - - return ret; -} - - -int mca_coll_ml_reduce_nb(const void *sbuf, void *rbuf, int count, - struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) { - - int ret = OMPI_SUCCESS; - mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; - - if (OPAL_UNLIKELY(!ompi_op_is_commute(op) || !opal_datatype_is_contiguous_memory_layout(&dtype->super, count))) { - /* coll/ml does not handle non-commutative operations at this time.
fallback - * on another collective module */ - return ml_module->fallback.coll_ireduce (sbuf, rbuf, count, dtype, op, root, comm, req, - ml_module->fallback.coll_ireduce_module); - } - - ML_VERBOSE(10,("Calling Ml Reduce ")); - ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, - root, comm, ml_module, - req, ML_SMALL_DATA_REDUCE, - ML_LARGE_DATA_REDUCE); - if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { - ML_VERBOSE(10, ("Failed to launch")); - return ret; - } - - - ML_VERBOSE(10, ("Non-blocking Reduce is done")); - - return OMPI_SUCCESS; - -} diff --git a/ompi/mca/coll/ml/coll_ml_resource_affinity.c b/ompi/mca/coll/ml/coll_ml_resource_affinity.c deleted file mode 100644 index 23d9a0fc71..0000000000 --- a/ompi/mca/coll/ml/coll_ml_resource_affinity.c +++ /dev/null @@ -1,147 +0,0 @@ -#include "opal/mca/carto/carto.h" -#include "opal/mca/carto/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_graph.h" -#include "opal/mca/paffinity/base/base.h" -#include "ompi/constants.h" - -#include "orte/mca/ess/ess.h" -#include "coll_ml_resource_affinity.h" - -int get_dev_distance_for_all_procs(opal_carto_graph_t *graph, const char *device) -{ - opal_paffinity_base_cpu_set_t cpus; - opal_carto_base_node_t *device_node; - int min_distance = -1, i, num_processors; - - if(opal_paffinity_base_get_processor_info(&num_processors) != OMPI_SUCCESS) { - num_processors = 100; /* Choose something big enough */ - } - - device_node = opal_carto_base_find_node(graph, device); - - /* no topology info for device found. Assume that it is close */ - if(NULL == device_node) - return 0; - - OPAL_PAFFINITY_CPU_ZERO(cpus); - opal_paffinity_base_get(&cpus); - - for (i = 0; i < num_processors; i++) { - opal_carto_base_node_t *slot_node; - int distance, socket, core; - char *slot; - - if(!OPAL_PAFFINITY_CPU_ISSET(i, cpus)) - continue; - - opal_paffinity_base_get_map_to_socket_core(i, &socket, &core); - asprintf(&slot, "socket%d", socket); - - slot_node = opal_carto_base_find_node(graph, slot); - - free(slot); - - if(NULL == slot_node) - return 0; - - distance = opal_carto_base_spf(graph, slot_node, device_node); - - if(distance < 0) - return 0; - - if(min_distance < 0 || min_distance > distance) - min_distance = distance; - } - - return min_distance; -} - -int get_dev_distance_proc(opal_carto_graph_t *graph, - const char *device,int rank, struct ompi_proc_t *proc){ - opal_paffinity_base_cpu_set_t cpus; - opal_carto_base_node_t *device_node; - opal_carto_base_node_t *slot_node; - int distance, socket, core; - char *slot; - int process_id; - int nrank; - - nrank = orte_ess.get_node_rank(&(proc->proc_name)); - - opal_paffinity_base_get_physical_processor_id(nrank, &process_id); - - device_node = opal_carto_base_find_node(graph, device); - - /* no topology info for device found. 
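(opal_carto_base_find_node() returned NULL for this device.)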
Assume that it is close */ - if(NULL == device_node) - return 0; - - OPAL_PAFFINITY_CPU_ZERO(cpus); - opal_paffinity_base_get(&cpus); - - - - opal_paffinity_base_get_map_to_socket_core(process_id, &socket, &core); - asprintf(&slot, "socket%d", socket); - ML_VERBOSE(10,("The socket address is %d",socket)); - - slot_node = opal_carto_base_find_node(graph, slot); - - free(slot); - - if(NULL == slot_node) - return -1; - - distance = opal_carto_base_spf(graph, slot_node, device_node); - - if(distance < 0) - return -1; - - return distance; - -} - -int coll_ml_select_leader(mca_coll_ml_module_t *ml_module, - mca_sbgp_base_module_t *sbgp_module, - int *rank_in_comm, - struct ompi_proc_t ** procs, - int nprocs){ - - int rank, dist1, dist2,dist; - int min_dist = 10000; - int i,leader = 10000; - struct ompi_proc_t *proc = NULL; - - for (i=0; i<nprocs; i++){ - rank = rank_in_comm[sbgp_module->group_list[i]]; - proc = procs[sbgp_module->group_list[i]]; - dist1 = get_dev_distance_proc(ml_module->sm_graph,"mem0",rank,proc); - dist2 = get_dev_distance_proc(ml_module->ib_graph,"mthca0",rank,proc); - - dist = dist1 + dist2; - - ML_VERBOSE(10,("The distance for proc %d dist1 %d, dist2 %d",i,dist1,dist2)); - if ((dist < min_dist) || ((dist == min_dist) && (i < leader))) { - leader = i; - min_dist = dist; - } - } - - return leader; -} - - -int coll_ml_construct_resource_graphs(mca_coll_ml_module_t *ml_module){ - - opal_carto_base_get_host_graph(&ml_module->sm_graph,"Memory"); - opal_carto_base_get_host_graph(&ml_module->ib_graph,"Infiniband"); - - /* debug - opal_graph_print(ml_module->sm_graph); - */ - return 0; - -} diff --git a/ompi/mca/coll/ml/coll_ml_resource_affinity.h b/ompi/mca/coll/ml/coll_ml_resource_affinity.h deleted file mode 100644 index c64c214ee0..0000000000 --- a/ompi/mca/coll/ml/coll_ml_resource_affinity.h +++ /dev/null @@ -1,19 +0,0 @@ -#include "opal/mca/carto/carto.h" -#include "opal/mca/carto/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_graph.h" -#include "coll_ml.h" - - -/* Get the host graph for SM and Infiniband */ -int discover_on_node_resources(const char device); -int get_dev_distance_for_all_procs(opal_carto_graph_t *graph, - const char *device); -int get_dev_distance_proc(opal_carto_graph_t *graph, - const char *device,int rank,struct ompi_proc_t *proc); -int coll_ml_select_leader(mca_coll_ml_module_t *ml_module, - mca_sbgp_base_module_t *sbgp_module, - int *rank_in_comm, - struct ompi_proc_t ** procs, - int nprocs); -int coll_ml_construct_resource_graphs(mca_coll_ml_module_t *ml_module); diff --git a/ompi/mca/coll/ml/coll_ml_select.c b/ompi/mca/coll/ml/coll_ml_select.c deleted file mode 100644 index a46197b869..0000000000 --- a/ompi/mca/coll/ml/coll_ml_select.c +++ /dev/null @@ -1,358 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/* - * Code for selecting a collective function. The selection is based on - * comm-time attributes and invoke-time attributes.
- * - * comm-time attributes: Attributes that can be used to filter the available - * collective functions at communicator init time. Example attributes include - * the comm size and the msg size supported by bcols. - * - * invoke-time attributes: Attributes that can be used to select the function - * for a given collective when the collective is invoked. - * - */ - -#include "coll_ml_select.h" - -static int msg_to_range(size_t msg_len) -{ - int range; - - if (msg_len < MSG_RANGE_INITIAL) { - return 1; - } - - range = (int) log10((double)((msg_len / MSG_RANGE_INITIAL))); - - if (range > NUM_MSG_RANGES) - return NUM_MSG_RANGES; - - return range; -} - -static int cmp_comm_attribs(struct mca_bcol_base_coll_fn_comm_attributes_t *attrib_var, - struct mca_bcol_base_coll_fn_comm_attributes_t *attrib_bcol){ - - - if (!(attrib_var->comm_size_max <= attrib_bcol->comm_size_max)) { - return -1 ; - } - -#if 0 /* Manju: please fix it*/ - if (attrib_var->data_src != attrib_bcol->data_src) { - return -1; - } - - if (attrib_var->waiting_semantics != - attrib_bcol->waiting_semantics) { - return -1; - } -#endif - - return 0; -} - -/* - * Table that holds function names - */ -static int init_invoke_table(mca_coll_ml_module_t *ml_module) -{ - int i=0,j=0,k=0, index_topo; - int bcoll_type; - struct mca_bcol_base_module_t *bcol_module = NULL; - int j_bcol_module=0; - int i_hier=0; - mca_coll_ml_topology_t *topo; - - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &ml_module->topo_list[index_topo]; - if (COLL_ML_TOPO_DISABLED == topo->status) { - /* skip the topology */ - continue; - } - for (i_hier = 0; i_hier < topo->n_levels; i_hier++) { - - for (j_bcol_module = 0; - j_bcol_module < topo->component_pairs[i_hier].num_bcol_modules; - ++j_bcol_module) { - - bcol_module = topo->component_pairs[i_hier].bcol_modules[j_bcol_module]; - - for (bcoll_type = 0; bcoll_type < BCOL_NUM_OF_FUNCTIONS ; bcoll_type++){ - for (i=0; i<NUM_MSG_RANGES; i++) { - for (j=0; j<OMPI_OP_NUM_OF_TYPES; j++) { - for (k=0; k<OMPI_DATATYPE_MAX_PREDEFINED; k++) { - bcol_module->filtered_fns_table[DATA_SRC_UNKNOWN][BLOCKING][bcoll_type][i][j][k] - = NULL; - - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][BLOCKING][bcoll_type][i][j][k] - = NULL; - - bcol_module->filtered_fns_table[DATA_SRC_UNKNOWN][NON_BLOCKING][bcoll_type][i][j][k] - = NULL; - - bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][bcoll_type][i][j][k] - = NULL; - - } - } - } - } - } - - } - } - - return 0; -} - -static int add_to_invoke_table(mca_bcol_base_module_t *bcol_module, - mca_bcol_base_coll_fn_desc_t *fn_filtered, - mca_coll_ml_module_t *ml_module) -{ - struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL; - int bcoll_type, data_src_type, waiting_semantic; - int range_min,range_max; - int i=0,j=0,k=0; - - - if((NULL == fn_filtered->inv_attr)||(NULL == fn_filtered->comm_attr)) { - return OMPI_ERROR; - } - - ML_VERBOSE(10, ("Calling add_to_invoke_table %p",fn_filtered->coll_fn)); - - inv_attribs = fn_filtered->inv_attr; - bcoll_type = fn_filtered->comm_attr->bcoll_type; - data_src_type = fn_filtered->comm_attr->data_src; - waiting_semantic = fn_filtered->comm_attr->waiting_semantics; - - range_min = msg_to_range(inv_attribs->bcol_msg_min); - range_max = msg_to_range(inv_attribs->bcol_msg_max); - - for (j=0; j<OMPI_OP_NUM_OF_TYPES; j++) { - for (k=0; k<OMPI_DATATYPE_MAX_PREDEFINED; k++) { - if ((inv_attribs->datatype_bitmap & (1ul << k)) && (inv_attribs->op_types_bitmap & (1ul << j))){ - - for (i=range_min; i<=range_max; i++) { - bcol_module->filtered_fns_table[data_src_type][waiting_semantic][bcoll_type][i][j][k] - = fn_filtered; - ML_VERBOSE(21, ("Putting functions %d %d %d %d %p", bcoll_type, i, j, k, fn_filtered)); - } - } - } - } - - return 0; - -}
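The two range helpers in this file (msg_to_range() above and count_to_msg_range() below) implement the same decade bucketing. A minimal standalone sketch of the behavior, with invented values for the two tuning macros (the real definitions live elsewhere in coll/ml):

#include <math.h>

#define MSG_RANGE_INITIAL 10240 /* assumed bucket base, for illustration only */
#define NUM_MSG_RANGES    5     /* assumed bucket count, for illustration only */

/* Bucket 1 holds everything below MSG_RANGE_INITIAL; above that, one
 * bucket per factor-of-10 growth, clamped to NUM_MSG_RANGES. */
static int example_msg_to_range(size_t msg_len)
{
    int range;

    if (msg_len < MSG_RANGE_INITIAL) {
        return 1;
    }

    range = (int) log10((double)(msg_len / MSG_RANGE_INITIAL));

    return (range > NUM_MSG_RANGES) ? NUM_MSG_RANGES : range;
}

/* example_msg_to_range(4096)    -> 1 (below the base)
 * example_msg_to_range(102400)  -> 1 (10x the base: log10(10) = 1)
 * example_msg_to_range(1048576) -> 2 (~102x the base: log10(102) ~ 2) */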
/* - * Maps count to the msg range that is used for the - * function table - * RANGE 0 is for small messages (say small msg = 10K) - * MSG RANGE 1 - 10K - 100K - * RANGE 2 - 100K - 1M - * RANGE 3 - 1M - 10M - * - * This is valid only when MSG_RANGE_INC is 10. - * For other values the function should replace log10 with a log of - * base MSG_RANGE_INC - */ -static int count_to_msg_range(int count,struct ompi_datatype_t *dtype) -{ - size_t msg_len =0,dt_size; - int range = 0 ; - - ompi_datatype_type_size(dtype, &dt_size); - msg_len = count*dt_size; - - if (msg_len < MSG_RANGE_INITIAL) { - return 1; - } - - range = (int) log10((double)((msg_len/MSG_RANGE_INITIAL))); - - if (range > NUM_MSG_RANGES) - return NUM_MSG_RANGES; - - return range; - -} - -/* Based on the attributes filled in by comm_select_attributes, - select functions for invoke-time filtering */ - - -static int build_algorithms_table(mca_coll_ml_module_t *ml_module,struct - mca_bcol_base_coll_fn_comm_attributes_t *my_comm_attrib) -{ - int i_hier, j_bcol_module, k_bcol_fn, index_topo; - struct mca_bcol_base_module_t *bcol_module = NULL; - opal_list_t *fn_filtered_list; - opal_list_item_t *item; - mca_coll_ml_topology_t *topo; - - /* - * Go through each hierarchy and for each - * bcol module in the hierarchy, select the algorithms. - */ - for (index_topo = 0; index_topo < COLL_ML_TOPO_MAX; index_topo++) { - topo = &ml_module->topo_list[index_topo]; - for (i_hier = 0; i_hier < topo->n_levels; i_hier++) { - my_comm_attrib->comm_size_max = - topo->component_pairs[i_hier].subgroup_module->group_size; - - for (j_bcol_module = 0; - j_bcol_module < topo->component_pairs[i_hier].num_bcol_modules; - ++j_bcol_module) { - - bcol_module = topo->component_pairs[i_hier].bcol_modules[j_bcol_module]; - - /* Go through all bcols and available bcol functions */ - for (k_bcol_fn = 0; k_bcol_fn < BCOL_NUM_OF_FUNCTIONS; k_bcol_fn++) { - struct mca_bcol_base_coll_fn_desc_t *fn_filtered = NULL; - - /* Query the function attributes */ - fn_filtered_list = - &(bcol_module->bcol_fns_table[k_bcol_fn]); - - - if (0 == opal_list_get_size(fn_filtered_list)) { - continue; - } - /* All definitions of a collective type are stored in the list. - * Each item in the list is checked for compatibility of its - * attributes and stored in the filtered table */ - for (item = opal_list_get_first(fn_filtered_list); - item != opal_list_get_end(fn_filtered_list); - item = opal_list_get_next(item)){ - - fn_filtered = (struct mca_bcol_base_coll_fn_desc_t *)item; - if (cmp_comm_attribs(my_comm_attrib, fn_filtered->comm_attr) < 0) { - /* Criteria not satisfied; continue to the next bcol function */ - continue; - } - - /* - * Add bcol function to be available for invoke-time selection - */ - add_to_invoke_table(bcol_module, fn_filtered, ml_module); - } - - } - } - } - } - - return 0; - -} - -int mca_coll_ml_build_filtered_fn_table(mca_coll_ml_module_t *ml_module) -{ - - struct mca_bcol_base_coll_fn_comm_attributes_t *my_comm_attrib = NULL; - - - /* Init table storing all filtered functions */ - init_invoke_table(ml_module); - - my_comm_attrib = malloc(sizeof(struct mca_bcol_base_coll_fn_comm_attributes_t)); - - if (!my_comm_attrib) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - my_comm_attrib->comm_size_min = 0; - - /* - * These values should (maybe) be passed using MCA parameters - */ -#if 0 /* Manju: please fix it*/ - my_comm_attrib->data_src = DATA_SRC_KNOWN; - my_comm_attrib->waiting_semantics = BLOCKING; -#endif - - if (build_algorithms_table(ml_module,my_comm_attrib)) { - return OMPI_ERROR; - } - - 
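/* Once the table is built, invoke-time selection (mca_select_bcol_function() later in this file) is a straight array lookup; schematically, with illustrative index values only: fn = bcol_module->filtered_fns_table[DATA_SRC_KNOWN][BLOCKING][BCOL_ALLREDUCE][range][dtype_id][op_type]. A NULL entry means no bcol function survived the comm-time filter for that combination, and the lookup path returns OMPI_ERROR. */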
free(my_comm_attrib); - - return OMPI_SUCCESS; - -} - -#if 0 -static struct mca_bcol_base_coll_fn_invoke_attributes_t *mca_construct_invoke_attributes( - struct ompi_datatype_t *dtype, int count, - struct ompi_op_t op_type) -{ - size_t dt_size, msg_size; - struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL; - - ompi_datatype_type_size(dtype, &dt_size); - msg_size = count*dt_size; - - - inv_attribs = malloc(sizeof(struct mca_bcol_base_coll_fn_invoke_attributes_t)); - - /* Fix : We might need to have range for msg size - For now selection will - * be based on maximum value - */ - inv_attribs->bcol_msg_min = 0; - inv_attribs->bcol_msg_max = msg_size; - - return inv_attribs; -} -#endif - -int mca_select_bcol_function(mca_bcol_base_module_t *bcol_module, - int bcoll_type, - bcol_function_args_t *bcol_fn_arguments, - mca_bcol_base_function_t *ml_fn_arguments ) -{ - - struct mca_bcol_base_coll_fn_desc_t *fn_filtered = NULL; - int msg_range=0; - int ret; - int data_src_type = DATA_SRC_KNOWN, waiting_type = BLOCKING; - - msg_range = - count_to_msg_range(bcol_fn_arguments->count, - bcol_fn_arguments->dtype); - if ((BCOL_ALLREDUCE == bcoll_type) || (BCOL_REDUCE == bcoll_type)) { - /* needs to be resolved, the op structure has changed, there is no field called "op_type" */ - fn_filtered = - bcol_module->filtered_fns_table[data_src_type][waiting_type][bcoll_type][msg_range][bcol_fn_arguments->dtype->id][bcol_fn_arguments->op->op_type]; - } - else { - fn_filtered = - bcol_module->filtered_fns_table[data_src_type][waiting_type][bcoll_type][msg_range][bcol_fn_arguments->dtype->id][0]; - - } - - if (NULL == fn_filtered) { - return OMPI_ERROR; - } - - ret = (fn_filtered->coll_fn)(bcol_fn_arguments,ml_fn_arguments); - return ret; -} - diff --git a/ompi/mca/coll/ml/coll_ml_select.h b/ompi/mca/coll/ml/coll_ml_select.h deleted file mode 100644 index 32e3706d7a..0000000000 --- a/ompi/mca/coll/ml/coll_ml_select.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef MCA_COLL_ML_SELECT_H -#define MCA_COLL_ML_SELECT_H - - -#include "ompi_config.h" - -#include -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/op/op.h" -#include "ompi/mca/bcol/bcol.h" -#include "coll_ml.h" -#include "coll_ml_inlines.h" - - - -/* Forward declaration */ -struct mca_coll_ml_module_t; - -int mca_select_bcol_function(mca_bcol_base_module_t *bcol_module, - int bcoll_type, - bcol_function_args_t *bcol_fn_arguments, - mca_bcol_base_function_t *ml_fn_arguments ); -/* - * Goes through the function table and filters the collectives functions - * based on comm-time attributes. - */ -int mca_coll_ml_build_filtered_fn_table(struct mca_coll_ml_module_t *ml_module); - -#endif /* MCA_COLL_ML_SELECT_H */ diff --git a/ompi/mca/coll/ml/common_sym_whitelist.txt b/ompi/mca/coll/ml/common_sym_whitelist.txt deleted file mode 100644 index 6a99e2b40c..0000000000 --- a/ompi/mca/coll/ml/common_sym_whitelist.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore symbols in this component that are auto-generated and we -# can't do anything about them (e.g., flex/bison symbols). -coll_ml_config_yyleng -coll_ml_config_yytext diff --git a/ompi/mca/coll/ml/help-mpi-coll-ml.txt b/ompi/mca/coll/ml/help-mpi-coll-ml.txt deleted file mode 100644 index 60ca60dfa1..0000000000 --- a/ompi/mca/coll/ml/help-mpi-coll-ml.txt +++ /dev/null @@ -1,64 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2009-2014 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2014 Research Organization for Information Science -# and Technology (RIST). All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI's Hierarchical Collective -# Component (coll/ml). -# -[empty-sub-group] -ML topology configuration explicitly requested for this subgroup: - - %s - -This configuration results in the creation of empty groups. As a result, the ML -framework cannot configure the requested collective operations and will be -disabled. One configuration that might enable the ML component is ---mca bcol_base_string basesmuma,ptpcoll ---mca sbgp_base_subgroups_string basesmuma,p2p - -[allreduce-not-supported] -This BCOL is configured in one of the hierarchy levels: - - %s - -The BCOL does not support Allreduce for all -operation and datatype combinations. In addition, you did not suggest -alternate topology building configurations. - -[allreduce-alt-nosupport] -The hierarchy is configured with an alternate BCOL: - - %s - -Neither the original topology nor the alternate topology supports Allreduce for all -operation and datatype combinations. In addition, you did not suggest -alternate topology building configurations. - -[fragmentation-disabled] - -ML could not be used because the mca param coll_ml_enable_fragmentation -was set to zero and there is a bcol that does not support the -zero-copy method. - -[static-bcast-disabled] - -ML could not be used because the mca param coll_ml_bcast_algorithm -was not set to static and no other broadcast implementation was available. - -[coll-ml-check-error] - -ML detected an error on communicator %s - -This communicator cannot be used any more - -[coll-ml-check-fatal-error] - -ML detected an unrecoverable error on intrinsic communicator %s - -The program will now abort diff --git a/ompi/mca/coll/ml/mca-coll-ml.config b/ompi/mca/coll/ml/mca-coll-ml.config deleted file mode 100644 index 6410b11923..0000000000 --- a/ompi/mca/coll/ml/mca-coll-ml.config +++ /dev/null @@ -1,170 +0,0 @@ -################################## -# ML collective configuration file -################################## -# NOTE (by Pasha): -# Since the ML configuration infrastructure is limited at this stage, we do not support some tunings; the parser -# understands these values and keys, but we have no place to load all of them. -# threshold - ML infrastructure does not handle multiple thresholds. -# fragmentation - ML infrastructure does not support fragmentation tuning per collective. -################################## - -# Defining collective section -[BARRIER] -# Defining message size section. We will support small/large. In the future we may add more options. Barrier is a special case, because it is the only collective that does not transfer any data, so for this specific case we use small - -# Since ML does not define any algorithm for BARRIER, we just use the default. Later we will have to introduce an algorithm name for Barrier -algorithm = ML_BARRIER_DEFAULT - -# Hierarchy setup: -# -# full_hr - means all possible levels of hierarchy (the list of possible levels is defined on the user command line) -# full_hr_no_basesocket - means all possible levels of hierarchy (the list of possible levels is defined on the user command line) -# except the basesocket subgroup.
-# ptp_only - only ptp hierarchy -# iboffload_only - only iboffload hierarchy -hierarchy = full_hr - -[IBARRIER] - -algorithm = ML_BARRIER_DEFAULT -hierarchy = full_hr - -[BCAST] - -# bcast supports: ML_BCAST_SMALL_DATA_KNOWN, ML_BCAST_SMALL_DATA_UNKNOWN, ML_BCAST_SMALL_DATA_SEQUENTIAL -algorithm = ML_BCAST_SMALL_DATA_KNOWN -hierarchy = full_hr - -# bcast supports: ML_BCAST_LARGE_DATA_KNOWN, ML_BCAST_LARGE_DATA_UNKNOWN, ML_BCAST_LARGE_DATA_SEQUENTIAL -algorithm = ML_BCAST_LARGE_DATA_KNOWN -hierarchy = full_hr - -[IBCAST] - -algorithm = ML_BCAST_SMALL_DATA_KNOWN -hierarchy = full_hr - -algorithm = ML_BCAST_LARGE_DATA_KNOWN -hierarchy = full_hr - -[GATHER] - -# gather supports: ML_SMALL_DATA_GATHER -algorithm = ML_SMALL_DATA_GATHER -hierarchy = full_hr - -# gather supports: ML_LARGE_DATA_GATHER -algorithm = ML_LARGE_DATA_GATHER -hierarchy = full_hr - -[IGATHER] - -# gather supports: ML_SMALL_DATA_GATHER -algorithm = ML_SMALL_DATA_GATHER -hierarchy = full_hr - -# gather supports: ML_LARGE_DATA_GATHER -algorithm = ML_LARGE_DATA_GATHER -hierarchy = full_hr - -[ALLGATHER] - -# allgather supports: ML_SMALL_DATA_ALLGATHER -algorithm = ML_SMALL_DATA_ALLGATHER -hierarchy = full_hr - -# allgather supports: ML_LARGE_DATA_ALLGATHER -algorithm = ML_LARGE_DATA_ALLGATHER -hierarchy = full_hr - -[IALLGATHER] - -# allgather supports: ML_SMALL_DATA_ALLGATHER -algorithm = ML_SMALL_DATA_ALLGATHER -hierarchy = full_hr - -# allgather supports: ML_LARGE_DATA_ALLGATHER -algorithm = ML_LARGE_DATA_ALLGATHER -hierarchy = full_hr - -[ALLTOALL] - -# alltoall supports: ML_SMALL_DATA_ALLTOALL -algorithm = ML_SMALL_DATA_ALLTOALL -hierarchy = ptp_only - -# alltoall supports: ML_LARGE_DATA_ALLTOALL -algorithm = ML_LARGE_DATA_ALLTOALL -hierarchy = ptp_only - -[IALLTOALL] - -# alltoall supports: ML_SMALL_DATA_ALLTOALL -algorithm = ML_SMALL_DATA_ALLTOALL -hierarchy = ptp_only - -# alltoall supports: ML_LARGE_DATA_ALLTOALL -algorithm = ML_LARGE_DATA_ALLTOALL -hierarchy = ptp_only - -[ALLREDUCE] - -# allreduce supports: ML_SMALL_DATA_ALLREDUCE -algorithm = ML_SMALL_DATA_ALLREDUCE -hierarchy = full_hr - -# allreduce supports: ML_LARGE_DATA_ALLREDUCE -algorithm = ML_LARGE_DATA_ALLREDUCE -hierarchy = full_hr - -[IALLREDUCE] - -# allreduce supports: ML_SMALL_DATA_ALLREDUCE -algorithm = ML_SMALL_DATA_ALLREDUCE -hierarchy = full_hr - -# allreduce supports: ML_LARGE_DATA_ALLREDUCE -algorithm = ML_LARGE_DATA_ALLREDUCE -hierarchy = full_hr - -[REDUCE] - -# reduce supports: ML_SMALL_DATA_REDUCE -algorithm = ML_SMALL_DATA_REDUCE -hierarchy = full_hr - -# reduce supports: ML_LARGE_DATA_REDUCE -algorithm = ML_LARGE_DATA_REDUCE -hierarchy = full_hr - -[IREDUCE] - -# reduce supports: ML_SMALL_DATA_REDUCE -algorithm = ML_SMALL_DATA_REDUCE -hierarchy = full_hr - -# reduce supports: ML_LARGE_DATA_REDUCE -algorithm = ML_LARGE_DATA_REDUCE -hierarchy = full_hr - - - -[SCATTER] - -# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL -algorithm = ML_SCATTER_SMALL_DATA_SEQUENTIAL -hierarchy = full_hr - -# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL -algorithm = ML_SCATTER_SMALL_DATA_SEQUENTIAL -hierarchy = full_hr - -[ISCATTER] - -# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL -algorithm = ML_SCATTER_SMALL_DATA_SEQUENTIAL -hierarchy = full_hr - -# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL -algorithm = ML_SCATTER_SMALL_DATA_SEQUENTIAL -hierarchy = full_hr diff --git a/ompi/mca/coll/ml/owner.txt b/ompi/mca/coll/ml/owner.txt deleted file mode 100644 index 51ea04a517..0000000000 --- 
a/ompi/mca/coll/ml/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL? -status: unmaintained diff --git a/ompi/mca/sbgp/Makefile.am b/ompi/mca/sbgp/Makefile.am deleted file mode 100644 index d07ea3306b..0000000000 --- a/ompi/mca/sbgp/Makefile.am +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - - -# main library setup -noinst_LTLIBRARIES = libmca_sbgp.la -libmca_sbgp_la_SOURCES = - -# header setup -nobase_ompi_HEADERS = -nobase_nodist_ompi_HEADERS = - -# local files -headers = sbgp.h -libmca_sbgp_la_SOURCES += $(headers) $(nodist_headers) - -# Conditionally install the header files -if WANT_INSTALL_HEADERS -nobase_ompi_HEADERS += $(headers) -nobase_nodist_ompi_HEADERS += $(nodist_headers) -ompidir = $(ompiincludedir)/ompi/mca/sbgp -else -ompidir = $(includedir) -endif - -include base/Makefile.am - -distclean-local: - rm -f base/static-components.h diff --git a/ompi/mca/sbgp/base/Makefile.am b/ompi/mca/sbgp/base/Makefile.am deleted file mode 100644 index c520ef7bb7..0000000000 --- a/ompi/mca/sbgp/base/Makefile.am +++ /dev/null @@ -1,17 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - - -headers += \ - base/base.h -libmca_sbgp_la_SOURCES += \ - base/sbgp_base_frame.c \ - base/sbgp_base_init.c diff --git a/ompi/mca/sbgp/base/base.h b/ompi/mca/sbgp/base/base.h deleted file mode 100644 index f421aac1dd..0000000000 --- a/ompi/mca/sbgp/base/base.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_SBGP_BASE_H -#define MCA_SBGP_BASE_H - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "opal/mca/base/mca_base_framework.h" -/* - * Global functions for SBGP - */ - -/* components in use */ -OMPI_MODULE_DECLSPEC extern opal_list_t mca_sbgp_base_components_in_use; -OMPI_MODULE_DECLSPEC extern int mca_sbgp_base_components_in_use_inited; -OMPI_DECLSPEC extern char *ompi_sbgp_subgroups_string; - -BEGIN_C_DECLS - -/* - * MCA Framework - */ -OMPI_DECLSPEC extern mca_base_framework_t ompi_sbgp_base_framework; - -/* select a component */ -OMPI_DECLSPEC int mca_sbgp_base_init(bool, bool); - -/* subgrouping component and key value */ -struct sbgp_base_component_keyval_t { - mca_base_component_list_item_t component; - char *key_value; -}; -typedef struct sbgp_base_component_keyval_t sbgp_base_component_keyval_t; -OBJ_CLASS_DECLARATION(sbgp_base_component_keyval_t); - -END_C_DECLS - -#endif /* MCA_SBGP_BASE_H */ diff --git a/ompi/mca/sbgp/base/owner.txt b/ompi/mca/sbgp/base/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/sbgp/base/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/sbgp/base/sbgp_base_close.c b/ompi/mca/sbgp/base/sbgp_base_close.c deleted file mode 100644 index cc7dd26c4e..0000000000 --- a/ompi/mca/sbgp/base/sbgp_base_close.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "ompi_config.h" - -#include - -#include "ompi/constants.h" -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/mca/sbgp/base/base.h" -#include "ompi/include/ompi/constants.h" - - -int mca_sbgp_base_close(void) -{ - - /* Close all remaining available modules */ - - mca_base_components_close(ompi_sbgp_base_framework.framework_output, - &mca_sbgp_base_components_opened, NULL); - - /* Close the framework output */ - opal_output_close (ompi_sbgp_base_framework.framework_output); - ompi_sbgp_base_framework.framework_output = -1; - - /* All done */ - - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/sbgp/base/sbgp_base_frame.c b/ompi/mca/sbgp/base/sbgp_base_frame.c deleted file mode 100644 index a0091e3532..0000000000 --- a/ompi/mca/sbgp/base/sbgp_base_frame.c +++ /dev/null @@ -1,205 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2014 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include <stdio.h> - -#ifdef HAVE_UNISTD_H -#include <unistd.h> -#endif /* HAVE_UNISTD_H */ -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" - -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/mca/sbgp/base/base.h" -#include "ompi/include/ompi/constants.h" -#include "opal/util/argv.h" - -/* - * The following file was created by configure. It contains extern - * statements and the definition of an array of pointers to each - * component's public mca_base_component_t struct. - */ - -#include "ompi/mca/sbgp/base/static-components.h" - -/* -** * Global variables -** */ -opal_list_t mca_sbgp_base_components_in_use = {{0}}; -int mca_sbgp_base_components_in_use_inited=0; -OMPI_DECLSPEC char *ompi_sbgp_subgroups_string = NULL; - -static void mca_sbgp_base_destruct (mca_sbgp_base_module_t *module) -{ - /* free the list of ranks */ - if(module->group_list ) { - free(module->group_list); - module->group_list=NULL; - } -} - -OBJ_CLASS_INSTANCE(mca_sbgp_base_module_t, - opal_object_t, - NULL, - mca_sbgp_base_destruct); - -OBJ_CLASS_INSTANCE(sbgp_base_component_keyval_t, - mca_base_component_list_item_t, - NULL, - NULL); - -/* get list of subgrouping components to use */ -static int ompi_sbgp_set_components_to_use(opal_list_t *sbgp_components_avail, - opal_list_t *sbgp_components_in_use) -{ - /* local variables */ - const mca_base_component_t *component; - mca_base_component_list_item_t *cli; - sbgp_base_component_keyval_t *clj; - char **subgroups_requested = NULL, **sbgp_string = NULL; - char *sbgp_component, *sbgp_key; - const char *component_name; - int i, sbgp_size = 0, - sbgp_string_size = 0, - rc = OMPI_SUCCESS; - - /* split the list of requested subgroups */ - subgroups_requested = opal_argv_split(ompi_sbgp_subgroups_string, ','); - if(NULL == subgroups_requested) { - return OMPI_ERROR; - } - sbgp_size = opal_argv_count (subgroups_requested); - - /* Initialize list */ - OBJ_CONSTRUCT(sbgp_components_in_use, opal_list_t); - - /* loop over list of components requested */ - for (i = 0; i < sbgp_size; i++) { - /* get key-value */ - sbgp_string = opal_argv_split(subgroups_requested[i], ':'); - if (NULL == sbgp_string) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - break; - } - - sbgp_string_size = opal_argv_count (sbgp_string); - if (sbgp_string_size < 1 || sbgp_string_size > 2) { - opal_output(ompi_sbgp_base_framework.framework_output, - "Requested SBGP configuration is illegal %s", - subgroups_requested[i]); - opal_argv_free (sbgp_string); - rc = OMPI_ERROR; - break; - } - - /* it is guaranteed that sbgp_string[1] will either be NULL (count = 1) or a string */ - sbgp_key = sbgp_string[1]; - sbgp_component = sbgp_string[0]; - - /* loop over discovered components */ - OPAL_LIST_FOREACH(cli, sbgp_components_avail, mca_base_component_list_item_t) { - component = cli->cli_component; - component_name = component->mca_component_name; - - /* key_value[0] has the component name, and key_value[1], if - ** it is not NULL, has the key_value associated with this - ** instance of the component - */ - - if (0 == strcmp (component_name, sbgp_component)) { - /* found selected component */ - clj = OBJ_NEW(sbgp_base_component_keyval_t); - if (NULL == clj) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - opal_argv_free (sbgp_string); - goto exit_ERROR; - } - /* fprintf(stderr,"sbgp selecting %s %s\n", sbgp_component, component_name); */ - - clj->component.cli_component = component; - if (NULL != sbgp_key) { - clj->key_value = strdup(sbgp_key); - }
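/* Illustration with a made-up key: if ompi_sbgp_subgroups_string were "basesmsocket,ibnet:mlx4_0", the outer split on ',' yields two entries; splitting "ibnet:mlx4_0" on ':' gives sbgp_component = "ibnet" with sbgp_key = "mlx4_0" (the strdup() branch above), while plain "basesmsocket" has no key and takes the else branch below with key_value = NULL. */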
else { - clj->key_value = NULL; - } - opal_list_append(sbgp_components_in_use, (opal_list_item_t *)clj); - break; - } - } - - opal_argv_free (sbgp_string); - } - - /* Note: Need to add error checking to make sure all requested functions - ** were found */ - - /* - ** release resources - ** */ - exit_ERROR: - opal_argv_free (subgroups_requested); - - return rc; -} - -static int mca_sbgp_base_register(mca_base_register_flag_t flags) -{ - /* get list of sub-grouping functions to use */ - ompi_sbgp_subgroups_string = "basesmsocket,basesmuma,ibnet,p2p"; - (void) mca_base_var_register("ompi", "sbgp", "base", "subgroups_string", - "Default set of subgroup operations to apply ", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_LOCAL, - &ompi_sbgp_subgroups_string); - - return OMPI_SUCCESS; -} - -static int mca_sbgp_base_close(void) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first (&mca_sbgp_base_components_in_use))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&mca_sbgp_base_components_in_use); - - return mca_base_framework_components_close(&ompi_sbgp_base_framework, NULL); -} - -/** - * Function for finding and opening either all MCA components, or the one - * that was specifically requested via a MCA parameter. - */ -static int mca_sbgp_base_open(mca_base_open_flag_t flags) -{ - int ret; - - if (OMPI_SUCCESS != (ret = mca_base_framework_components_open(&ompi_sbgp_base_framework, flags))) { - return ret; - } - - ret = ompi_sbgp_set_components_to_use(&ompi_sbgp_base_framework.framework_components, - &mca_sbgp_base_components_in_use); - - return ret; -} - -MCA_BASE_FRAMEWORK_DECLARE(ompi, sbgp, "OMPI Subgroup Subsystem", mca_sbgp_base_register, - mca_sbgp_base_open, mca_sbgp_base_close, - mca_sbgp_base_static_components, 0); - diff --git a/ompi/mca/sbgp/base/sbgp_base_init.c b/ompi/mca/sbgp/base/sbgp_base_init.c deleted file mode 100644 index d1f66da9b5..0000000000 --- a/ompi/mca/sbgp/base/sbgp_base_init.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/mca/mca.h" -#include "opal/mca/base/base.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/mca/sbgp/base/base.h" -#include "ompi/include/ompi/constants.h" - -int mca_sbgp_base_init(bool enable_progress_threads, bool enable_mpi_threads) -{ - mca_sbgp_base_component *sbgp_component = NULL; - mca_base_component_list_item_t *cli; - opal_list_item_t *item; - int ret; - - /* loop over component initialization functions */ - for (item = opal_list_get_first((opal_list_t *) &mca_sbgp_base_components_in_use); - opal_list_get_end((opal_list_t *) &mca_sbgp_base_components_in_use) != item; - item = opal_list_get_next(item)) { - - cli = (mca_base_component_list_item_t *) item; - sbgp_component = (mca_sbgp_base_component *)cli->cli_component; - - ret = sbgp_component->sbgp_init_query(true, true); - if( OMPI_SUCCESS != ret) { - return ret; - } - } - - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/sbgp/basesmsocket/Makefile.am b/ompi/mca/sbgp/basesmsocket/Makefile.am deleted file mode 100644 index e255546573..0000000000 --- a/ompi/mca/sbgp/basesmsocket/Makefile.am +++ /dev/null @@ -1,41 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
-# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - sbgp_basesmsocket.h \ - sbgp_basesmsocket_component.c \ - sbgp_basesmsocket_module.c - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_sbgp_basesmsocket_DSO -component_install += mca_sbgp_basesmsocket.la -else -component_noinst += libmca_sbgp_basesmsocket.la -endif - -# See ompi/mca/btl/sm/Makefile.am for an explanation of -# libmca_common_sm.la. - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sbgp_basesmsocket_la_SOURCES = $(sources) -mca_sbgp_basesmsocket_la_LDFLAGS = -module -avoid-version -mca_sbgp_basesmsocket_la_LIBADD = - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sbgp_basesmsocket_la_SOURCES =$(sources) -libmca_sbgp_basesmsocket_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/sbgp/basesmsocket/owner.txt b/ompi/mca/sbgp/basesmsocket/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/sbgp/basesmsocket/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket.h b/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket.h deleted file mode 100644 index 739f913335..0000000000 --- a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket.h +++ /dev/null @@ -1,81 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BCOL_basesmsocket_EXPORT_H -#define MCA_BCOL_basesmsocket_EXPORT_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "ompi/mca/sbgp/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" -#include "ompi/proc/proc.h" -#include "opal/util/output.h" - -BEGIN_C_DECLS - -#ifdef HAVE_SCHED_YIELD -# include -# define SPIN sched_yield() -#else /* no switch available */ -# define SPIN -#endif - -#define BASESMSOCKET_VERBOSE(level, ...) \ - do { \ - OPAL_OUTPUT_VERBOSE((ompi_sbgp_base_framework.framework_output, level, \ - __VA_ARGS__)); \ - } while(0); - -/** - * Structure to hold the basic shared memory coll component. First it holds the - * base coll component, and then holds a bunch of - * sm-coll-component-specific stuff (e.g., current MCA param - * values). 
- */ -struct mca_sbgp_basesmsocket_component_t { - /** Base coll component */ - mca_sbgp_base_component_2_0_0_t super; -}; - -/** - * Convenience typedef - */ -typedef struct mca_sbgp_basesmsocket_component_t - mca_sbgp_basesmsocket_component_t; - - -/* -** Base sub-group module -**/ - -struct mca_sbgp_basesmsocket_module_t { - /** Collective modules all inherit from opal_object */ - mca_sbgp_base_module_t super; - -}; -typedef struct mca_sbgp_basesmsocket_module_t mca_sbgp_basesmsocket_module_t; -OBJ_CLASS_DECLARATION(mca_sbgp_basesmsocket_module_t); - -/** -* Global component instance -*/ -OMPI_MODULE_DECLSPEC extern mca_sbgp_basesmsocket_component_t mca_sbgp_basesmsocket_component; - - -END_C_DECLS - -#endif /* MCA_BCOL_basesmsocket_EXPORT_H */ diff --git a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c b/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c deleted file mode 100644 index d2cf31d416..0000000000 --- a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c +++ /dev/null @@ -1,305 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" - -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif -#ifdef HAVE_FCNTL_H -#include -#endif - -#include "opal/mca/hwloc/hwloc.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/dss/dss_internal.h" -#include "opal/class/opal_object.h" - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "sbgp_basesmsocket.h" - -#include "ompi/patterns/comm/coll_ops.h" - - -/* - * Public string showing the coll ompi_sm V2 component version number - */ -const char *mca_sbgp_basesmsocket_component_version_string = - "Open MPI sbgp - basesmsocket collective MCA component version " OMPI_VERSION; - - -/* - * Local functions - */ - -static int basesmsocket_register(void); -static int basesmsocket_open(void); -static int basesmsocket_close(void); -static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, - int n_procs_in, - struct ompi_communicator_t *comm, - char *key, - void *output_data - ); -static int mca_sbgp_basesmsocket_init_query(bool enable_progress_threads, - bool enable_mpi_threads); -/*----end local functions ----*/ - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_sbgp_basesmsocket_component_t mca_sbgp_basesmsocket_component = { - - /* First, fill in the super */ - - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .sbgp_version = { - MCA_SBGP_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "basesmsocket", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - - .mca_open_component = basesmsocket_open, - .mca_close_component = basesmsocket_close, - .mca_register_component_params = basesmsocket_register, - }, - - .sbgp_init_query = mca_sbgp_basesmsocket_init_query, - .select_procs = 
mca_sbgp_basesmsocket_select_procs, - .priority = 0, - } -}; - -/* - * Register the component - */ -static int basesmsocket_register(void) -{ - mca_sbgp_basesmsocket_component_t *cs = &mca_sbgp_basesmsocket_component; - - cs->super.priority = 90; - (void) mca_base_component_var_register(&mca_sbgp_basesmsocket_component.super.sbgp_version, - "priority", "Priority for the sbgp basesmsocket component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->super.priority); - - return OMPI_SUCCESS; -} - -/* - * Open the component - */ -static int basesmsocket_open(void) -{ - return OMPI_SUCCESS; -} - -/* - * Close the component - */ -static int basesmsocket_close(void) -{ - return OMPI_SUCCESS; -} - -/* query to see if the component is available for use, and can - * satisfy the thread and progress requirements - */ -int mca_sbgp_basesmsocket_init_query(bool enable_progress_threads, - bool enable_mpi_threads) -{ - /* at this stage there is no reason to disqualify this component */ - - /* done */ - return OMPI_SUCCESS; -} - -#if 0 -/* NTH: this is no longer used but may be used if we can determine the binding policy*/ -static int mca_sbgp_map_to_logical_socket_id(int *socket) -{ - int ret = OMPI_SUCCESS; - hwloc_obj_t obj; - hwloc_obj_t first_pu_object; - hwloc_bitmap_t good; - int pu_os_index = -1, my_logical_socket_id = -1; - int this_pus_logical_socket_id = -1; - - *socket = my_logical_socket_id; - - /* bozo check */ - if (NULL == opal_hwloc_topology) { - return OPAL_ERR_NOT_INITIALIZED; - } - - good = hwloc_bitmap_alloc(); - if (NULL == good) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* get this process' CPU binding */ - if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){ - /* report some error */ - BASESMSOCKET_VERBOSE(10, "The global variable opal_hwloc_topology appears not to have been initialized\n"); - hwloc_bitmap_free(good); - return OMPI_ERROR; - } - - /* find the first logical PU object in the hwloc tree */ - first_pu_object = hwloc_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0); - - - /* get the next bit in the bitmap (note: if pu_os_index == -1, then the - * first bit is returned - */ - /* traverse the hwloc tree */ - while( -1 != (pu_os_index = hwloc_bitmap_next(good, pu_os_index) ) ) { - /* Traverse all PUs in the machine in logical order, in the simple case - * there should only be a single PU that this process is bound to, right? - * - */ - for( obj = first_pu_object; obj != NULL; obj = obj->next_cousin ) {/* next_cousin walks objects of the same type in logical order */ - /* is this PU the same as the bit I pulled off the mask? */ - if( obj->os_index == (unsigned int) pu_os_index) { - /* Then I found it, break out of for loop */ - break; - } - } - - if( NULL != obj) { - /* if we found the PU, then go upward in the tree - * looking for the enclosing socket - */ - while( (NULL != obj) && ( HWLOC_OBJ_SOCKET != obj->type) ){ - obj = obj->parent; - } - - if( NULL == obj ) { - /* then we couldn't find an enclosing socket, report this */ - } else { - /* We found the enclosing socket */ - if( -1 == my_logical_socket_id ){ - /* this is the first PU that I'm bound to */ - this_pus_logical_socket_id = obj->logical_index; - my_logical_socket_id = this_pus_logical_socket_id; - } else { - /* this is not the first PU that I'm bound to. - * Seems I'm bound to more than a single PU. Question - * is, am I bound to the same socket??
- */ - /* in order to get rid of the compiler warning, I had to cast - * "this_pus_logical_socket_id", at a glance this seems ok, - * but if subgrouping problems arise, maybe look here. I shall - * tag this line with the "mark of the beast" for grepability - * 666 - */ - if( (unsigned int) this_pus_logical_socket_id != obj->logical_index ){ - /* 666 */ - /* Then we're bound to more than one socket...fail */ - this_pus_logical_socket_id = -1; - my_logical_socket_id = -1; - break; - } - } - } - - } - - /* end while */ - } - *socket = my_logical_socket_id; - hwloc_bitmap_free(good); - - return ret; - -} -#endif - -/* This routine is used to find the list of procs that run on the -** same host as the calling process. -*/ - -static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, - int n_procs_in, - struct ompi_communicator_t *comm, - char *key, - void *output_data - ) -{ - /* local variables */ - mca_sbgp_basesmsocket_module_t *module; - int proc, cnt, n_local_peers; - - /* initialize data */ - for (proc = 0, n_local_peers = 0 ; proc < n_procs_in ; ++proc) { - if (OPAL_PROC_ON_LOCAL_SOCKET(procs[proc]->super.proc_flags)) { - n_local_peers++; - } - } - - /* we need to return a module even if there is only one local peer. this - * covers the case where there may be a basesmsocket module on one rank - * but not another */ - if (0 == n_local_peers) { - return NULL; - } - - /* create a new module */ - module = OBJ_NEW(mca_sbgp_basesmsocket_module_t); - if (!module) { - return NULL; - } - - module->super.group_size = n_local_peers; - module->super.group_comm = comm; - module->super.group_list = NULL; - module->super.group_net = OMPI_SBGP_SOCKET; - - /* allocate memory and fill in the group_list */ - module->super.group_list = (int *) calloc (n_local_peers, sizeof(int)); - if (NULL == module->super.group_list) { - OBJ_RELEASE(module); - return NULL; - } - - for (proc = 0, cnt = 0 ; proc < n_procs_in ; ++proc) { - if (OPAL_PROC_ON_LOCAL_SOCKET(procs[proc]->super.proc_flags)) { - module->super.group_list[cnt++] = proc; - } - } - - /* Return the module */ - return (mca_sbgp_base_module_t *) module; -} diff --git a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_module.c b/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_module.c deleted file mode 100644 index 7f075eecdd..0000000000 --- a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_module.c +++ /dev/null @@ -1,35 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket.h" - -OBJ_CLASS_INSTANCE(mca_sbgp_basesmsocket_module_t, - mca_sbgp_base_module_t, NULL, NULL); diff --git a/ompi/mca/sbgp/basesmuma/Makefile.am b/ompi/mca/sbgp/basesmuma/Makefile.am deleted file mode 100644 index 03470b69ae..0000000000 --- a/ompi/mca/sbgp/basesmuma/Makefile.am +++ /dev/null @@ -1,41 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. 
All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - sbgp_basesmuma.h \ - sbgp_basesmuma_component.c \ - sbgp_basesmuma_module.c - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_sbgp_basesmuma_DSO -component_install += mca_sbgp_basesmuma.la -else -component_noinst += libmca_sbgp_basesmuma.la -endif - -# See ompi/mca/btl/sm/Makefile.am for an explanation of -# libmca_common_sm.la. - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sbgp_basesmuma_la_SOURCES = $(sources) -mca_sbgp_basesmuma_la_LDFLAGS = -module -avoid-version -mca_sbgp_basesmuma_la_LIBADD = - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sbgp_basesmuma_la_SOURCES =$(sources) -libmca_sbgp_basesmuma_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/sbgp/basesmuma/owner.txt b/ompi/mca/sbgp/basesmuma/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/sbgp/basesmuma/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma.h b/ompi/mca/sbgp/basesmuma/sbgp_basesmuma.h deleted file mode 100644 index efe501e046..0000000000 --- a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#ifndef MCA_BCOL_basesmuma_EXPORT_H -#define MCA_BCOL_basesmuma_EXPORT_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" -#include "ompi/proc/proc.h" - -BEGIN_C_DECLS - -#ifdef HAVE_SCHED_YIELD -# include -# define SPIN sched_yield() -#else /* no switch available */ -# define SPIN -#endif - - - /** - * Structure to hold the basic shared memory coll component. First it holds the - * base coll component, and then holds a bunch of - * sm-coll-component-specific stuff (e.g., current MCA param - * values). 
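 *
 * As a rough illustration (a sketch, not part of the original header):
 * other code reaches this component through its single global instance,
 * e.g.
 *
 *   mca_sbgp_basesmuma_component_t *cs = &mca_sbgp_basesmuma_component;
 *   cs->super.priority = 90;  // selection priority, set in basesmuma_register()
 *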
- */ - struct mca_sbgp_basesmuma_component_t { - /** Base coll component */ - mca_sbgp_base_component_2_0_0_t super; - - }; - - /** - * Convenience typedef - */ - typedef struct mca_sbgp_basesmuma_component_t - mca_sbgp_basesmuma_component_t; - - - /* - ** Base sub-group module - **/ - - struct mca_sbgp_basesmuma_module_t { - /** Collective modules all inherit from opal_object */ - mca_sbgp_base_module_t super; - - }; - typedef struct mca_sbgp_basesmuma_module_t mca_sbgp_basesmuma_module_t; - OBJ_CLASS_DECLARATION(mca_sbgp_basesmuma_module_t); - - /** - * Global component instance - */ - OMPI_MODULE_DECLSPEC extern mca_sbgp_basesmuma_component_t mca_sbgp_basesmuma_component; - - -END_C_DECLS - -#endif /* MCA_BCOL_basesmuma_EXPORT_H */ diff --git a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c b/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c deleted file mode 100644 index 4c6e232860..0000000000 --- a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c +++ /dev/null @@ -1,208 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "sbgp_basesmuma.h" - - -/* - * Public string showing the coll ompi_sm V2 component version number - */ -const char *mca_sbgp_basesmuma_component_version_string = - "Open MPI sbgp - basesmuma collective MCA component version " OMPI_VERSION; - - -/* - * Local functions - */ - -static int basesmuma_register(void); -static int basesmuma_open(void); -static int basesmuma_close(void); -static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_t ** procs, - int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data); - -static int mca_sbgp_basesmuma_init_query(bool enable_progress_threads, - bool enable_mpi_threads); - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_sbgp_basesmuma_component_t mca_sbgp_basesmuma_component = { - - /* First, fill in the super */ - - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .sbgp_version = { - MCA_SBGP_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "basesmuma", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open, close, and register functions */ - - .mca_open_component = basesmuma_open, - .mca_close_component = basesmuma_close, - .mca_register_component_params = basesmuma_register, - }, - .sbgp_init_query = mca_sbgp_basesmuma_init_query, - .select_procs = mca_sbgp_basesmuma_select_procs, - .priority = 0, - } -}; - -/* - * Register the component - */ -static int basesmuma_register(void) -{ - mca_sbgp_basesmuma_component_t *cs = &mca_sbgp_basesmuma_component; - - /* set component priority */ - cs->super.priority = 90; - (void) mca_base_component_var_register(&cs->super.sbgp_version, - "priority", "Priority of the sbgp basesmuma", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - 
OPAL_INFO_LVL_9,
- MCA_BASE_VAR_SCOPE_READONLY,
- &cs->super.priority);
- return OMPI_SUCCESS;
-}
-
-/*
- * Open the component
- */
-static int basesmuma_open(void)
-{
- return OMPI_SUCCESS;
-}
-
-
-/*
- * Close the component
- */
-static int basesmuma_close(void)
-{
- return OMPI_SUCCESS;
-}
-
-/* query to see if the component is available for use, and can
- * satisfy the thread and progress requirements
- */
-int mca_sbgp_basesmuma_init_query(bool enable_progress_threads,
- bool enable_mpi_threads)
-{
- /* at this stage there is no reason to disqualify this component */
-
- /* done */
- return OMPI_SUCCESS;
-}
-
-/* This routine is used to find the list of procs that run on the
-** same host as the calling process.
-*/
-static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_t ** procs,
- int n_procs_in,
- struct ompi_communicator_t *comm,
- char *key,
- void *output_data
- )
-{
- /* local variables */
- int cnt, proc, local, last_local_proc;
- mca_sbgp_basesmuma_module_t *module;
-
- module = OBJ_NEW(mca_sbgp_basesmuma_module_t);
- if (!module) {
- return NULL;
- }
- module->super.group_size = 0;
- module->super.group_comm = comm;
- module->super.group_list = NULL;
- module->super.group_net = OMPI_SBGP_MUMA;
- for (proc = 0, cnt = 0, last_local_proc = 0 ; proc < n_procs_in ; ++proc) {
- local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags);
- if (local) {
- last_local_proc = proc;
- cnt++;
- }
- }
- /* if no other local procs were found, skip to the end */
-
- if( 2 > cnt ) {
- /* There's always at least one - namely myself */
- assert(1 == cnt);
- module->super.group_size = 1;
- module->super.group_list = (int *) malloc(sizeof(int));
- if (NULL == module->super.group_list) {
- goto Error;
- }
- module->super.group_list[0] = last_local_proc;
- /* let ml handle this case */
- goto OneLocalPeer;
- }
-
- /* generate list of local ranks */
- module->super.group_size = cnt;
- if( cnt > 0 ) {
- module->super.group_list = (int *) malloc(sizeof(int) * cnt);
- if (NULL == module->super.group_list) {
- goto Error;
- }
- }
-
- for (proc = 0, cnt = 0 ; proc < n_procs_in ; ++proc) {
- local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags);
- if( local ) {
- module->super.group_list[cnt++] = proc;
- }
- }
-OneLocalPeer:
- /* successful completion */
- return (mca_sbgp_base_module_t *) module;
-
- /* return with error */
-
-Error:
-
- /* clean up */
- if( NULL != module->super.group_list ) {
- free(module->super.group_list);
- module->super.group_list = NULL;
- }
-
- OBJ_RELEASE(module);
-
- return NULL;
-} diff --git a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_module.c b/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_module.c deleted file mode 100644 index 79028c4e25..0000000000 --- a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_module.c +++ /dev/null @@ -1,48 +0,0 @@ -/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_MMAN_H -#include -#endif -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/sbgp/basesmuma/sbgp_basesmuma.h" - -/* - * Local functions - */ -static void -mca_sbgp_basesmuma_module_construct(mca_sbgp_basesmuma_module_t *module) -{ -} - -static void -mca_sbgp_basesmuma_module_destruct(mca_sbgp_basesmuma_module_t *module) -{ - /* done */ -} - -OBJ_CLASS_INSTANCE(mca_sbgp_basesmuma_module_t, - mca_sbgp_base_module_t, - mca_sbgp_basesmuma_module_construct, - mca_sbgp_basesmuma_module_destruct); diff --git a/ompi/mca/sbgp/ibnet/.opal_ignore b/ompi/mca/sbgp/ibnet/.opal_ignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ompi/mca/sbgp/ibnet/Makefile.am b/ompi/mca/sbgp/ibnet/Makefile.am deleted file mode 100644 index 28c3161eee..0000000000 --- a/ompi/mca/sbgp/ibnet/Makefile.am +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(sbgp_ibnet_CPPFLAGS) $(btl_openib_CPPFLAGS) - -sources = \ - sbgp_ibnet.h \ - sbgp_ibnet_mca.h \ - sbgp_ibnet_mca.c \ - sbgp_ibnet_component.c \ - sbgp_ibnet_module.c - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -component_noinst = -component_install = -if MCA_BUILD_ompi_sbgp_ibnet_DSO -component_install += mca_sbgp_ibnet.la -else -component_noinst += libmca_sbgp_ibnet.la -endif - -# See ompi/mca/btl/sm/Makefile.am for an explanation of -# libmca_common_sm.la. - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sbgp_ibnet_la_SOURCES = $(sources) -mca_sbgp_ibnet_la_LDFLAGS = -module -avoid-version $(sbgp_ibnet_LDFLAGS) $(btl_openib_LDFLAGS) -mca_sbgp_ibnet_la_LIBADD = $(sbgp_ibnet_LIBS) $(btl_openib_LIBS) \ - $(OMPI_TOP_BUILDDIR)/ompi/mca/common/verbs/libmca_common_verbs.la \ - $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm/libmca_common_ofacm.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sbgp_ibnet_la_SOURCES =$(sources) -libmca_sbgp_ibnet_la_LDFLAGS = -module -avoid-version - -$(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm/libmca_common_ofacm.la: foo.c - cd $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm && $(MAKE) - -$(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofautils/libmca_common_ofautils.la: foo.c - cd $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofautils && $(MAKE) - -foo.c: diff --git a/ompi/mca/sbgp/ibnet/configure.m4 b/ompi/mca/sbgp/ibnet/configure.m4 deleted file mode 100644 index 6fdb24fa40..0000000000 --- a/ompi/mca/sbgp/ibnet/configure.m4 +++ /dev/null @@ -1,40 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_ompi_sbgp_ibnet_CONFIG([should_build]) -# ------------------------------------------ -# AC_DEFUN([MCA_ompi_sbgp_ibnet_POST_CONFIG], [ -# ]) - - -# MCA_ompi_sbgp_ibnet_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_ompi_sbgp_ibnet_CONFIG],[ - AC_CONFIG_FILES([ompi/mca/sbgp/ibnet/Makefile]) - sbgp_ofa_happy="no" - sbgp_mlnx_ofed_happy="no" - - OPAL_CHECK_OPENFABRICS([sbgp_ibnet], [sbgp_ofa_happy="yes"]) - OPAL_CHECK_MLNX_OPENFABRICS([sbgp_ibnet], [sbgp_mlnx_ofed_happy="yes"]) - - AS_IF([test "$sbgp_ofa_happy" = "yes" && test "$sbgp_mlnx_ofed_happy" = "yes"], - [$1], - [$2]) - - # substitute in the things needed to build iboffload - AC_SUBST([sbgp_ibnet_CFLAGS]) - AC_SUBST([sbgp_ibnet_CPPFLAGS]) - AC_SUBST([sbgp_ibnet_LDFLAGS]) - AC_SUBST([sbgp_ibnet_LIBS]) -])dnl diff --git a/ompi/mca/sbgp/ibnet/owner.txt b/ompi/mca/sbgp/ibnet/owner.txt deleted file mode 100644 index 1c86df367b..0000000000 --- a/ompi/mca/sbgp/ibnet/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: ORNL -status: unmaintained diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet.h b/ompi/mca/sbgp/ibnet/sbgp_ibnet.h deleted file mode 100644 index f29ffc33db..0000000000 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#ifndef MCA_BCOL_ibnet_EXPORT_H -#define MCA_BCOL_ibnet_EXPORT_H - -#include "ompi_config.h" - -#include "mpi.h" -#include "infiniband/verbs.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/sbgp/sbgp.h" -#include "opal/mca/mpool/mpool.h" -#include "ompi/request/request.h" -#include "ompi/proc/proc.h" -#include "ompi/mca/common/ofacm/connect.h" - -BEGIN_C_DECLS - -#ifdef HAVE_SCHED_YIELD -# include -# define SPIN sched_yield() -#else /* no switch available */ -# define SPIN -#endif - -typedef enum { - OFFLOAD_CONNECTX_B0, - OFFLOAD_DISABLE -} coll_offload_support; - -/** - * Structure to hold the basic shared memory coll component. First it holds the - * base coll component, and then holds a bunch of - * sm-coll-component-specific stuff (e.g., current MCA param - * values). 
- */
-struct mca_sbgp_ibnet_component_t {
- /** Base coll component */
- mca_sbgp_base_component_2_0_0_t super;
-
- /** Enable/disable verbose mode */
- int verbose;
-
- /* Maximum allowed number of subgroups */
- int max_sbgps;
- /* Enable/disable the default subnet id warning */
- bool warn_default_gid_prefix;
- bool warn_nonexistent_if;
- /* IB MTU requested by user */
- int mtu;
- /** IB partition definition */
- int pkey_val;
- /* HCA data */
- char *if_include;
- char **if_include_list;
- char *if_exclude;
- char **if_exclude_list;
- /** Dummy argv-style list; a copy of names from the
- if_[in|ex]clude list that we use for error checking (to ensure
- that they all exist) */
- char **if_list;
- /** List of iboffload devices that have at least one active port */
- opal_list_t devices;
- int curr_max_group_id;
- uint32_t total_active_ports;
-};
-
-/**
- * Convenience typedef
- */
-typedef struct mca_sbgp_ibnet_component_t
-mca_sbgp_ibnet_component_t;
-
-/* IB port object */
-struct mca_sbgp_ibnet_port_t {
- uint16_t id; /** Port number */
- int stat; /** Port status - Active, Init, etc. */
- enum ibv_mtu mtu; /** MTU on this port */
- coll_offload_support coll_offload; /** Collectives offload mode */
- uint64_t subnet_id; /** Subnet id for the port */
- /* uint8_t src_path_bits; */
- uint16_t lid;
- uint16_t lmc;
- /** Array of the peer's CPCs available on this port */
- uint32_t num_cpcs;
- bool used;
- ompi_common_ofacm_base_module_data_t *pm_cpc_data;
- ompi_common_ofacm_base_module_t *local_cpc; /* selected cpc */
- ompi_common_ofacm_base_module_data_t *remote_cpc_data; /* data for remote cpc */
-};
-
-typedef struct mca_sbgp_ibnet_port_t mca_sbgp_ibnet_port_t;
-
-typedef enum {
- MCA_SBGP_IBNET_NONE = 0,
- MCA_SBGP_IBNET_NODE_LEADER = 1<<0,
- MCA_SBGP_IBNET_SOCKET_LEADER = 1<<1,
- MCA_SBGP_IBNET_SWITCH_LEADER = 1<<2
-} mca_sbgp_ibnet_duty_t;
-
-typedef enum {
- MCA_SBGP_IBNET_ALL_NET,
- MCA_SBGP_IBNET_NODE_NET,
- MCA_SBGP_IBNET_NONE_NET
-} mca_sbgp_ibnet_mode_t;
-
-struct mca_sbgp_ibnet_proc_t {
- opal_list_item_t super;
- ompi_proc_t *ompi_proc; /* Ompi proc pointer */
- int ompi_proc_index; /* Index of the proc in array */
- uint32_t rank; /* vpid, remote proc rank */
- uint32_t num_ports; /* number of remote ports */
- int *use_port; /* the size of this array equals the number of cgroups that point to this proc.
- Each cgroup has its own index "I". The array keeps the remote port number that we need to use
- for cgroup "I" - use_port[I]. We need it for the iboffload module */
- mca_sbgp_ibnet_port_t *remote_ports_info; /* the array keeps remote port information */
- mca_sbgp_ibnet_duty_t duty; /* Socket leader, Node leader, switch leader, etc.
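   Roles are bit flags from mca_sbgp_ibnet_duty_t above, so one process
   may hold several at once. A hypothetical sketch (not original code;
   do_node_leader_work() is a made-up helper):
     proc->duty = MCA_SBGP_IBNET_NODE_LEADER | MCA_SBGP_IBNET_SOCKET_LEADER;
     if (proc->duty & MCA_SBGP_IBNET_NODE_LEADER) do_node_leader_work();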
*/
-};
-
-typedef struct mca_sbgp_ibnet_proc_t mca_sbgp_ibnet_proc_t;
-OBJ_CLASS_DECLARATION(mca_sbgp_ibnet_proc_t);
-
-/* Device object */
-struct mca_sbgp_ibnet_device_t {
- opal_list_item_t super;
- struct ibv_device* ib_dev; /* pointer to device, from device list */
- int device_index; /* device index in device list */
- struct ibv_device_attr ib_dev_attr; /* attributes of the device */
- int num_act_ports;
- int num_allowed_ports;
- struct mca_sbgp_ibnet_port_t *ports;
- /* CPC stuff */
- ompi_common_ofacm_base_module_t **cpcs; /* Array of CPCs */
- uint8_t num_cpcs; /* Number of elements in cpc array */
-};
-
-typedef struct mca_sbgp_ibnet_device_t mca_sbgp_ibnet_device_t;
-OBJ_CLASS_DECLARATION(mca_sbgp_ibnet_device_t);
-
-struct mca_sbgp_ibnet_connection_group_info_t {
- int device_index; /* device index in device list */
- uint32_t port; /* port number */
- /* Used to find the number of the port used to communicate with a remote proc;
- it is the index into the use_port array in the mca_sbgp_ibnet_proc_t structure */
- uint32_t index;
- /* array of procs connected with this group */
- uint32_t num_procs;
- opal_pointer_array_t *ibnet_procs;
-};
-typedef struct mca_sbgp_ibnet_connection_group_info_t
- mca_sbgp_ibnet_connection_group_info_t;
-
-/*
- ** Base sub-group module
- **/
-struct mca_sbgp_ibnet_module_t {
- /** Collective modules all inherit from opal_object */
- mca_sbgp_base_module_t super;
- int group_id;
- /* opal_pointer_array_t *ibnet_procs; */
- /* number of connection groups */
- int num_cgroups;
- /*
- * Array of connection groups. The same procs appear in each of these groups,
- * but the groups were created over different ports (and possibly different devices).
- */
- mca_sbgp_ibnet_connection_group_info_t *cgroups;
- mca_sbgp_ibnet_mode_t mode; /* working mode of the module; ALL by default */
-};
-typedef struct mca_sbgp_ibnet_module_t mca_sbgp_ibnet_module_t;
-OBJ_CLASS_DECLARATION(mca_sbgp_ibnet_module_t);
-
-/* Error and verbose prints */
-
-static inline int mca_sbgp_ibnet_err(const char* fmt, ...)
-{
- va_list list;
- int ret;
-
- va_start(list, fmt);
- ret = vfprintf(stderr, fmt, list);
- va_end(list);
- return ret;
-}
-
-#define IBNET_ERROR(args) \
- do { \
- mca_sbgp_ibnet_err("[%s]%s[%s:%d:%s] IBNET ", \
- ompi_process_info.nodename, \
- OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
- __FILE__, __LINE__, __func__); \
- mca_sbgp_ibnet_err args; \
- mca_sbgp_ibnet_err("\n"); \
- } while(0);
-
-#if OPAL_ENABLE_DEBUG
-#define IBNET_VERBOSE(level, args) \
- do { \
- if(mca_sbgp_ibnet_component.verbose >= level) { \
- mca_sbgp_ibnet_err("[%s]%s[%s:%d:%s] IBNET ", \
- ompi_process_info.nodename, \
- OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
- __FILE__, __LINE__, __func__); \
- mca_sbgp_ibnet_err args; \
- mca_sbgp_ibnet_err("\n"); \
- } \
- } while(0);
-#else
-#define IBNET_VERBOSE(level, args)
-#endif
-
-#define MCA_SBGP_IBNET_PKEY_MASK 0x7fff
-
-/* Error and verbose prints - end */
-
-/* This routine is used to find the list of procs that run on the
- ** same host as the calling process.
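 ** (A side note on the logging macros defined above, since their call
 ** form is unusual: IBNET_ERROR and IBNET_VERBOSE expand to plain
 ** function calls, so the printf-style arguments need their own set of
 ** parentheses. An illustrative, non-original example; num_ports and
 ** dev_name are hypothetical locals:
 **   IBNET_VERBOSE(10, ("loaded %d ports on %s\n", num_ports, dev_name));
 ** )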
- */ -mca_sbgp_base_module_t *mca_sbgp_ibnet_select_procs(struct ompi_proc_t ** procs, - int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data); - -/** - * Global component instance - */ -OMPI_MODULE_DECLSPEC extern mca_sbgp_ibnet_component_t mca_sbgp_ibnet_component; - - -END_C_DECLS - -#endif /* MCA_BCOL_ibnet_EXPORT_H */ diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet_component.c b/ompi/mca/sbgp/ibnet/sbgp_ibnet_component.c deleted file mode 100644 index 15df331ad3..0000000000 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet_component.c +++ /dev/null @@ -1,600 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include "infiniband/verbs.h" -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "opal/util/argv.h" -#include "opal/include/opal/types.h" -#include "opal_stdint.h" -#include "sbgp_ibnet.h" -#include "sbgp_ibnet_mca.h" -#include "ompi/mca/common/ofacm/base.h" -#include "ompi/mca/common/ofacm/connect.h" -#include "ompi/mca/common/verbs/common_verbs.h" - -/* - * Public string showing the coll ompi_sm V2 component version number - */ -const char *mca_sbgp_ibnet_component_version_string = - "Open MPI sbgp - ibnet collective MCA component version " OMPI_VERSION; - -/* - * Local functions - */ - -static int mca_sbgp_ibnet_open(void); -static int mca_sbgp_ibnet_close(void); -static int mca_sbgp_ibnet_init_query(bool enable_progress_threads, - bool enable_mpi_threads); - -/* - * Instantiate the public struct with all of our public information - * and pointers to our public functions in it - */ - -mca_sbgp_ibnet_component_t mca_sbgp_ibnet_component = { - - /* First, fill in the super */ - - { - /* First, the mca_component_t struct containing meta - information about the component itself */ - - .sbgp_version = { - MCA_SBGP_BASE_VERSION_2_0_0, - - /* Component name and version */ - - .mca_component_name = "ibnet", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), - - /* Component open and close functions */ - - .mca_open_component = mca_sbgp_ibnet_open, - .mca_close_component = mca_sbgp_ibnet_close, - .mca_register_component_params = mca_sbgp_ibnet_register_params, - }, - - .sbgp_init_query = mca_sbgp_ibnet_init_query, - .select_procs =mca_sbgp_ibnet_select_procs, - .priority = 0, - }, - - /* verbose mode */ - false, - - /* Maximum allowed number of subroups*/ - 0, - - /* Enable disable default subnet id warning */ - false, - false, - - /* IB MTU requested by user */ - 0, - - /* IB partition definition */ - 0, - - /* Keeping hca data */ - NULL, - NULL, - NULL, - NULL, - - /** Dummy argv-style list; a copy of names from the - if_[in|ex]clude list that we use for error checking (to ensure - that they all exist) */ - NULL, -}; - -static int mca_sbgp_ibnet_dummy_init_query( - bool enable_progress_threads, bool enable_mpi_threads) -{ - return OMPI_SUCCESS; -} - -/* - * Open the component - */ -static 
int mca_sbgp_ibnet_open(void) -{ - /* local variables */ - mca_sbgp_ibnet_component_t *cs = &mca_sbgp_ibnet_component; - - mca_sbgp_ibnet_component.pkey_val &= SBGP_IBNET_IB_PKEY_MASK; - - cs->total_active_ports = 0; - cs->curr_max_group_id = 100; - - OBJ_CONSTRUCT(&cs->devices, opal_list_t); - - return OMPI_SUCCESS; -} - -/* - * Close the component - */ -static int mca_sbgp_ibnet_close(void) -{ - mca_sbgp_ibnet_component_t *cs = &mca_sbgp_ibnet_component; - - OBJ_DESTRUCT(&cs->devices); - - return OMPI_SUCCESS; -} - -static void mca_sbgp_ibnet_device_constructor - (mca_sbgp_ibnet_device_t *device) -{ - /* Init OFACM stuf */ - device->ib_dev = NULL; - device->device_index = -1; - device->num_act_ports = 0; - memset(&device->ib_dev_attr, 0, sizeof(struct ibv_device_attr)); - device->cpcs= NULL; - device->num_cpcs = 0; - device->ports = NULL; -} - -static void mca_sbgp_ibnet_device_destructor - (mca_sbgp_ibnet_device_t *device) -{ - /* release memory */ - if (NULL != device->ports) { - free(device->ports); - } -} - -OBJ_CLASS_INSTANCE(mca_sbgp_ibnet_device_t, - opal_list_item_t, - mca_sbgp_ibnet_device_constructor, - mca_sbgp_ibnet_device_destructor); - -static int -get_port_list(mca_sbgp_ibnet_device_t *device, int *allowed_ports) -{ - char *name; - const char *dev_name; - int i, j, k, num_ports = 0; - - dev_name = ibv_get_device_name(device->ib_dev); - name = (char*) malloc(strlen(dev_name) + 4); - if (NULL == name) { - return 0; - } - - num_ports = 0; - if (NULL != mca_sbgp_ibnet_component.if_include_list) { - /* If only the device name is given (eg. mtdevice0,mtdevice1) use all - ports */ - i = 0; - - while (mca_sbgp_ibnet_component.if_include_list[i]) { - if (0 == strcmp(dev_name, - mca_sbgp_ibnet_component.if_include_list[i])) { - num_ports = device->ib_dev_attr.phys_port_cnt; - - IBNET_VERBOSE(10, ("if_include_list - %s.\n", mca_sbgp_ibnet_component.if_include_list[i])); - goto done; - } - ++i; - } - - /* Include only requested ports on the device */ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name, "%s:%d", dev_name, i); - - for (j = 0; - NULL != mca_sbgp_ibnet_component.if_include_list[j]; ++j) { - if (0 == strcmp(name, - mca_sbgp_ibnet_component.if_include_list[j])) { - - IBNET_VERBOSE(10, ("Allowed port %d: idx %d; if_include_list - %s\n", - i, num_ports, mca_sbgp_ibnet_component.if_include_list[j])); - - allowed_ports[num_ports++] = i; - break; - } - } - } - } else if (NULL != mca_sbgp_ibnet_component.if_exclude_list) { - /* If only the device name is given (eg. mtdevice0,mtdevice1) exclude - all ports */ - i = 0; - while (mca_sbgp_ibnet_component.if_exclude_list[i]) { - if (0 == strcmp(dev_name, - mca_sbgp_ibnet_component.if_exclude_list[i])) { - num_ports = 0; - goto done; - } - ++i; - } - /* Exclude the specified ports on this device */ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name,"%s:%d",dev_name,i); - for (j = 0; - NULL != mca_sbgp_ibnet_component.if_exclude_list[j]; ++j) { - if (0 == strcmp(name, - mca_sbgp_ibnet_component.if_exclude_list[j])) { - /* If found, set a sentinel value */ - j = -1; - break; - } - } - /* If we didn't find it, it's ok to include in the list */ - if (-1 != j) { - allowed_ports[num_ports++] = i; - } - } - } else { - /* Assume that all ports are allowed. num_ports will be adjusted - below to reflect whether this is true or not. 
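      For example (hypothetical values, using the "device" / "device:port"
      syntax documented for the sbgp_ibnet_if_include and
      sbgp_ibnet_if_exclude MCA parameters):
          --mca sbgp_ibnet_if_include mthca0      selects every port of mthca0
          --mca sbgp_ibnet_if_include mthca0:2    selects only port 2 of mthca0
      sbgp_ibnet_if_exclude applies the same matching in reverse.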
*/ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - allowed_ports[num_ports++] = i; - } - } - -done: - - /* Remove the following from the error-checking if_list: - - bare device name - - device name suffixed with port number */ - if (NULL != mca_sbgp_ibnet_component.if_list) { - for (i = 0; NULL != mca_sbgp_ibnet_component.if_list[i]; ++i) { - /* Look for raw device name */ - if (0 == strcmp(mca_sbgp_ibnet_component.if_list[i], dev_name)) { - j = opal_argv_count(mca_sbgp_ibnet_component.if_list); - opal_argv_delete(&j, &(mca_sbgp_ibnet_component.if_list), - i, 1); - --i; - } - } - - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name, "%s:%d", dev_name, i); - for (j = 0; NULL != mca_sbgp_ibnet_component.if_list[j]; ++j) { - if (0 == strcmp(mca_sbgp_ibnet_component.if_list[j], name)) { - k = opal_argv_count(mca_sbgp_ibnet_component.if_list); - opal_argv_delete(&k, &(mca_sbgp_ibnet_component.if_list), - j, 1); - --j; - break; - } - } - } - } - - free(name); - - return num_ports; -} - -static int ibnet_init_port(struct mca_sbgp_ibnet_device_t *device, - int port_index, struct ibv_port_attr *ib_port_attr, - struct ibv_context *ib_dev_context) -{ - union ibv_gid gid; - struct mca_sbgp_ibnet_port_t *p = &device->ports[port_index]; - - /* Set port data */ - p->lmc = (1 << ib_port_attr->lmc); - p->lid = ib_port_attr->lid; - p->stat = ib_port_attr->state; - p->mtu = ib_port_attr->active_mtu; - - IBNET_VERBOSE(10, ("Setting port data (%s:%d) lid=%d, lmc=%d, stat=%d, mtu=%d\n", - ibv_get_device_name(device->ib_dev), p->id, p->lid, - p->lmc, p->stat, p->mtu)); - - if (0 != ibv_query_gid(ib_dev_context, p->id, 0, &gid)) { - IBNET_ERROR(("ibv_query_gid failed (%s:%d)\n", - ibv_get_device_name(device->ib_dev), p->id)); - return OMPI_ERR_NOT_FOUND; - } - /* set subnet data */ - p->subnet_id = ntoh64(gid.global.subnet_prefix); - -/* p->subnet_id = gid.global.subnet_prefix; */ - - IBNET_VERBOSE(10, ("my IB-only subnet_id for HCA %d %s port %d is %lx\n" PRIx64, - gid.global.subnet_prefix,ibv_get_device_name(device->ib_dev), p->id, p->subnet_id)); - - return OMPI_SUCCESS; -} - -/* Find active port */ -static mca_sbgp_ibnet_device_t* ibnet_load_ports(struct ibv_device *ib_dev, int device_index) -{ - struct ibv_context *ib_dev_context = NULL; - mca_sbgp_ibnet_device_t *device = NULL; - int *allowed_ports = NULL; - int rc, port_cnt, port, i, ret, p = 0; - -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_TRANSPORT_IB != ib_dev->transport_type) { - IBNET_VERBOSE(10, ("Skipping non IB device %s", - ibv_get_device_name(ib_dev))); - goto error; - } -#endif - - device = OBJ_NEW(mca_sbgp_ibnet_device_t); - device->ib_dev = ib_dev; - device->device_index = device_index; - ib_dev_context = ibv_open_device(ib_dev); - - if(NULL == ib_dev_context) { - IBNET_ERROR(("Error obtaining device context for %s errno says %s", - ibv_get_device_name(device->ib_dev), strerror(errno))); - goto error; - } - - if(ibv_query_device(ib_dev_context, &device->ib_dev_attr)) { - IBNET_ERROR(("error obtaining device attributes for %s errno says %s", - ibv_get_device_name(ib_dev), strerror(errno))); - goto error; - } - - allowed_ports = (int *) calloc(device->ib_dev_attr.phys_port_cnt, sizeof(int)); - if (NULL == allowed_ports) { - goto error; - } - - port_cnt = get_port_list(device, allowed_ports); - if (0 == port_cnt) { - goto error; - } - -#if OPAL_ENABLE_DEBUG - for (i = 0; i < port_cnt; ++i) { - IBNET_VERBOSE(10, ("allowed port %d with idx %d.\n", allowed_ports[i], i)); - } -#endif - - 
device->num_allowed_ports = port_cnt; - device->ports = (mca_sbgp_ibnet_port_t *) calloc(port_cnt, sizeof(mca_sbgp_ibnet_port_t)); - if (NULL == device->ports) { - goto error; - } - - /* Note ports are 1 based (i >= 1) */ - for(port = 0; port < port_cnt; port++) { - struct ibv_port_attr ib_port_attr; - - i = allowed_ports[port]; - if(ibv_query_port(ib_dev_context, i, &ib_port_attr)){ - IBNET_ERROR(("Error getting port attributes for device %s " - "port number %d errno says %s", - ibv_get_device_name(device->ib_dev), i, strerror(errno))); - continue; - } - - if(IBV_PORT_ACTIVE == ib_port_attr.state) { - /* Pasha: Need to think how we want to handle MTUs - if (ib_port_attr.active_mtu < mca_bcol_iboffload_component.mtu){ - device->mtu = ib_port_attr.active_mtu; - } - */ - /* start to put port info */ - device->ports[p].id = i; - device->ports[p].stat = ib_port_attr.state; - device->ports[p].mtu = ib_port_attr.active_mtu; - - device->ports[p].used = true; - - if (0 == mca_sbgp_ibnet_component.pkey_val) { - ret = ibnet_init_port(device, p, &ib_port_attr, ib_dev_context); - if (OMPI_SUCCESS != ret) { - IBNET_ERROR(("Device %s " - "port number %d , failed to init port, errno says %s", - ibv_get_device_name(device->ib_dev), - i, strerror(errno))); - continue; - } - } else { - uint16_t pkey,j; - device->ports[p].used = false; - - for (j = 0; j < device->ib_dev_attr.max_pkeys; j++) { - if(ibv_query_pkey(ib_dev_context, i, j, &pkey)){ - IBNET_ERROR(("error getting pkey for index %d, device %s " - "port number %d errno says %s", - j, ibv_get_device_name(device->ib_dev), i, strerror(errno))); - continue; - } - - pkey = ntohs(pkey) & MCA_SBGP_IBNET_PKEY_MASK; - if (pkey == (uint32_t) mca_sbgp_ibnet_component.pkey_val){ - ret = ibnet_init_port(device, p, &ib_port_attr, ib_dev_context); - if (OMPI_SUCCESS != ret) { - IBNET_ERROR(("Device %s " - "port number %d , failed to init port, errno says %s", - ibv_get_device_name(device->ib_dev), - i, strerror(errno))); - continue; - } - } - } - } - - p++; /* One port was loaded, go to the next one */ - } - } - - device->num_act_ports = p; - /* Update total number of active ports */ - mca_sbgp_ibnet_component.total_active_ports += p; - - if (0 != device->num_act_ports) { - ompi_common_ofacm_base_dev_desc_t dev; - /* Init dev */ - dev.ib_dev = ib_dev; - dev.ib_dev_context = ib_dev_context; - dev.capabilities = 0; - - rc = ompi_common_ofacm_base_select_for_local_port( - &dev, &device->cpcs, (int *)&device->num_cpcs); - /* If we get NOT_SUPPORTED, then no CPC was found for this - port. But that's not a fatal error -- just keep going; - let's see if we find any usable openib modules or not. */ - if (OMPI_SUCCESS != rc) { - /* All others *are* fatal. Note that we already did a - show_help in the lower layer */ - IBNET_VERBOSE(10, ("Device %s, no CPC found", - ibv_get_device_name(device->ib_dev))); - goto error; - } - } - - /* we do not continue to use the device we just collect data, - * so close it for now. 
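 * (A note on the pkey filtering above, illustrative rather than original
 * text: both the configured sbgp_ibnet_pkey value and each pkey read back
 * with ibv_query_pkey() are masked with MCA_SBGP_IBNET_PKEY_MASK (0x7fff),
 * which strips the IB full-membership bit; so, for example, a configured
 * pkey of 0x8003 and an on-port pkey of 0x0003 compare equal.)
 *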
We will open it later in the iboffload coll component */
- if(ibv_close_device(ib_dev_context)) {
- IBNET_ERROR(("Device %s, failed to close the device %s",
- ibv_get_device_name(device->ib_dev), strerror(errno)));
- }
-
- if (0 == device->num_act_ports) {
- goto error;
- }
-
- /* Pasha - I do not like the error flow here */
- free(allowed_ports);
-
- return device;
-
-error:
-
- if (NULL != allowed_ports) {
- free(allowed_ports);
- }
-
- if (NULL != device) {
- /* device came from OBJ_NEW, so drop the reference rather than
- destructing it in place; it may still be NULL if we failed
- before the allocation */
- OBJ_RELEASE(device);
- }
-
- return NULL;
-}
-
-/* Create a list of the IB HCAs that have an active port */
-static int ibnet_load_devices(void)
-{
- int num_devs, i;
- struct ibv_device **ib_devs = NULL;
-
- mca_sbgp_ibnet_device_t *device = NULL;
- mca_sbgp_ibnet_component_t *cs = &mca_sbgp_ibnet_component;
-
- IBNET_VERBOSE(7, ("Entering ibnet_load_devices"));
-
- /* Get list of devices */
- ib_devs = ompi_ibv_get_device_list(&num_devs);
-
- if(0 == num_devs || NULL == ib_devs) {
- IBNET_VERBOSE(10, ("No ib devices found"));
- /* no HCA found */
- opal_show_help("help-mpi-btl-base.txt", "btl:no-nics", true);
- return OMPI_ERROR;
- }
-
- for (i = 0; i < num_devs; i++) {
- device = ibnet_load_ports(ib_devs[i], i);
- if (NULL != device) {
- IBNET_VERBOSE(10, ("Device %s was appended to device list with index %d.\n",
- ibv_get_device_name(device->ib_dev), i));
- opal_list_append(&cs->devices,
- (opal_list_item_t *) device);
- }
- }
-
- if (opal_list_is_empty(&cs->devices)) {
- /* No relevant devices were found, return an error */
- IBNET_ERROR(("No active devices found"));
- ompi_ibv_free_device_list(ib_devs);
- return OMPI_ERROR;
- }
-
- ompi_ibv_free_device_list(ib_devs);
-
- return OMPI_SUCCESS;
-}
-
-/* query to see if the component is available for use, and can
- * satisfy the thread and progress requirements
- */
-int mca_sbgp_ibnet_init_query(bool enable_progress_threads,
- bool enable_mpi_threads)
-{
- int rc, list_count = 0;
-
- /* Parse the include and exclude lists, checking for errors */
- mca_sbgp_ibnet_component.if_list = NULL;
- mca_sbgp_ibnet_component.if_include_list = NULL;
- mca_sbgp_ibnet_component.if_exclude_list = NULL;
-
- IBNET_VERBOSE(7, ("Calling mca_sbgp_ibnet_init_query"));
-
- if (NULL != mca_sbgp_ibnet_component.if_include) {
- list_count++;
- }
-
- if (NULL != mca_sbgp_ibnet_component.if_exclude) {
- list_count++;
- }
-
- if (list_count > 1) {
- IBNET_ERROR(("Bad --mca (if_include, if_exclude) parameters!"));
- return OMPI_ERROR;
- } else if (NULL != mca_sbgp_ibnet_component.if_include) {
- mca_sbgp_ibnet_component.if_include_list =
- opal_argv_split(mca_sbgp_ibnet_component.if_include, ',');
- mca_sbgp_ibnet_component.if_list =
- opal_argv_copy(mca_sbgp_ibnet_component.if_include_list);
- } else if (NULL != mca_sbgp_ibnet_component.if_exclude) {
- mca_sbgp_ibnet_component.if_exclude_list =
- opal_argv_split(mca_sbgp_ibnet_component.if_exclude, ',');
- mca_sbgp_ibnet_component.if_list =
- opal_argv_copy(mca_sbgp_ibnet_component.if_exclude_list);
- }
-
- /* Init CPC components */
- rc = ompi_common_ofacm_base_init();
- if (OMPI_SUCCESS != rc) {
- return rc;
- }
-
- /* Load all devices and active ports */
- rc = ibnet_load_devices();
- if (OMPI_SUCCESS != rc) {
- return rc;
- }
-
- mca_sbgp_ibnet_component.super.sbgp_init_query =
- mca_sbgp_ibnet_dummy_init_query;
-
- return OMPI_SUCCESS;
-} diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.c b/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.c deleted file mode 100644 index a9c2553c0e..0000000000 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.c +++ /dev/null @@ -1,229 +0,0 @@ -/*
- * Copyright (c) 2009-2012 Oak Ridge National
Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/bcol/bcol.h" -#include "ompi/mca/bcol/base/base.h" -#include "ompi/mca/common/ofacm/base.h" - -#include "sbgp_ibnet.h" -#include "sbgp_ibnet_mca.h" - -/* - * Local flags - */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - REGINT_MAX = 0x88 -}; - -enum { - REGSTR_EMPTY_OK = 0x01, - - REGSTR_MAX = 0x88 -}; - -static mca_base_var_enum_value_t mtu_values[] = { - {IBV_MTU_512, "256B"}, - {IBV_MTU_512, "512B"}, - {IBV_MTU_1024, "1k"}, - {IBV_MTU_2048, "2k"}, - {IBV_MTU_4096, "4k"}, - {0, NULL} -}; - -/* - * utility routine for string parameter registration - */ -static int reg_string(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - const char* default_value, char **storage, - int flags) -{ - int index; - - /* the MCA variable system will not change this value */ - *storage = (char *) default_value; - index = mca_base_component_var_register(&mca_sbgp_ibnet_component.super.sbgp_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "sbgp", "ibnet", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -/* - * utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_sbgp_ibnet_component.super.sbgp_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "sbgp", "ibnet", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OMPI_SUCCESS; - } - - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OMPI_ERR_BAD_PARAM; - } - - return OMPI_SUCCESS; -} - -/* - * utility routine for boolean parameter registration - */ -static int reg_bool(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - bool default_value, bool *storage) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_sbgp_ibnet_component.super.sbgp_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "sbgp", "ibnet", deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - 
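    /* Hypothetical usage sketch for these registration helpers (not
     * original code), matching the flag checks implemented above:
     *
     *   rc = reg_int("num_qps", NULL, "Number of QPs", 4, &storage,
     *                REGINT_GE_ONE);   // rejects values < 1 with OMPI_ERR_BAD_PARAM
     *   rc = reg_bool("use_eager", NULL, "Enable eager mode", true, &flag);
     *
     * "num_qps", "use_eager", storage and flag are made-up names; reg_bool
     * performs no range checking, which is why it takes no flags argument.
     */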
return OMPI_SUCCESS; -} - -int mca_sbgp_ibnet_register_params(void) -{ - mca_base_var_enum_t *new_enum; - char *msg; - int ret, tmp; - - ret = OMPI_SUCCESS; - -#define CHECK(expr) do { \ - tmp = (expr); \ - if (OMPI_SUCCESS != tmp) ret = tmp; \ - } while (0) - - /* register openib component parameters */ - - CHECK(reg_int("priority", NULL, - "IB offload component priority" - "(from 0(low) to 90 (high))", 90, &mca_sbgp_ibnet_component.super.priority, 0)); - - CHECK(reg_int("verbose", NULL, - "Output some verbose IB offload BTL information " - "(0 = no output, nonzero = output)", 0, &mca_sbgp_ibnet_component.verbose, 0)); - - CHECK(reg_bool("warn_default_gid_prefix", NULL, - "Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured (0 = do not warn; any other value = warn)", - true, &mca_sbgp_ibnet_component.warn_default_gid_prefix)); - CHECK(reg_bool("warn_nonexistent_if", NULL, - "Warn if non-existent devices and/or ports are specified in the sbgp_ibnet_if_[in|ex]clude MCA parameters (0 = do not warn; any other value = warn)", - true, &mca_sbgp_ibnet_component.warn_nonexistent_if)); - - CHECK(reg_int("max_sbgps", NULL, - "Maximum allowed number of subroups", - 100, &mca_sbgp_ibnet_component.max_sbgps, 0)); - - CHECK(reg_int("pkey", "ib_pkey_val", - "OpenFabrics partition key (pkey) value. " - "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB paritition key value (0x7fff)", - 0, &mca_sbgp_ibnet_component.pkey_val, 0)); - mca_sbgp_ibnet_component.pkey_val &= SBGP_IBNET_IB_PKEY_MASK; - - asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes", - IBV_MTU_256, - IBV_MTU_512, - IBV_MTU_1024, - IBV_MTU_2048, - IBV_MTU_4096); - if (NULL == msg) { - /* Don't try to recover from this */ - return OMPI_ERR_OUT_OF_RESOURCE; - } - - CHECK(mca_base_var_enum_create("sbgp_ibnet_mtu", mtu_values, &new_enum)); - if (OPAL_SUCCESS != ret) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - mca_sbgp_ibnet_component.mtu = IBV_MTU_1024; - ret = mca_base_component_var_register(&mca_sbgp_ibnet_component.super.sbgp_version, - "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum, - 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_sbgp_ibnet_component.mtu); - OBJ_RELEASE(new_enum); - free(msg); - - if (0 > ret) { - return ret; - } - - (void) mca_base_var_register_synonym(ret, "ompi", "sbgp", "ibnet", "ib_mtu", - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - - CHECK(reg_string("if_include", NULL, - "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with sbgp_ibnet_if_exclude.", - NULL, &mca_sbgp_ibnet_component.if_include, - 0)); - - CHECK(reg_string("if_exclude", NULL, - "Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports). 
Mutually exclusive with sbgp_ibnet_if_include.", - NULL, &mca_sbgp_ibnet_component.if_exclude, - 0)); - - /* Register any MCA params for the connect pseudo-components */ - if (OMPI_SUCCESS == ret) { - ret = ompi_common_ofacm_base_register(&mca_sbgp_ibnet_component.super.sbgp_version); - } - - return ret; -} diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.h b/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.h deleted file mode 100644 index 58fd8adcb2..0000000000 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet_mca.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - /** @file */ - -#ifndef MCA_SBGP_IBNET_MCA_H -#define MCA_SBGP_IBNET_MCA_H - -#include -#include "ompi_config.h" - -#define SBGP_IBNET_IB_PKEY_MASK 0x7fff - -int mca_sbgp_ibnet_register_params(void); - -#endif diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c b/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c deleted file mode 100644 index fa5d54d171..0000000000 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#include "ompi_config.h" -#include -#include -#include -#include -#include - -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h" -#include "ompi/mca/common/ofacm/base.h" -#include "ompi/mca/common/ofacm/connect.h" -#include "ompi/patterns/comm/coll_ops.h" -/* - * Unused -static int ibnet_module_enable(mca_sbgp_base_module_t *module, - struct ompi_communicator_t *comm); - -*/ - -/* - * Local functions - */ -static void -mca_sbgp_ibnet_module_construct(mca_sbgp_ibnet_module_t *module) -{ - module->cgroups = NULL; - module->group_id = 0; -} - -static void -mca_sbgp_ibnet_module_destruct(mca_sbgp_ibnet_module_t *module) -{ - -} - -OBJ_CLASS_INSTANCE(mca_sbgp_ibnet_module_t, - mca_sbgp_base_module_t, - mca_sbgp_ibnet_module_construct, - mca_sbgp_ibnet_module_destruct); - -static void -mca_sbgp_ibnet_proc_construct(mca_sbgp_ibnet_proc_t *proc) -{ - /* done */ - proc->ompi_proc = 0; - proc->num_ports = 0; - proc->use_port = NULL; - proc->remote_ports_info = NULL; - proc->duty = MCA_SBGP_IBNET_NONE; -} - -static void -mca_sbgp_ibnet_proc_destruct(mca_sbgp_ibnet_proc_t *proc) -{ - /* done */ - if (NULL != proc->remote_ports_info) { - free(proc->remote_ports_info); - /* Pasha: need to check if we need - * to release some data from inside of the proc*/ - } - - if (NULL != proc->use_port) { - free(proc->use_port); - } -} - -OBJ_CLASS_INSTANCE(mca_sbgp_ibnet_proc_t, - opal_list_item_t, - mca_sbgp_ibnet_proc_construct, - mca_sbgp_ibnet_proc_destruct); - - -/* Pack all data to gather buffer */ -static int pack_gather_sbuff(char* sbuffer) -{ - int port, cpc; - coll_offload_support coll_offload_flag = OFFLOAD_CONNECTX_B0; /**< Pasha: add query for collectives offload support */ - - char* pack_ptr = sbuffer; - - mca_sbgp_ibnet_device_t *device = NULL; - uint32_t my_rank = ompi_process_info.my_name.vpid; - opal_list_t *devices = &mca_sbgp_ibnet_component.devices; - - /* Message format: - * - my rank (uint32_t) - * - number of active ports (uint32_t) - * - for each active port: - * + lid (uint16_t) - * + 
subnetid (uint64_t) - * + mtu (uint32_t) - * + colloffload (uint8_t) - * + num of cpcs (uint8_t) - * + for each cpc: (uint8_t) - * * cpc index (uint8_t) - * * cpc priority (uint8_t) - * * cpc buffer len (uint8_t) - * * cpc buffer (byte * buffer_len) - * - */ - - /* Start to put data */ - - /* Pack my rank , I need it because allgather doesn't work as expected */ - IBNET_VERBOSE(10, ("Send pack rank = %d\n", my_rank)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint32_t))); - - memcpy(pack_ptr, &my_rank, sizeof(uint32_t)); - pack_ptr += sizeof(uint32_t); - - /* Put number of ports that we send */ - IBNET_VERBOSE(10, ("Send pack num of ports = %d\n", mca_sbgp_ibnet_component.total_active_ports)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint32_t))); - - memcpy(pack_ptr, &mca_sbgp_ibnet_component.total_active_ports, sizeof(uint32_t)); - pack_ptr += sizeof(uint32_t); - - /* Go through list of device and build the message*/ - for (device = (mca_sbgp_ibnet_device_t *) opal_list_get_first(devices); - device != (mca_sbgp_ibnet_device_t *) opal_list_get_end(devices); - device = (mca_sbgp_ibnet_device_t *) opal_list_get_next((opal_list_item_t *)device)) { - for (port = 0; port < device->num_allowed_ports; ++port) { - if (!device->ports[port].used) { - continue; - } - - /* put port num */ - IBNET_VERBOSE(10, ("Send pack port num = %d\n", device->ports[port].id)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint16_t))); - - memcpy(pack_ptr, &device->ports[port].id, sizeof(uint16_t)); - pack_ptr += sizeof(uint16_t); - - /* put lid */ - IBNET_VERBOSE(10, ("Send pack lid = %d\n", device->ports[port].lid)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint16_t))); - - memcpy(pack_ptr, &device->ports[port].lid, sizeof(uint16_t)); - pack_ptr += sizeof(uint16_t); - - /* put subnetid */ - IBNET_VERBOSE(10, ("Send pack subnet id = %lx\n", device->ports[port].subnet_id)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint64_t))); - - memcpy(pack_ptr, &device->ports[port].subnet_id, sizeof(uint64_t)); - pack_ptr += sizeof(uint64_t); - - /* put default mtu */ - IBNET_VERBOSE(10, ("Send pack MTU = %d\n", device->ports[port].mtu)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint32_t))); - - memcpy(pack_ptr, &device->ports[port].mtu, sizeof(uint32_t)); - pack_ptr += sizeof(uint32_t); - - /* collectives offload support */ - IBNET_VERBOSE(10, ("Send pack collectives offload = %d\n", OFFLOAD_CONNECTX_B0)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint8_t))); - - /* Pasha: add query for collectives offload support */ - memcpy(pack_ptr, &coll_offload_flag, sizeof(uint8_t)); - pack_ptr += sizeof(uint8_t); - - /* number of cpcs for this port */ - IBNET_VERBOSE(10, ("Send pack number of cpcs = %d\n", device->num_cpcs)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint8_t))); - - memcpy(pack_ptr, &device->num_cpcs, sizeof(uint8_t)); - pack_ptr += sizeof(uint8_t); - - for (cpc = 0; cpc < device->num_cpcs; cpc++) { - uint8_t cpc_index; - uint8_t cpc_buflen; - - /* cpc index */ - cpc_index = ompi_common_ofacm_base_get_cpc_index(device->cpcs[cpc]->data.cbm_component); - - IBNET_VERBOSE(10, ("Send pack cpc index = %d\n", cpc_index)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint8_t))); - - memcpy(pack_ptr, &cpc_index, sizeof(uint8_t)); - pack_ptr += sizeof(uint8_t); - - /* cpc priority */ - IBNET_VERBOSE(10, ("Send pack cpc priority = %d\n", - device->cpcs[cpc]->data.cbm_priority)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, 
sizeof(uint8_t))); - - memcpy(pack_ptr, &device->cpcs[cpc]->data.cbm_priority, sizeof(uint8_t)); - pack_ptr += sizeof(uint8_t); - - /* cpc buffer length in bytes */ - cpc_buflen = device->cpcs[cpc]->data.cbm_modex_message_len; - - IBNET_VERBOSE(10, ("Send pack cpc message len = %d\n", cpc_buflen)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint8_t))); - - memcpy(pack_ptr, &cpc_buflen, sizeof(uint8_t)); - pack_ptr += sizeof(uint8_t); - - /* cpc buffer */ - if (0 != cpc_buflen) { - IBNET_VERBOSE(10, ("Send pack cpc buffer len = %d\n", cpc_buflen)); - IBNET_VERBOSE(10, ("packing %d of %d\n", 1, sizeof(uint8_t))); - - memcpy(pack_ptr, device->cpcs[cpc]->data.cbm_modex_message, cpc_buflen); - pack_ptr += (size_t) cpc_buflen; - } - } - } - } - - return OMPI_SUCCESS; -} - -/* Translation vpid to ompi_proc */ -static int vpid_to_proc(ompi_vpid_t vpid, - struct ompi_proc_t ** procs, int n_procs_in, ompi_proc_t** out_proc) -{ - int i; - for (i = 0; i < n_procs_in; i++) { - if (vpid == procs[i]->proc_name.vpid) { - *out_proc = procs[i]; - return i; - } - } - - return OMPI_ERROR; -} - -static int unpack_and_load_gather_rbuff(char *rbuffer, int max_sent_bytes, - struct ompi_proc_t ** procs, int n_procs_in, opal_list_t *peers_data) -{ - - int i; - char* unpack_ptr; - - /* Message format: - * - my rank (uint32_t) - * - number of active ports (uint32_t) - * - for each active port: - * + lid (uint16_t) - * + subnetid (uint64_t) - * + mtu (uint32_t) - * + colloffload (uint8_t) - * + num of cpcs (uint8_t) - * + for each cpc: (uint8_t) - * * cpc index (uint8_t) - * * cpc priority (uint8_t) - * * cpc buffer len (uint8_t) - * * cpc buffer (byte*buffer_len) - * - */ - - /* Start to unpack data */ - for(i = 0; i < n_procs_in; i++) { - uint32_t p; - mca_sbgp_ibnet_proc_t *ibnet_proc; - - unpack_ptr = rbuffer + (size_t) (i * max_sent_bytes); - - /* create new proc */ - ibnet_proc = OBJ_NEW(mca_sbgp_ibnet_proc_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint32_t))); - IBNET_VERBOSE(10, ("Recive remote rank %d\n", ibnet_proc->rank)); - - memcpy(&ibnet_proc->rank, unpack_ptr, sizeof(uint32_t)); - unpack_ptr += sizeof(uint32_t); - - /* set back pointer to ompi_proc */ - ibnet_proc->ompi_proc_index = - vpid_to_proc(ibnet_proc->rank, procs, - n_procs_in, &ibnet_proc->ompi_proc); - if (OMPI_ERROR == ibnet_proc->ompi_proc_index) { - return OMPI_ERROR; - } - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint32_t))); - IBNET_VERBOSE(10, ("Recive number of ports %d\n", ibnet_proc->num_ports)); - - memcpy(&ibnet_proc->num_ports, unpack_ptr, sizeof(uint32_t)); - unpack_ptr += sizeof(uint32_t); - - /* prepare place for port data*/ - ibnet_proc->remote_ports_info = calloc(ibnet_proc->num_ports, sizeof(mca_sbgp_ibnet_port_t)); - if (NULL == ibnet_proc->remote_ports_info) { - return OMPI_ERROR; - } - - /* load the data */ - for(p = 0; p < ibnet_proc->num_ports; p++) { - mca_sbgp_ibnet_port_t *port = &ibnet_proc->remote_ports_info[p]; - uint32_t cpc; - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint16_t))); - IBNET_VERBOSE(10, ("Recive id %d\n", port->id)); - - memcpy(&port->id, unpack_ptr, sizeof(uint16_t)); - unpack_ptr += sizeof(uint16_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint16_t))); - IBNET_VERBOSE(10, ("Recive lid %d\n", port->lid)); - - memcpy(&port->lid, unpack_ptr, sizeof(uint16_t)); - unpack_ptr += sizeof(uint16_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, 
sizeof(uint64_t))); - IBNET_VERBOSE(10, ("Recive subnet id %lx\n", port->subnet_id)); - - memcpy(&port->subnet_id, unpack_ptr, sizeof(uint64_t)); - unpack_ptr += sizeof(uint64_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint32_t))); - IBNET_VERBOSE(10, ("Recive mtu %d\n", port->mtu)); - - memcpy(&port->mtu, unpack_ptr, sizeof(uint32_t)); - unpack_ptr += sizeof(uint32_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint8_t))); - IBNET_VERBOSE(10, ("Recive offload %d\n", port->coll_offload)); - - memcpy(&port->coll_offload, unpack_ptr, sizeof(uint8_t)); - unpack_ptr += sizeof(uint8_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint8_t))); - IBNET_VERBOSE(10, ("Recive number of cpcs %d\n", port->num_cpcs)); - - memcpy(&port->num_cpcs, unpack_ptr, sizeof(uint8_t)); - unpack_ptr += sizeof(uint8_t); - - port->pm_cpc_data = calloc(port->num_cpcs, - sizeof(ompi_common_ofacm_base_module_data_t)); - if (NULL == port->pm_cpc_data) { - return OMPI_ERROR; - } - - /* load cpc data */ - for (cpc = 0; cpc < port->num_cpcs; cpc++) { - ompi_common_ofacm_base_module_data_t *cpc_data = - &port->pm_cpc_data[cpc]; - uint8_t cpc_index = -1; - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint8_t))); - IBNET_VERBOSE(10, ("Recive cpc index %d\n", cpc_index)); - - memcpy(&cpc_index, unpack_ptr, sizeof(uint8_t)); - unpack_ptr += sizeof(uint8_t); - - cpc_data->cbm_component = - ompi_common_ofacm_base_get_cpc_byindex(cpc_index); - if (NULL == cpc_data->cbm_component) { - IBNET_VERBOSE(10, ("Failed to resolve cpc index %d\n", cpc_index)); - return OMPI_ERROR; - } - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint8_t))); - IBNET_VERBOSE(10, ("Recive priority %d\n", cpc_data->cbm_priority)); - - memcpy(&cpc_data->cbm_priority, unpack_ptr, sizeof(uint8_t)); - unpack_ptr += sizeof(uint8_t); - - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, 1, sizeof(uint8_t))); - IBNET_VERBOSE(10, ("Recive cpc message len %d\n", cpc_data->cbm_modex_message_len)); - - memcpy(&cpc_data->cbm_modex_message_len, unpack_ptr, sizeof(uint8_t)); - unpack_ptr += sizeof(uint8_t); - - if (0 != cpc_data->cbm_modex_message_len) { - int cpc_buflen = cpc_data->cbm_modex_message_len; - - IBNET_VERBOSE(10, ("Recive cpc message data with len %d\n", cpc_buflen)); - IBNET_VERBOSE(10, ("element=%d unpacking %d of %d\n", i, cpc_buflen, cpc_buflen)); - - memcpy(&cpc_data->cbm_modex_message, unpack_ptr, cpc_buflen); - unpack_ptr += (size_t) cpc_buflen; - } - } - } - - /* Put the new proc to the list */ - opal_list_append(peers_data, (opal_list_item_t*) ibnet_proc); - } - - assert((uint32_t) n_procs_in == opal_list_get_size(peers_data)); - return OMPI_SUCCESS; -} - -static int cmp_cgroups(const void *p1, const void *p2) -{ - mca_sbgp_ibnet_connection_group_info_t *g1 = - (mca_sbgp_ibnet_connection_group_info_t *)p1; - mca_sbgp_ibnet_connection_group_info_t *g2 = - (mca_sbgp_ibnet_connection_group_info_t *)p2; - return (g2->num_procs - g1->num_procs); -} - -static int set_ibnet_proc_on_cgroup( - mca_sbgp_ibnet_connection_group_info_t *cgroup, - mca_sbgp_ibnet_proc_t *ibnet_proc, - mca_sbgp_ibnet_device_t *device, - mca_sbgp_ibnet_module_t *module) -{ - uint32_t p; - int k, rc, p_indx; /* port index in array of device */ - - for (p_indx = 0; p_indx < device->num_allowed_ports; ++p_indx) { - if (cgroup->port == device->ports[p_indx].id) { - break; - } - } - - assert(device->num_act_ports > p_indx); - - if (NULL == 
-static int set_ibnet_proc_on_cgroup(
-        mca_sbgp_ibnet_connection_group_info_t *cgroup,
-        mca_sbgp_ibnet_proc_t *ibnet_proc,
-        mca_sbgp_ibnet_device_t *device,
-        mca_sbgp_ibnet_module_t *module)
-{
-    uint32_t p;
-    int k, rc, p_indx; /* port index in the device's port array */
-
-    for (p_indx = 0; p_indx < device->num_allowed_ports; ++p_indx) {
-        if (cgroup->port == device->ports[p_indx].id) {
-            break;
-        }
-    }
-
-    assert(device->num_act_ports > p_indx);
-
-    if (NULL == ibnet_proc->use_port) {
-        ibnet_proc->use_port = calloc(module->num_cgroups, sizeof(int));
-        if (NULL == ibnet_proc->use_port) {
-            IBNET_ERROR(("Failed to allocate use_port array."));
-            return OMPI_ERROR;
-        }
-    }
-
-    IBNET_VERBOSE(10, ("Local port is %d, idx - %d.\n",
-                device->ports[p_indx].id, p_indx));
-
-    for (p = 0; p < ibnet_proc->num_ports; p++) {
-        if (device->ports[p_indx].subnet_id ==
-                ibnet_proc->remote_ports_info[p].subnet_id) {
-            ompi_common_ofacm_base_module_t *local_cpc = NULL;
-            ompi_common_ofacm_base_module_data_t *remote_cpc_data = NULL;
-
-            /* check if we have a matching cpc on both sides */
-            if (OMPI_SUCCESS !=
-                    ompi_common_ofacm_base_find_match(device->cpcs,
-                        device->num_cpcs,
-                        ibnet_proc->remote_ports_info[p].pm_cpc_data,
-                        ibnet_proc->remote_ports_info[p].num_cpcs,
-                        &local_cpc,
-                        &remote_cpc_data)) {
-                /* Failed to match, can not use the port */
-                IBNET_VERBOSE(10, ("Failed to match, can not use the port - %d.\n", p + 1));
-                continue;
-            }
-
-            for (k = 0; k < module->num_cgroups && ((p + 1) != (uint32_t) ibnet_proc->use_port[k]); ++k)
-                ;
-
-            if (k < module->num_cgroups) {
-                /* The port is already in use by another connection group */
-                continue;
-            }
-
-            /* Connection group 'cgroup' communicates with this proc
-               over its own remote port */
-            ibnet_proc->use_port[cgroup->index] = p + 1;
-
-            /* if there is no group array yet, we need to create it */
-            if (OPAL_UNLIKELY(NULL == cgroup->ibnet_procs)) {
-                cgroup->ibnet_procs = OBJ_NEW(opal_pointer_array_t);
-                rc = opal_pointer_array_init(cgroup->ibnet_procs, 10, INT_MAX, 10);
-                if (OPAL_SUCCESS != rc) {
-                    IBNET_ERROR(("Failed to allocate opal_pointer_array"));
-                    return OMPI_ERROR;
-                }
-            }
-
-            IBNET_VERBOSE(10, ("Device idx %d, local port idx %d; "
-                        "adding rank %d to the module %p, rem port %d",
-                        device->device_index, p_indx, ibnet_proc->rank,
-                        module, ibnet_proc->remote_ports_info[p].id));
-
-            /* No need to remove: opal_list_remove_item(peers_data, (opal_list_item_t*)ibnet_proc); */
-            rc = opal_pointer_array_set_item(cgroup->ibnet_procs,
-                    /* num_selected, */ cgroup->num_procs,
-                    (void *) ibnet_proc);
-            if (OPAL_SUCCESS != rc) {
-                IBNET_ERROR(("Failed to set rank %d at index %d",
-                            ibnet_proc->rank, cgroup->num_procs));
-                return OMPI_ERROR;
-            }
-
-            /* store the selected cpc data on this proc */
-            ibnet_proc->remote_ports_info[p].local_cpc = local_cpc;
-            ibnet_proc->remote_ports_info[p].remote_cpc_data = remote_cpc_data;
-
-            ++cgroup->num_procs;
-
-            /* we are done with this proc, go to the next one */
-            break;
-        }
-    }
-
-    return OMPI_SUCCESS;
-}
-
-static int setup_cgroup_all(
-        mca_sbgp_ibnet_connection_group_info_t *cgroup,
-        mca_sbgp_ibnet_device_t *device,
-        mca_sbgp_ibnet_module_t *module,
-        opal_list_t *peers_data)
-{
-    int rc;
-    mca_sbgp_ibnet_proc_t *ibnet_proc = NULL;
-
-    for (ibnet_proc = (mca_sbgp_ibnet_proc_t *) opal_list_get_first(peers_data);
-            ibnet_proc != (mca_sbgp_ibnet_proc_t *) opal_list_get_end(peers_data);
-            ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-                opal_list_get_next((opal_list_item_t *) ibnet_proc)) {
-
-        rc = set_ibnet_proc_on_cgroup(cgroup, ibnet_proc, device, module);
-        if (OMPI_SUCCESS != rc) {
-            return rc;
-        }
-    }
-
-    return OMPI_SUCCESS;
-}
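setup_cgroup_all() above and setup_cgroup_node() below both funnel into
set_ibnet_proc_on_cgroup(), and the test that actually decides membership is a subnet
match between the chosen local port and any of the peer's remote ports. Stated as a
standalone predicate (a sketch; no such helper exists in this code):

static int ibnet_subnet_match(const mca_sbgp_ibnet_port_t *local_port,
                              const mca_sbgp_ibnet_proc_t *peer)
{
    uint32_t p;

    for (p = 0; p < peer->num_ports; p++) {
        if (local_port->subnet_id == peer->remote_ports_info[p].subnet_id) {
            return 1;   /* same IB subnet: candidate for this cgroup */
        }
    }
    return 0;
}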
-static int setup_cgroup_node(mca_sbgp_ibnet_connection_group_info_t *cgroup, mca_sbgp_ibnet_device_t *device,
-        mca_sbgp_ibnet_module_t *module, opal_list_t *peers_data)
-{
-    int rc, local = 0;
-    mca_sbgp_ibnet_proc_t *ibnet_proc = NULL;
-
-    for (ibnet_proc = (mca_sbgp_ibnet_proc_t *) opal_list_get_first(peers_data);
-            ibnet_proc != (mca_sbgp_ibnet_proc_t *) opal_list_get_end(peers_data);
-            ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-                opal_list_get_next((opal_list_item_t *) ibnet_proc)) {
-
-        local = OPAL_PROC_ON_LOCAL_NODE(ibnet_proc->ompi_proc->super.proc_flags);
-        if (0 == local) {
-            /* the remote process resides on a different node */
-            continue;
-        }
-
-        /* the process resides on the same machine */
-        rc = set_ibnet_proc_on_cgroup(cgroup, ibnet_proc, device, module);
-        if (OMPI_SUCCESS != rc) {
-            return rc;
-        }
-    }
-
-    return OMPI_SUCCESS;
-}
-
-/* This function is the heart of the ibnet component.
- * Main purpose:
- * Run over the list of all peers and select only the "reachable" ones.
- * A peer is reachable if it has a subnet_id equal to a subnet id on one
- * of my ports. All peers that have the same number of active ports on
- * the same subnet may be grouped into a subgroup.
- * The selection logic still needs more thought; at this stage we just
- * return the list of all procs.
- */
-static int select_procs(mca_sbgp_ibnet_module_t *module, opal_list_t *peers_data)
-{
-    mca_sbgp_ibnet_device_t *device = NULL;
-    mca_sbgp_ibnet_proc_t *ibnet_proc = NULL;
-    mca_sbgp_ibnet_connection_group_info_t *cgroup = NULL;
-
-    uint32_t p = 0;
-    int i = 0, j, rc = OMPI_SUCCESS;
-    int num_grouped = 0,
-        groups_to_use = 1;
-
-    mca_sbgp_ibnet_component_t *cs = &mca_sbgp_ibnet_component;
-
-    IBNET_VERBOSE(10, ("Start to select procs.\n"));
-
-    module->num_cgroups = 0;
-
-    for (device = (mca_sbgp_ibnet_device_t *) opal_list_get_first(&cs->devices);
-            device != (mca_sbgp_ibnet_device_t *) opal_list_get_end(&cs->devices);
-            device = (mca_sbgp_ibnet_device_t *)
-                opal_list_get_next((opal_list_item_t *) device)) {
-        module->num_cgroups += device->num_act_ports;
-        IBNET_VERBOSE(10, ("Device num %d with index %d, num of active ports %d\n",
-                    ++i, device->device_index, device->num_act_ports));
-    }
-
-    module->cgroups = calloc(module->num_cgroups,
-            sizeof(mca_sbgp_ibnet_connection_group_info_t));
-    if (NULL == module->cgroups) {
-        IBNET_ERROR(("Failed to allocate cgroups"));
-        rc = OMPI_ERROR;
-        goto select_error;
-    }
-
-    IBNET_VERBOSE(10, ("Num of cgroups - %d.\n", module->num_cgroups));
-    /* 1. Run over all active ports and build a connection group
-     *    for each one */
-    for (device = (mca_sbgp_ibnet_device_t *) opal_list_get_first(&cs->devices);
-            device != (mca_sbgp_ibnet_device_t *) opal_list_get_end(&cs->devices);
-            device = (mca_sbgp_ibnet_device_t *)
-                opal_list_get_next((opal_list_item_t *) device)) {
-        /* run over the active ports on the device */
-        for (j = 0; j < device->num_act_ports; j++) {
-            cgroup = &module->cgroups[num_grouped];
-
-            /* Init the cgroup struct */
-            cgroup->device_index = device->device_index;
-            cgroup->index = num_grouped;
-            cgroup->port = device->ports[j].id;
-            cgroup->num_procs = 0;
-
-            /* Set up the communication group */
-            switch (module->mode) {
-            case MCA_SBGP_IBNET_ALL_NET:
-                rc = setup_cgroup_all(cgroup, device, module, peers_data);
-                break;
-            case MCA_SBGP_IBNET_NODE_NET:
-                rc = setup_cgroup_node(cgroup, device, module, peers_data);
-                break;
-            default:
-                rc = OMPI_ERROR;
-                IBNET_ERROR(("Module mode is unknown, fatal error"));
-            }
-
-            if (OMPI_SUCCESS != rc) {
-                IBNET_ERROR(("Failed to setup cgroup."));
-                goto select_error;
-            }
-
-            if (0 != cgroup->num_procs) {
-                ++num_grouped;
-            }
-        }
-    }
-
-    if (0 == num_grouped) {
-        /* No connection group was found */
-        IBNET_ERROR(("No connection group was found."));
-        rc = OMPI_ERROR;
-        goto select_error;
-    }
-
-    /* If we have more than a single cgroup,
-     * we need to return only groups that connect
-     * to exactly the same peers
-     */
-    if (num_grouped > 1) {
-
-        /* 2. Sort connection groups by size */
-        qsort(module->cgroups, num_grouped,
-                sizeof(mca_sbgp_ibnet_connection_group_info_t),
-                cmp_cgroups);
-
-        /* 3. How many groups have the maximal size?
-         *    The first one is maximal */
-        for (groups_to_use = 1; groups_to_use < num_grouped; groups_to_use++) {
-            if (module->cgroups[0].num_procs != module->cgroups[groups_to_use].num_procs) {
-                break;
-            }
-        }
-
-        /* Ishai - It looks like no one uses this groups_to_use value. In any case there is a bug in it. */
-        /* 4. Check that all the maximal size groups are connected to the
-         *    same peers; if not, we just use the FIRST cgroup */
-        if (groups_to_use > 1) {
-            /* we need to check that all groups connect to
-             * the same set of peers. */
-            for (j = groups_to_use - 1; j > 0; j--) {
-                for (p = 0; p < module->cgroups[0].num_procs; p++) {
-                    /* compare proc by proc.... */
-                    if (opal_pointer_array_get_item(module->cgroups[0].ibnet_procs, p) !=
-                            opal_pointer_array_get_item(module->cgroups[j].ibnet_procs, p)) {
-                        /* peers are not equal, ignore this group and go to the next one */
-                        groups_to_use--;
-                        if (j != groups_to_use) {
-                            /* it was not the last group, swap it with the last one */
-                            mca_sbgp_ibnet_connection_group_info_t tmp = module->cgroups[j];
-                            module->cgroups[j] = module->cgroups[groups_to_use];
-                            module->cgroups[groups_to_use] = tmp;
-                        }
-
-                        break; /* go to the next group */
-                    }
-                }
-            }
-        }
-    }
-
-    /* update the subgroup count */
-    module->num_cgroups = groups_to_use;
-
-    /* store the array of ranks and the group size */
-    module->super.group_size = module->cgroups[0].num_procs;
-    module->super.group_list = (int *) calloc(module->super.group_size, sizeof(int));
-    if (NULL == module->super.group_list) {
-        IBNET_ERROR(("Failed to allocate memory for group list"));
-        rc = OMPI_ERROR;
-        goto select_error;
-    }
-
-    for (i = 0; i < module->super.group_size; i++) {
-        ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-            opal_pointer_array_get_item(module->cgroups[0].ibnet_procs, i);
-
-        assert(NULL != ibnet_proc);
-        IBNET_VERBOSE(10, ("Adding rank %d to group list", ibnet_proc->rank));
-
-        module->super.group_list[i] = ibnet_proc->ompi_proc_index;
-    }
-
-    /* Let the proc with the lowest index be the leader of the subgroup */
-    ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-        opal_pointer_array_get_item(module->cgroups[0].ibnet_procs, 0);
-
-    assert(NULL != ibnet_proc);
-    ibnet_proc->duty = MCA_SBGP_IBNET_NODE_LEADER;
-
-#if OPAL_ENABLE_DEBUG
-    IBNET_VERBOSE(10, ("Ibnet module: size - %d, num_cgroups - %d.\n",
-                module->super.group_size, module->num_cgroups));
-
-    for (i = 0; i < module->num_cgroups; ++i) {
-        IBNET_VERBOSE(10, ("cgroup %d uses port %d.\n",
-                    i + 1, module->cgroups[i].port));
-    }
-#endif
-
-    return OMPI_SUCCESS;
-
-select_error:
-    if (NULL != module->cgroups) {
-        for (i = 0; i < num_grouped; i++) {
-            if (NULL != module->cgroups[i].ibnet_procs) {
-                /* Ishai: When do we destruct it if the function was successful - only at the end of the process? */
-                OBJ_RELEASE(module->cgroups[i].ibnet_procs);
-            }
-        }
-
-        free(module->cgroups);
-    }
-
-    if (0 != module->super.group_size &&
-            NULL != module->super.group_list) {
-        free(module->super.group_list);
-    }
-
-    for (ibnet_proc = (mca_sbgp_ibnet_proc_t *) opal_list_get_first(peers_data);
-            ibnet_proc != (mca_sbgp_ibnet_proc_t *) opal_list_get_end(peers_data);
-            ibnet_proc = (mca_sbgp_ibnet_proc_t *)
-                opal_list_get_next((opal_list_item_t *) ibnet_proc)) {
-        if (NULL != ibnet_proc->use_port) {
-            free(ibnet_proc->use_port);
-        }
-    }
-
-    return rc;
-}
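The swap-and-shrink dedup loop above decrements groups_to_use and swaps, but the group
moved into slot j is never itself re-examined (j only decreases), which is presumably
the bug the "Ishai" note refers to. A hypothetical helper that makes the peer-set
comparison explicit, so the caller can filter groups in a single forward pass:

static int cgroup_equals_first(const mca_sbgp_ibnet_connection_group_info_t *g,
                               const mca_sbgp_ibnet_connection_group_info_t *first)
{
    uint32_t p;

    for (p = 0; p < first->num_procs; p++) {
        if (opal_pointer_array_get_item(first->ibnet_procs, (int) p) !=
                opal_pointer_array_get_item(g->ibnet_procs, (int) p)) {
            return 0;   /* peer sets differ */
        }
    }
    return 1;
}

/* e.g.: keep = 1;
 *       for (j = 1; j < groups_to_use; j++)
 *           if (cgroup_equals_first(&cg[j], &cg[0])) cg[keep++] = cg[j];
 *       groups_to_use = keep; */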
-
-/* This routine is used to find the list of procs that run on the
-** same host as the calling process.
-*/
-
-#define IBNET_ALL "all"
-#define IBNET_NODE "node"
-
-static int key2mode(char *key)
-{
-    if (NULL == key) {
-        IBNET_VERBOSE(6, ("key is NULL, return MCA_SBGP_IBNET_ALL"));
-        return MCA_SBGP_IBNET_ALL_NET;
-    }
-
-    if (strlen(IBNET_ALL) == strlen(key) &&
-            0 == strncmp(IBNET_ALL, key, strlen(IBNET_ALL))) {
-        IBNET_VERBOSE(6, ("key is MCA_SBGP_IBNET_ALL"));
-        return MCA_SBGP_IBNET_ALL_NET;
-    }
-
-    if (strlen(IBNET_NODE) == strlen(key) &&
-            0 == strncmp(IBNET_NODE, key, strlen(IBNET_NODE))) {
-        IBNET_VERBOSE(6, ("key is NODE"));
-        return MCA_SBGP_IBNET_NODE_NET;
-    }
-
-    IBNET_VERBOSE(6, ("key was not detected, return MCA_SBGP_IBNET_NONE"));
-    return MCA_SBGP_IBNET_NONE_NET;
-}
-
-static int mca_sbgp_ibnet_calc_sbuff_size(void)
-{
-    int bytes_tosend = 0, port, cpc;
-    mca_sbgp_ibnet_device_t *device;
-
-    opal_list_t *devices = &mca_sbgp_ibnet_component.devices;
-
-    bytes_tosend += sizeof(uint32_t); /* OPAL_UINT32 rank */
-    bytes_tosend += sizeof(uint32_t); /* OPAL_UINT32 num of active ports */
-
-    /* Go through the list of devices and size the message */
-    for (device = (mca_sbgp_ibnet_device_t *) opal_list_get_first(devices);
-            device != (mca_sbgp_ibnet_device_t *) opal_list_get_end(devices);
-            device = (mca_sbgp_ibnet_device_t *) opal_list_get_next((opal_list_item_t *) device)) {
-        for (port = 0; port < device->num_allowed_ports; ++port) {
-            if (!device->ports[port].used) {
-                continue;
-            }
-
-            /* OPAL_UINT16 port num */
-            bytes_tosend += sizeof(uint16_t);
-
-            /* OPAL_UINT16 lid */
-            bytes_tosend += sizeof(uint16_t);
-
-            /* OPAL_UINT64 subnetid */
-            bytes_tosend += sizeof(uint64_t);
-
-            /* OPAL_UINT32 default mtu */
-            bytes_tosend += sizeof(uint32_t);
-
-            /* OPAL_UINT8 collectives offload support */
-            bytes_tosend += sizeof(uint8_t);
-
-            /* OPAL_UINT8 number of cpcs for this port */
-            bytes_tosend += sizeof(uint8_t);
-
-            for (cpc = 0; cpc < device->num_cpcs; ++cpc) {
-                /* OPAL_UINT8 cpc index */
-                bytes_tosend += sizeof(uint8_t);
-
-                /* OPAL_UINT8 cpc priority */
-                bytes_tosend += sizeof(uint8_t);
-
-                /* cpc buffer length (OPAL_UINT8) in bytes */
-                bytes_tosend += device->cpcs[cpc]->data.cbm_modex_message_len;
-                bytes_tosend += sizeof(uint8_t);
-            }
-        }
-    }
-
-    return bytes_tosend;
-}
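The accounting in mca_sbgp_ibnet_calc_sbuff_size() mirrors the pack format field for
field: each active port contributes a fixed 18-byte header (2 + 2 + 8 + 4 + 1 + 1),
plus 3 bytes of metadata and the payload per CPC. A hypothetical pair of constants
makes that arithmetic explicit:

enum {
    IBNET_PORT_HDR_BYTES = 2 * sizeof(uint16_t)   /* port num + lid     */
                         + sizeof(uint64_t)       /* subnet id          */
                         + sizeof(uint32_t)       /* mtu                */
                         + 2 * sizeof(uint8_t),   /* offload + num cpcs */
    IBNET_CPC_HDR_BYTES  = 3 * sizeof(uint8_t)    /* index, prio, len   */
};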
-
-mca_sbgp_base_module_t *mca_sbgp_ibnet_select_procs(struct ompi_proc_t **procs,
-        int n_procs_in,
-        struct ompi_communicator_t *comm,
-        char *key,
-        void *output_data
-        )
-{
-    /* local variables */
-    opal_list_t peers_data;
-    mca_sbgp_ibnet_module_t *module;
-
-    int rc;
-    char *sbuff = NULL, *rbuff = NULL;
-
-    int *sbgp_procs_ranks = NULL, *ranks_in_comm = NULL;
-    int i, my_rank_in_group = -1, my_rank, num_bytes_tosend;
-
-    struct mca_sbgp_ibnet_proc_t *ibnet_proc = NULL;
-    mca_sbgp_ibnet_component_t *cs = &mca_sbgp_ibnet_component;
-
-    /* Create the module */
-    module = OBJ_NEW(mca_sbgp_ibnet_module_t);
-    if (OPAL_UNLIKELY(NULL == module)) {
-        return NULL;
-    }
-
-    module->num_cgroups = 0;
-    module->cgroups = NULL;
-    module->mode = key2mode(key);
-
-    if (OPAL_UNLIKELY(MCA_SBGP_IBNET_NONE_NET == module->mode)) {
-        goto Error_module;
-    }
-
-    module->super.group_size = 0;
-    module->super.group_list = NULL;
-    module->super.group_comm = comm;
-    module->super.group_net = OMPI_SBGP_IBCX2;
-
-    /* Prepare the list for arriving data; construct it early so that
-     * every error path below can safely destruct it */
-    OBJ_CONSTRUCT(&peers_data, opal_list_t);
-
-    ranks_in_comm = (int *) malloc(n_procs_in * sizeof(int));
-    if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
-        IBNET_ERROR(("Cannot allocate memory.\n"));
-        goto Error;
-    }
-
-    my_rank = ompi_comm_rank(&ompi_mpi_comm_world.comm);
-
-    for (i = 0; i < n_procs_in; i++) {
-        ranks_in_comm[i] = procs[i]->proc_name.vpid;
-        if (my_rank == ranks_in_comm[i]) {
-            my_rank_in_group = i;
-        }
-    }
-
-    /* Prepare the send data */
-    num_bytes_tosend = mca_sbgp_ibnet_calc_sbuff_size();
-
-    rc = comm_allreduce_pml(&num_bytes_tosend,
-            &num_bytes_tosend, 1,
-            MPI_INT, my_rank_in_group,
-            MPI_MAX, n_procs_in,
-            ranks_in_comm, &ompi_mpi_comm_world.comm);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        goto Error;
-    }
-
-    IBNET_VERBOSE(10, ("The size of the send buff is %d\n", num_bytes_tosend));
-
-    assert(num_bytes_tosend > 0);
-
-    /* Allocate send/recv buffers for the allgather communication */
-    sbuff = (char *) malloc(num_bytes_tosend);
-    rbuff = (char *) malloc(num_bytes_tosend * n_procs_in);
-    if (OPAL_UNLIKELY(NULL == sbuff || NULL == rbuff)) {
-        IBNET_ERROR(("Failed to allocate buffers for send/recv ibnet allgather\n"));
-        goto Error;
-    }
-
-    rc = pack_gather_sbuff(sbuff);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        goto Error;
-    }
-
-    rc = comm_allgather_pml((void *) sbuff, (void *) rbuff,
-            num_bytes_tosend, MPI_BYTE,
-            my_rank_in_group, n_procs_in,
-            ranks_in_comm, &ompi_mpi_comm_world.comm);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        IBNET_ERROR(("Allgather call failed.\n"));
-        goto Error;
-    }
-
-    /* Load the data into peers_data */
-    rc = unpack_and_load_gather_rbuff(rbuff, num_bytes_tosend,
-            procs, n_procs_in, &peers_data);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        goto Error;
-    }
-
-    /* Select logic */
-    rc = select_procs(module, &peers_data);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        goto Error;
-    }
-
-    /* Assign the group id */
-    sbgp_procs_ranks = (int *) malloc(module->super.group_size *
-            sizeof(int));
-    if (OPAL_UNLIKELY(NULL == sbgp_procs_ranks)) {
-        IBNET_ERROR(("Cannot allocate memory.\n"));
-        goto Error;
-    }
-
-    for (i = 0; i < module->super.group_size; ++i) {
-        ibnet_proc = (struct mca_sbgp_ibnet_proc_t *)
-            opal_pointer_array_get_item(
-                    module->cgroups[0].ibnet_procs, i);
-
-        sbgp_procs_ranks[i] = ibnet_proc->ompi_proc->proc_name.vpid;
-        if (my_rank == sbgp_procs_ranks[i]) {
-            my_rank_in_group = i;
-        }
-    }
-
-    assert(my_rank_in_group >= 0);
-
-    rc = comm_allreduce_pml(&cs->curr_max_group_id,
-            &cs->curr_max_group_id, 1,
-            MPI_INT, my_rank_in_group,
-            MPI_MAX, module->super.group_size,
-            sbgp_procs_ranks, &ompi_mpi_comm_world.comm);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        goto Error;
-    }
-
-    module->group_id = cs->curr_max_group_id;
-    cs->curr_max_group_id++;
-
-    /* successful completion: clean up the temporary structures */
-    OBJ_DESTRUCT(&peers_data);
-
-    free(sbuff);
-    free(rbuff);
-
-    free(ranks_in_comm);
-    free(sbgp_procs_ranks);
-
-    IBNET_VERBOSE(10, ("Return ibnet module.\n"));
-    return (mca_sbgp_base_module_t *) module;
-
-    /* return with error */
-Error:
-    /* clean up */
-    if (NULL != module->super.group_list) {
-        free(module->super.group_list);
-        module->super.group_list = NULL;
-    }
-
-    /* clean up the temporary structures */
-    OBJ_DESTRUCT(&peers_data);
-
-    if (NULL != sbgp_procs_ranks) {
-        free(sbgp_procs_ranks);
-    }
-
-    if (NULL != ranks_in_comm) {
-        free(ranks_in_comm);
-    }
-
-    if (NULL != sbuff) {
-        free(sbuff);
-    }
-
-    if (NULL != rbuff) {
-        free(rbuff);
-    }
-
-Error_module:
-    OBJ_RELEASE(module);
-
-    return NULL;
-}
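mca_sbgp_ibnet_select_procs() implements a fixed-slot modex: ranks first agree on the
maximum per-rank payload, then allgather into equal-sized slots, so rank i's data
always starts at offset i * max_len. A hypothetical plain-MPI rendering of the same
pattern (n_procs and comm supplied by the caller; the pml helpers above do the
equivalent over a rank subset):

int my_len = mca_sbgp_ibnet_calc_sbuff_size(), max_len = 0;
MPI_Allreduce(&my_len, &max_len, 1, MPI_INT, MPI_MAX, comm);

char *sbuf = calloc(max_len, 1);                    /* zero-padded slot  */
char *rbuf = malloc((size_t) max_len * n_procs);
MPI_Allgather(sbuf, max_len, MPI_BYTE, rbuf, max_len, MPI_BYTE, comm);
/* rank i's (possibly padded) message starts at rbuf + (size_t) i * max_len */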
diff --git a/ompi/mca/sbgp/p2p/Makefile.am b/ompi/mca/sbgp/p2p/Makefile.am
deleted file mode 100644
index d7d14f795f..0000000000
--- a/ompi/mca/sbgp/p2p/Makefile.am
+++ /dev/null
@@ -1,41 +0,0 @@
-#
-# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
-# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
-# Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
-# $COPYRIGHT$
-#
-# Additional copyrights may follow
-#
-# $HEADER$
-#
-
-sources = \
-        sbgp_p2p.h \
-        sbgp_p2p_component.c \
-        sbgp_p2p_module.c
-
-
-# Make the output library in this directory, and name it either
-# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
-# (for static builds).
-
-component_noinst =
-component_install =
-if MCA_BUILD_ompi_sbgp_p2p_DSO
-component_install += mca_sbgp_p2p.la
-else
-component_noinst += libmca_sbgp_p2p.la
-endif
-
-# See ompi/mca/btl/sm/Makefile.am for an explanation of
-# libmca_common_sm.la.
-
-mcacomponentdir = $(ompilibdir)
-mcacomponent_LTLIBRARIES = $(component_install)
-mca_sbgp_p2p_la_SOURCES = $(sources)
-mca_sbgp_p2p_la_LDFLAGS = -module -avoid-version
-mca_sbgp_p2p_la_LIBADD =
-
-noinst_LTLIBRARIES = $(component_noinst)
-libmca_sbgp_p2p_la_SOURCES = $(sources)
-libmca_sbgp_p2p_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/sbgp/p2p/configure.m4 b/ompi/mca/sbgp/p2p/configure.m4
deleted file mode 100644
index 56cc9a06af..0000000000
--- a/ompi/mca/sbgp/p2p/configure.m4
+++ /dev/null
@@ -1,27 +0,0 @@
-# -*- shell-script -*-
-#
-# Copyright (c) 2013      Sandia National Laboratories. All rights reserved.
-# $COPYRIGHT$
-#
-# Additional copyrights may follow
-#
-# $HEADER$
-#
-
-# MCA_ompi_sbgp_p2p_POST_CONFIG(will_build)
-# ----------------------------------------
-# The p2p sbgp requires a BML endpoint tag to compile, so require it.
-# Require in POST_CONFIG instead of CONFIG so that we only require it
-# if we're not disabled.
-AC_DEFUN([MCA_ompi_sbgp_p2p_POST_CONFIG], [
-    AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
-])dnl
-
-# MCA_ompi_sbgp_p2p_CONFIG(action-if-can-compile,
-#                          [action-if-cant-compile])
-# ------------------------------------------------
-# We can always build, unless we were explicitly disabled.
-AC_DEFUN([MCA_ompi_sbgp_p2p_CONFIG],[
-    AC_CONFIG_FILES([ompi/mca/sbgp/p2p/Makefile])
-    [$1]
-])dnl
diff --git a/ompi/mca/sbgp/p2p/owner.txt b/ompi/mca/sbgp/p2p/owner.txt
deleted file mode 100644
index 1c86df367b..0000000000
--- a/ompi/mca/sbgp/p2p/owner.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# owner/status file
-# owner: institution that is responsible for this package
-# status: e.g. active, maintenance, unmaintained
-#
-owner: ORNL
-status: unmaintained
diff --git a/ompi/mca/sbgp/p2p/sbgp_p2p.h b/ompi/mca/sbgp/p2p/sbgp_p2p.h
deleted file mode 100644
index f8fa5fc194..0000000000
--- a/ompi/mca/sbgp/p2p/sbgp_p2p.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-
-#ifndef MCA_BCOL_p2p_EXPORT_H
-#define MCA_BCOL_p2p_EXPORT_H
-
-#include "ompi_config.h"
-
-#include "mpi.h"
-#include "ompi/mca/mca.h"
-#include "ompi/mca/sbgp/sbgp.h"
-#include "opal/mca/mpool/mpool.h"
-#include "ompi/request/request.h"
-#include "ompi/proc/proc.h"
-
-BEGIN_C_DECLS
-
-#ifdef HAVE_SCHED_YIELD
-#  include <sched.h>
-#  define SPIN sched_yield()
-#else /* no switch available */
-#  define SPIN
-#endif
-
-
-    /**
-     * Structure to hold the basic p2p sbgp component. First it holds the
-     * base sbgp component, and then holds a bunch of
-     * p2p-component-specific data (e.g., current MCA param
-     * values).
-     */
-    struct mca_sbgp_p2p_component_t {
-        /** Base sbgp component */
-        mca_sbgp_base_component_2_0_0_t super;
-    };
-
-    /**
-     * Convenience typedef
-     */
-    typedef struct mca_sbgp_p2p_component_t
-        mca_sbgp_p2p_component_t;
-
-    /*
-    ** Base sub-group module
-    **/
-    struct mca_sbgp_p2p_module_t {
-        /** Sub-group modules all inherit from mca_sbgp_base_module_t */
-        mca_sbgp_base_module_t super;
-    };
-    typedef struct mca_sbgp_p2p_module_t mca_sbgp_p2p_module_t;
-    OBJ_CLASS_DECLARATION(mca_sbgp_p2p_module_t);
-
-    /* This routine is used to find the list of procs that run on the
-    ** same host as the calling process.
-    */
-    /*
-    struct mca_sbgp_base_module_t *mca_sbgp_p2p_select_procs(struct ompi_proc_t ** procs,
-        int n_procs_in, char *key, void *output_data);
-    */
-
-    /**
-     * Global component instance
-     */
-    OMPI_MODULE_DECLSPEC extern mca_sbgp_p2p_component_t mca_sbgp_p2p_component;
-
-END_C_DECLS
-
-#endif /* MCA_BCOL_p2p_EXPORT_H */
diff --git a/ompi/mca/sbgp/p2p/sbgp_p2p_component.c b/ompi/mca/sbgp/p2p/sbgp_p2p_component.c
deleted file mode 100644
index 2fd93da404..0000000000
--- a/ompi/mca/sbgp/p2p/sbgp_p2p_component.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
- *                         reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-/**
- * @file
- *
- */
-
-#include "ompi_config.h"
-#ifdef HAVE_UNISTD_H
-#include <unistd.h>
-#endif
-#include
-#ifdef HAVE_SYS_MMAN_H
-#include <sys/mman.h>
-#endif
-#include
-
-#include "ompi/constants.h"
-#include "ompi/communicator/communicator.h"
-#include "sbgp_p2p.h"
-#include "ompi/mca/bml/bml.h"
-
-
-/*
- * Public string showing the sbgp p2p component version number
- */
-const char *mca_sbgp_p2p_component_version_string =
-    "Open MPI sbgp - p2p collective MCA component version " OMPI_VERSION;
-
-
-/*
- * Local functions
- */
-
-static int p2p_register(void);
-static int p2p_open(void);
-static int p2p_close(void);
-static mca_sbgp_base_module_t * mca_sbgp_p2p_select_procs(struct ompi_proc_t ** procs,
-        int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data);
-
-static int mca_sbgp_p2p_init_query(bool enable_progress_threads,
-        bool enable_mpi_threads);
-
-/*
- * Instantiate the public struct with all of our public information
- * and pointers to our public functions in it
- */
-
-mca_sbgp_p2p_component_t mca_sbgp_p2p_component = {
-
-    {
-        /* First, the mca_component_t struct containing meta
-           information about the component itself */
-
-        .sbgp_version = {
-            MCA_SBGP_BASE_VERSION_2_0_0,
-
-            /* Component name and version */
-            .mca_component_name = "p2p",
-            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
-                                  OMPI_RELEASE_VERSION),
-
-            /* Component open and close functions */
-            .mca_open_component = p2p_open,
-            .mca_close_component = p2p_close,
-            .mca_register_component_params = p2p_register,
-        },
-
-        .sbgp_init_query = mca_sbgp_p2p_init_query,
-        .select_procs = mca_sbgp_p2p_select_procs,
-        .priority = 0,
-    }
-
-};
-
-static int p2p_register(void)
-{
-    mca_sbgp_p2p_component_t *cs = &mca_sbgp_p2p_component;
-
-    cs->super.priority = 90;
-    (void) mca_base_component_var_register(&cs->super.sbgp_version,
-                                           "priority", "Priority for the sbgp p2p component",
-                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
-                                           OPAL_INFO_LVL_9,
-                                           MCA_BASE_VAR_SCOPE_READONLY,
-                                           &cs->super.priority);
-
-    return OMPI_SUCCESS;
-}
-
-/*
- * Open the component
- */
-static int p2p_open(void)
-{
-    return OMPI_SUCCESS;
-}
-
-
-/*
- * Close the component
- */
-static int p2p_close(void)
-{
-    return OMPI_SUCCESS;
-}
-
-/* query to see if the component is available for use, and can
- * satisfy the thread and progress requirements
- */
-static int mca_sbgp_p2p_init_query(bool enable_progress_threads,
-        bool enable_mpi_threads)
-{
-    /* at this stage there is no reason to disqualify this component */
-
-    /* done */
-    return OMPI_SUCCESS;
-}
-
-/* This routine is used to find the list of procs that are reachable
-** from the calling process via the BTL named by 'key'.
-*/
-static mca_sbgp_base_module_t * mca_sbgp_p2p_select_procs(struct ompi_proc_t ** procs,
-        int n_procs_in,
-        struct ompi_communicator_t *comm,
-        char *key,
-        void *output_data
-        )
-{
-    /* local variables */
-    int cnt, proc, my_rank;
-    mca_sbgp_p2p_module_t *module;
-
-    /* find my rank in the group */
-    for (my_rank = -1, proc = 0 ; proc < n_procs_in ; ++proc) {
-        if (ompi_proc_local() == procs[proc]) {
-            my_rank = proc;
-        }
-    }
-
-    /* I am not in the list - so will form no local subgroup */
-    if (0 > my_rank) {
-        return NULL;
-    }
-
-    module = OBJ_NEW(mca_sbgp_p2p_module_t);
-    if (!module) {
-        return NULL;
-    }
-
-    module->super.group_size = 0;
-    module->super.group_comm = comm;
-    module->super.group_net = OMPI_SBGP_P2P;
-
-    /* allocate resources */
-    module->super.group_list = (int *) calloc (n_procs_in, sizeof (int));
-    if (NULL == module->super.group_list) {
-        goto Error;
-    }
-
-    for (cnt = 0, proc = 0 ; proc < n_procs_in ; ++proc) {
-#if defined(OMPI_PROC_ENDPOINT_TAG_BML)
-        mca_bml_base_endpoint_t* endpoint =
-            (mca_bml_base_endpoint_t*) procs[proc]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
-#endif
-
-        if (my_rank == proc || !key) {
-            module->super.group_list[cnt++] = proc;
-            continue;
-        }
-
-#if defined(OMPI_PROC_ENDPOINT_TAG_BML)
-        if (NULL != endpoint) {
-            int num_btls = mca_bml_base_btl_array_get_size(&(endpoint->btl_eager));
-
-            /* loop over btls */
-            for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
-                /* add the proc if one of its eager btls matches the
-                 * requested btl name */
-                if (0 == strcmp(endpoint->btl_eager.bml_btls[i_btl].btl->
-                                btl_component->btl_version.mca_component_name, key)) {
-                    module->super.group_list[cnt++] = proc;
-                    break;
-                }
-            }
-        }
-#endif
-    }
-
-    if (0 == cnt) {
-        goto Error;
-    }
-
-    module->super.group_size = cnt;
-    module->super.group_list = (int *) realloc (module->super.group_list, sizeof (int) * cnt);
-    if (NULL == module->super.group_list) {
-        /* Shouldn't ever happen */
-        goto Error;
-    }
-
-    /* successful return */
-    return (mca_sbgp_base_module_t *)module;
-
-    /* return with error */
-Error:
-    /* clean up */
-    if (NULL != module->super.group_list) {
-        free (module->super.group_list);
-        module->super.group_list = NULL;
-    }
-    OBJ_RELEASE(module);
-
-    return NULL;
-}
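The selector returns a module whose group_list holds indexes into the procs array. A
hypothetical caller asking for the subgroup of peers reachable through the "tcp" BTL
("tcp" is just an example value; any BTL component name works, and procs, n_procs_in
and comm come from the caller):

mca_sbgp_base_module_t *grp =
    mca_sbgp_p2p_component.super.select_procs(procs, n_procs_in, comm, "tcp", NULL);
if (NULL != grp) {
    for (int i = 0; i < grp->group_size; ++i) {
        int proc_index = grp->group_list[i];   /* index into procs[] */
        (void) proc_index;
    }
}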
diff --git a/ompi/mca/sbgp/p2p/sbgp_p2p_module.c b/ompi/mca/sbgp/p2p/sbgp_p2p_module.c
deleted file mode 100644
index 40a1c104bb..0000000000
--- a/ompi/mca/sbgp/p2p/sbgp_p2p_module.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-/**
- * @file
- *
- */
-
-#include "ompi_config.h"
-#ifdef HAVE_UNISTD_H
-#include <unistd.h>
-#endif
-#include
-#ifdef HAVE_SYS_MMAN_H
-#include <sys/mman.h>
-#endif
-#include
-#include
-
-#include "ompi/constants.h"
-#include "ompi/communicator/communicator.h"
-#include "ompi/mca/sbgp/p2p/sbgp_p2p.h"
-
-/*
- * Local functions
- */
-static void
-mca_sbgp_p2p_module_construct(mca_sbgp_p2p_module_t *module)
-{
-}
-
-static void
-mca_sbgp_p2p_module_destruct(mca_sbgp_p2p_module_t *module)
-{
-    /* done */
-}
-
-
-OBJ_CLASS_INSTANCE(mca_sbgp_p2p_module_t,
-                   mca_sbgp_base_module_t,
-                   mca_sbgp_p2p_module_construct,
-                   mca_sbgp_p2p_module_destruct);
diff --git a/ompi/mca/sbgp/sbgp.h b/ompi/mca/sbgp/sbgp.h
deleted file mode 100644
index c128051b2e..0000000000
--- a/ompi/mca/sbgp/sbgp.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
-/*
- * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
- * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
- *                         reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-#ifndef MCA_SBGP_H
-#define MCA_SBGP_H
-
-#include "ompi_config.h"
-#include "opal/class/opal_list.h"
-#include "ompi/mca/mca.h"
-#include "ompi/communicator/communicator.h"
-
-#include "opal/util/show_help.h"
-
-#if defined(c_plusplus) || defined(__cplusplus)
-extern "C" {
-#endif
-
-/**
- * List of supported network types
- */
-
-typedef int (*mca_sbgp_component_init_query_fn_t)
-    (bool enable_progress_threads, bool enable_mpi_threads);
-
-typedef enum {
-    OMPI_SBGP_MUMA   = 1 << 0, /* Muma */
-    OMPI_SBGP_SOCKET = 1 << 1, /* CPU socket */
-    OMPI_SBGP_P2P    = 1 << 2, /* Point to point networks */
-    OMPI_SBGP_IBCX2  = 1 << 3, /* InfiniBand ConnectX-2 */
-    OMPI_SBGP_IB     = 1 << 4  /* InfiniBand */
-} mca_sbgp_net_type;
-
-/*
- * Interface function for the routine that extracts subgroups
- *
- * @param procs (IN)        List of mpi processes to filter
- * @param n_procs_in (IN)   Number of input processes
- * @param key (IN)          optional key
- * @param output_data (OUT) component specific output
- * @return                  module, NULL if one is not created.
- *
- */
-
-struct mca_sbgp_base_module_2_0_0_t {
-
-    /** Sub-group modules all inherit from opal_object */
-    opal_object_t super;
-
-    /* group size */
-    int group_size;
-
-    /* largest power of 2 in group */
-    int pow_2;
-
-    /* number of levels in the tree */
-    int n_levels_pow2;
-
-    /* my index in the group list,
-     * pointer to my rank */
-    int my_index;
-
-    /* List of ranks. Actually we return to ML an array of
-     * indexes into ompi_proc, and ML is responsible for
-     * replacing the indexes with ranks */
-    int *group_list;
-
-    /* pointer to the *parent* communicator.
-     * Not sure if we really need it now: I know my rank via my index,
-     * and the ompi_proc can be cached on the sbgp module.
-     * For ib it is not needed */
-    struct ompi_communicator_t *group_comm;
-
-    /* network supported by this group */
-    mca_sbgp_net_type group_net;
-
-    /* FIXME:
-     * I don't know where to add the use_hdl flag since the
-     * mca_bcol_basesmuma_comm_query takes just two input parameters.
-     */
-    bool use_hdl;
-
-};
-typedef struct mca_sbgp_base_module_2_0_0_t mca_sbgp_base_module_2_0_0_t;
-typedef struct mca_sbgp_base_module_2_0_0_t mca_sbgp_base_module_t;
-/* typedef mca_sbgp_base_module_2_0_0_t mca_sbgp_base_module_t; */
-OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_sbgp_base_module_t);
-
-typedef mca_sbgp_base_module_t *(*mca_sbgp_create_subgroup_fn_t)(
-        struct ompi_proc_t ** procs, int n_procs_in,
-        struct ompi_communicator_t *comm, char *key,
-        void *output_data
-        );
-
-/**
- * Subgrouping component interface
- *
- * Component interface for the sub-group framework. A public
- * instance of this structure, called
- * mca_sbgp_[component_name]_component, must exist in any sub-group
- * component.
- */
-struct mca_sbgp_base_component_2_0_0_t {
-    /** Base component description */
-    mca_base_component_t sbgp_version;
-
-    /** Sbgp component init query function */
-    mca_sbgp_component_init_query_fn_t sbgp_init_query;
-
-    /** process selection function */
-    mca_sbgp_create_subgroup_fn_t select_procs;
-
-    /** priority */
-    int priority;
-
-};
-typedef struct mca_sbgp_base_component_2_0_0_t mca_sbgp_base_component_2_0_0_t;
-typedef struct mca_sbgp_base_component_2_0_0_t mca_sbgp_base_component;
-
-
-/*
- * Macro for use in components that are of type sbgp
- */
-#define MCA_SBGP_BASE_VERSION_2_0_0 \
-    OMPI_MCA_BASE_VERSION_2_1_0("sbgp", 2, 0, 0)
-
-#if defined(c_plusplus) || defined(__cplusplus)
-}
-#endif
-#endif /* MCA_SBGP_H */
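Putting the interface together: the framework calls sbgp_init_query() once at startup
and select_procs() per communicator to carve out a subgroup. A skeletal, entirely
hypothetical component instance (compare with the real p2p instance above; the name
"example" and both functions are illustrative only):

static int example_init_query(bool enable_progress_threads,
                              bool enable_mpi_threads)
{
    return OMPI_SUCCESS;            /* nothing to check */
}

static mca_sbgp_base_module_t *example_select_procs(
        struct ompi_proc_t **procs, int n_procs_in,
        struct ompi_communicator_t *comm, char *key, void *output_data)
{
    return NULL;                    /* never forms a subgroup */
}

mca_sbgp_base_component_2_0_0_t mca_sbgp_example_component = {
    .sbgp_version = {
        MCA_SBGP_BASE_VERSION_2_0_0,
        .mca_component_name = "example",
        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                              OMPI_RELEASE_VERSION),
    },
    .sbgp_init_query = example_init_query,
    .select_procs    = example_select_procs,
    .priority        = 0,
};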