
Merge pull request #2658 from rhc54/topic/removal

Remove the bcol, coll/ml, and sbgp code as stale and lacking a maintainer
This commit is contained in:
Ralph Castain 2017-01-03 20:34:09 -08:00 committed by GitHub
parent dadc6fbaf6 66131b4183
commit 5737a45b35
163 changed files: 0 additions and 54840 deletions


@@ -1,35 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_bcol.la
libmca_bcol_la_SOURCES =
# header setup
nobase_ompi_HEADERS =
nobase_nodist_ompi_HEADERS =
# local files
headers = bcol.h
libmca_bcol_la_SOURCES += $(headers) $(nodist_headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_ompi_HEADERS += $(headers)
nobase_nodist_ompi_HEADERS += $(nodist_headers)
ompidir = $(ompiincludedir)/ompi/mca/bcol
else
ompidir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h


@@ -1,16 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_bcol_la_SOURCES += \
base/bcol_base_frame.c \
base/bcol_base_init.c


@@ -1,49 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_BASE_H
#define MCA_BCOL_BASE_H
#include "ompi_config.h"
#include "ompi/mca/mca.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/bcol/bcol.h"
/*
* Global functions for BCOL
*/
BEGIN_C_DECLS
OMPI_DECLSPEC extern opal_list_t mca_bcol_base_components_in_use;
OMPI_DECLSPEC extern char *ompi_bcol_bcols_string;
OMPI_DECLSPEC extern mca_base_framework_t ompi_bcol_base_framework;
OMPI_DECLSPEC int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads);
struct mca_bcol_base_module_t;
OMPI_DECLSPEC int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module);
OMPI_DECLSPEC int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module);
OMPI_DECLSPEC int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module);
OMPI_DECLSPEC int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module,
mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs,
mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs,
mca_bcol_base_module_collective_fn_primitives_t bcol_fn,
mca_bcol_base_module_collective_fn_primitives_t progress_fn);
END_C_DECLS
#endif /* MCA_BCOL_BASE_H */


@@ -1,374 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/include/ompi/constants.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/class/opal_list.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "ompi/mca/bcol/base/static-components.h"
static int mca_bcol_base_open(mca_base_open_flag_t flags);
static int mca_bcol_base_close (void);
static int mca_bcol_base_register(mca_base_register_flag_t flags);
/*
* Global variables
*/
MCA_BASE_FRAMEWORK_DECLARE(ompi, bcol, NULL, mca_bcol_base_register, mca_bcol_base_open, mca_bcol_base_close,
mca_bcol_base_static_components, 0);
OMPI_DECLSPEC opal_list_t mca_bcol_base_components_in_use = {{0}};
OMPI_DECLSPEC char *ompi_bcol_bcols_string = NULL;
OMPI_DECLSPEC int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE] = {{0}};
OMPI_DECLSPEC int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE] = {{0}};
static void bcol_base_module_constructor(mca_bcol_base_module_t *module)
{
int fnc;
module->bcol_component = NULL;
module->network_context = NULL;
module->context_index = -1;
module->supported_mode = 0;
module->init_module = NULL;
module->sbgp_partner_module = NULL;
module->squence_number_offset = 0;
module->n_poll_loops = 0;
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
module->bcol_function_table[fnc] = NULL;
module->small_message_thresholds[fnc] = BCOL_THRESHOLD_UNLIMITED;
}
module->set_small_msg_thresholds = NULL;
module->header_size = 0;
module->bcol_memory_init = NULL;
module->next_inorder = NULL;
mca_bcol_base_fn_table_construct(module);
}
static void bcol_base_module_destructor(mca_bcol_base_module_t *module)
{
int fnc;
module->bcol_component = NULL;
module->context_index = -1;
module->init_module = NULL;
module->sbgp_partner_module = NULL;
module->squence_number_offset = 0;
module->n_poll_loops = 0;
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
module->bcol_function_table[fnc] = NULL;
}
module->bcol_memory_init = NULL;
}
OBJ_CLASS_INSTANCE(mca_bcol_base_module_t,
opal_object_t,
bcol_base_module_constructor,
bcol_base_module_destructor);
static void bcol_base_network_context_constructor(bcol_base_network_context_t *nc)
{
nc->context_id = -1;
nc->context_data = NULL;
}
static void bcol_base_network_context_destructor(bcol_base_network_context_t *nc)
{
nc->context_id = -1;
nc->context_data = NULL;
nc->register_memory_fn = NULL;
nc->deregister_memory_fn = NULL;
}
OBJ_CLASS_INSTANCE(bcol_base_network_context_t,
opal_object_t,
bcol_base_network_context_constructor,
bcol_base_network_context_destructor);
/* get the list of bcol components to use */
static int mca_bcol_base_set_components_to_use(opal_list_t *bcol_components_avail,
opal_list_t *bcol_components_in_use)
{
/* local variables */
const mca_base_component_t *b_component;
mca_base_component_list_item_t *b_cli;
mca_base_component_list_item_t *b_clj;
char **bcols_requested;
const char *b_component_name;
/* split the request for the bcol modules */
bcols_requested = opal_argv_split(ompi_bcol_bcols_string, ',');
if (NULL == bcols_requested) {
return OMPI_ERROR;
}
/* Initialize list */
OBJ_CONSTRUCT(bcol_components_in_use, opal_list_t);
/* figure out basic collective modules to use */
/* loop over list of components requested */
for (int i = 0 ; bcols_requested[i] ; ++i) {
/* loop over discovered components */
OPAL_LIST_FOREACH(b_cli, bcol_components_avail, mca_base_component_list_item_t) {
b_component = b_cli->cli_component;
b_component_name = b_component->mca_component_name;
if (0 == strcmp (b_component_name, bcols_requested[i])) {
/* found selected component */
b_clj = OBJ_NEW(mca_base_component_list_item_t);
if (NULL == b_clj) {
opal_argv_free (bcols_requested);
return OPAL_ERR_OUT_OF_RESOURCE;
}
b_clj->cli_component = b_component;
opal_list_append(bcol_components_in_use,
(opal_list_item_t *) b_clj);
break;
} /* end check for bcol component */
}
}
/* Note: need to add error checking to make sure all requested functions
* were found */
/* release resources */
opal_argv_free (bcols_requested);
return OMPI_SUCCESS;
}
static int mca_bcol_base_register(mca_base_register_flag_t flags)
{
/* figure out which bcol and sbgp components will actually be used */
/* get list of sub-grouping functions to use */
ompi_bcol_bcols_string = "basesmuma,basesmuma,iboffload,ptpcoll,ugni";
(void) mca_base_var_register("ompi", "bcol", "base", "string",
"Default set of basic collective components to use",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_bcol_bcols_string);
return OMPI_SUCCESS;
}
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
*/
static int mca_bcol_base_open(mca_base_open_flag_t flags)
{
int ret;
/* Open up all available components */
if (OMPI_SUCCESS !=
(ret = mca_base_framework_components_open(&ompi_bcol_base_framework, flags))) {
return ret;
}
ret = mca_bcol_base_set_components_to_use(&ompi_bcol_base_framework.framework_components,
&mca_bcol_base_components_in_use);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* memory registration compatibilities */
bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_UMA]=1;
bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_SOCKET]=1;
bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_POINT_TO_POINT]=1;
bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_IB_OFFLOAD]=1;
bcol_mpool_compatibility[BCOL_SHARED_MEMORY_SOCKET][BCOL_SHARED_MEMORY_UMA]=1;
bcol_mpool_compatibility[BCOL_POINT_TO_POINT] [BCOL_SHARED_MEMORY_UMA]=1;
bcol_mpool_compatibility[BCOL_IB_OFFLOAD] [BCOL_SHARED_MEMORY_UMA]=1;
return OMPI_SUCCESS;
}
static int mca_bcol_base_close (void)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (&mca_bcol_base_components_in_use))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mca_bcol_base_components_in_use);
return mca_base_framework_components_close(&ompi_bcol_base_framework, NULL);
}
/*
* Prototype implementation of selection logic
*/
int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module){
int bcol_fn;
/* Call all init functions */
/* Create a function table */
for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){
/* Create a list object for each bcol type list */
OBJ_CONSTRUCT(&(bcol_module->bcol_fns_table[bcol_fn]), opal_list_t);
}
return OMPI_SUCCESS;
}
int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module){
int bcol_fn;
for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){
/* gvm FIX: Go through the list and destroy each item */
/* Destroy the function table object for each bcol type list */
OBJ_DESTRUCT(&(bcol_module->bcol_fns_table[bcol_fn]));
}
return OMPI_SUCCESS;
}
int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module,
mca_bcol_base_coll_fn_comm_attributes_t *arg_comm_attribs,
mca_bcol_base_coll_fn_invoke_attributes_t *arg_inv_attribs,
mca_bcol_base_module_collective_fn_primitives_t bcol_fn,
mca_bcol_base_module_collective_fn_primitives_t progress_fn
)
{
mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs = NULL;
mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL;
struct mca_bcol_base_coll_fn_desc_t *fn_filtered = NULL;
int coll_type;
comm_attribs = malloc(sizeof(mca_bcol_base_coll_fn_comm_attributes_t));
if (NULL == comm_attribs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
inv_attribs = malloc(sizeof(mca_bcol_base_coll_fn_invoke_attributes_t));
if (NULL == inv_attribs) {
free(comm_attribs);
return OMPI_ERR_OUT_OF_RESOURCE;
}
coll_type = comm_attribs->bcoll_type = arg_comm_attribs->bcoll_type;
comm_attribs->comm_size_min = arg_comm_attribs->comm_size_min;
comm_attribs->comm_size_max = arg_comm_attribs->comm_size_max;
comm_attribs->data_src = arg_comm_attribs->data_src;
comm_attribs->waiting_semantics = arg_comm_attribs->waiting_semantics;
inv_attribs->bcol_msg_min = arg_inv_attribs->bcol_msg_min;
inv_attribs->bcol_msg_max = arg_inv_attribs->bcol_msg_max ;
inv_attribs->datatype_bitmap = arg_inv_attribs->datatype_bitmap ;
inv_attribs->op_types_bitmap = arg_inv_attribs->op_types_bitmap;
fn_filtered = OBJ_NEW(mca_bcol_base_coll_fn_desc_t);
fn_filtered->coll_fn = bcol_fn;
fn_filtered->progress_fn = progress_fn;
fn_filtered->comm_attr = comm_attribs;
fn_filtered->inv_attr = inv_attribs;
opal_list_append(&(bcol_module->bcol_fns_table[coll_type]),(opal_list_item_t*)fn_filtered);
return OMPI_SUCCESS;
}
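The descriptors appended by mca_bcol_base_set_attributes above are later filtered by the caller (the ml collectives layer) against communicator size and message length. A hedged sketch of what such a lookup could look like, using only the fields populated in this file; lookup_coll_fn is a hypothetical helper, not part of the framework:
/* hypothetical helper (illustration only): return the first registered
 * descriptor whose communicator-size and message-size ranges match */
static mca_bcol_base_coll_fn_desc_t *
lookup_coll_fn(struct mca_bcol_base_module_t *bcol_module, int coll_type,
               int comm_size, size_t msg_size)
{
    mca_bcol_base_coll_fn_desc_t *fn;

    OPAL_LIST_FOREACH(fn, &(bcol_module->bcol_fns_table[coll_type]),
                      mca_bcol_base_coll_fn_desc_t) {
        if (comm_size >= fn->comm_attr->comm_size_min &&
            comm_size <= fn->comm_attr->comm_size_max &&
            msg_size  >= (size_t) fn->inv_attr->bcol_msg_min &&
            msg_size  <= (size_t) fn->inv_attr->bcol_msg_max) {
            return fn;   /* fn->coll_fn / fn->progress_fn are the entry points */
        }
    }
    return NULL;
}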
int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module){
int ret, bcol_init_fn;
for (bcol_init_fn =0; bcol_init_fn < BCOL_NUM_OF_FUNCTIONS; bcol_init_fn++) {
if (NULL != bcol_module->bcol_function_init_table[bcol_init_fn]) {
ret = (bcol_module->bcol_function_init_table[bcol_init_fn]) (bcol_module);
if (OMPI_SUCCESS != ret) {
return OMPI_ERROR;
}
}
}
return OMPI_SUCCESS;
}
static void mca_bcol_base_coll_fn_desc_constructor(mca_bcol_base_coll_fn_desc_t *fn)
{
fn->comm_attr = NULL;
fn->inv_attr = NULL;
}
static void mca_bcol_base_coll_fn_desc_destructor(mca_bcol_base_coll_fn_desc_t *fn)
{
if (fn->comm_attr) {
free(fn->comm_attr);
}
if (fn->inv_attr) {
free(fn->inv_attr);
}
}
OBJ_CLASS_INSTANCE(mca_bcol_base_coll_fn_desc_t,
opal_list_item_t,
mca_bcol_base_coll_fn_desc_constructor,
mca_bcol_base_coll_fn_desc_destructor);
static void lmngr_block_constructor(mca_bcol_base_lmngr_block_t *item)
{
item->base_addr = NULL;
}
static void lnmgr_block_destructor(mca_bcol_base_lmngr_block_t *item)
{
/* I have nothing to do here */
}
OBJ_CLASS_INSTANCE(mca_bcol_base_lmngr_block_t,
opal_list_item_t,
lmngr_block_constructor,
lnmgr_block_destructor);


@@ -1,45 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/include/ompi/constants.h"
int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads)
{
mca_bcol_base_component_t *bcol_component;
mca_base_component_list_item_t *cli;
int ret;
OPAL_LIST_FOREACH(cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
bcol_component = (mca_bcol_base_component_t *) cli->cli_component;
if (false == bcol_component->init_done) {
ret = bcol_component->collm_init_query(true, true);
if (OMPI_SUCCESS != ret) {
return ret;
}
bcol_component->init_done = true;
}
}
return OMPI_SUCCESS;
}


@@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained


@@ -1,66 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
bcol_basesmuma.h \
bcol_basesmuma_utils.h \
bcol_basesmuma_bcast.c \
bcol_basesmuma_component.c \
bcol_basesmuma_module.c \
bcol_basesmuma_buf_mgmt.c \
bcol_basesmuma_mem_mgmt.c \
bcol_basesmuma_fanin.c \
bcol_basesmuma_fanout.c \
bcol_basesmuma_progress.c \
bcol_basesmuma_reduce.h \
bcol_basesmuma_reduce.c \
bcol_basesmuma_allreduce.c \
bcol_basesmuma_setup.c \
bcol_basesmuma_rd_barrier.c \
bcol_basesmuma_rd_nb_barrier.c \
bcol_basesmuma_rk_barrier.c \
bcol_basesmuma_utils.c \
bcol_basesmuma_bcast_prime.c \
bcol_basesmuma_lmsg_knomial_bcast.c \
bcol_basesmuma_lmsg_bcast.c \
bcol_basesmuma_gather.c \
bcol_basesmuma_allgather.c \
bcol_basesmuma_smcm.h \
bcol_basesmuma_smcm.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_basesmuma_DSO
component_install += mca_bcol_basesmuma.la
else
component_noinst += libmca_bcol_basesmuma.la
endif
# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.
AM_CPPFLAGS = $(btl_portals_CPPFLAGS)
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_basesmuma_la_SOURCES = $(sources)
mca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS)
mca_bcol_basesmuma_la_LIBADD = \
$(btl_portals_LIBS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_basesmuma_la_SOURCES =$(sources)
libmca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS)

The diff for this file is not shown because it is too large.


@@ -1,352 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
/*
#define IS_AGDATA_READY(peer, my_flag, my_sequence_number)\
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[ALLGATHER_FLAG][bcol_id] >= (my_flag) \
)? true : false )
*/
#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \
do{ \
for( j = 0; j < (tree_order - 1); j++){ \
if( 0 > peers[j] ) { \
/* set the bit */ \
*active_requests ^= (1<<j); \
} \
} \
}while(0)
/*
* Recursive K-ing allgather
*/
/*
* Recursive k-ing algorithm
* Example: k = 3, n = 9
*
* Number of exchange steps = log_k(n)
* Number of peers exchanged with in each step = k - 1 (the radix is k)
*
* (a standalone worked example for k = 3, n = 9 follows the init function below)
*/
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
int bcol_id = (int) bcol_module->super.bcol_id;
uint32_t buffer_index = input_args->buffer_index;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
int leading_dim, buff_idx, idx;
int64_t sequence_number = input_args->sequence_num;
int my_rank = bcol_module->super.sbgp_partner_module->my_index;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile int8_t ready_flag;
/* initialize the iteration counter */
buff_idx = input_args->src_desc->buffer_index;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* initialize headers and ready flag */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/* initialize these */
*iteration = -1;
*active_requests = 0;
*status = ready_flag;
if (EXTRA_NODE == exchange_node->node_type) {
/* I am ready at this level */
opal_atomic_wmb ();
my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
}
return bcol_basesmuma_k_nomial_allgather_progress (input_args, const_args);
}
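To make the step counts in the algorithm comment above concrete: with radix k the exchange runs for ceil(log_k(n)) rounds, and each rank has at most k - 1 outstanding peers per round, which is why the progress function below builds its max_requests mask as 2^(k-1) - 1. A minimal standalone sketch, separate from the removed sources, reproducing the k = 3, n = 9 example:
#include <stdio.h>

int main(void)
{
    int k = 3, n = 9;               /* the example from the comment above */
    int rounds = 0, span = 1;
    int max_requests = 0;

    while (span < n) {              /* ceil(log_k(n)) rounds */
        span *= k;
        ++rounds;
    }
    for (int i = 0; i < k - 1; ++i) {
        max_requests ^= (1 << i);   /* same bit trick as the progress code */
    }
    printf("k=%d n=%d -> %d rounds, max_requests mask=0x%x\n",
           k, n, rounds, max_requests);
    return 0;   /* prints: k=3 n=9 -> 2 rounds, max_requests mask=0x3 */
}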
/* allgather progress function */
int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
int8_t flag_offset;
uint32_t buffer_index = input_args->buffer_index;
volatile int8_t ready_flag;
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
int group_size = bcol_module->colls_no_user_data.size_of_group;
int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
int bcol_id = (int) bcol_module->super.bcol_id;
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
int leading_dim, idx, buff_idx;
int i, j, probe;
int knt;
int src;
int recv_offset, recv_len;
int max_requests = 0; /* critical to set this */
int pow_k, tree_order;
int64_t sequence_number=input_args->sequence_num;
int my_rank = bcol_module->super.sbgp_partner_module->my_index;
int pack_len = input_args->count * input_args->dtype->super.size;
void *data_addr = (void*)(
(unsigned char *) input_args->sbuf +
(size_t) input_args->sbuf_offset);
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char *peer_data_pointer;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0
fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n", my_rank,
*active_requests, *iteration, *status);
#endif
buff_idx = input_args->src_desc->buffer_index;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* increment the starting flag by one and return */
/* flag offset seems unnecessary here */
flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
ready_flag = *status;
my_ctl_pointer->sequence_number = sequence_number;
/* k-nomial parameters */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/* calculate the maximum number of requests
* at each level each rank communicates with
* at most (k - 1) peers
* so if we set k - 1 bit fields in "max_requests", then
* we have max_request == 2^(k - 1) -1
*/
for(i = 0; i < (tree_order - 1); i++){
max_requests ^= (1<<i);
}
/* let's begin the collective, starting with extra ranks and their
* respective proxies
*/
if (OPAL_UNLIKELY(-1 == *iteration)) {
if (EXTRA_NODE == exchange_node->node_type) {
/* If I'm in here, then I must be looking for data */
ready_flag = flag_offset + 1 + pow_k + 2;
src = exchange_node->rank_extra_sources_array[0];
peer_data_pointer = data_buffs[src].payload;
peer_ctl_pointer = data_buffs[src].ctl_struct;
/* calculate the count */
for (i = 0, knt = 0 ; i < group_size ; ++i){
knt += list_connected[i];
}
for (i = 0 ; i < cm->num_to_probe ; ++i) {
if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
/* we receive the entire message */
opal_atomic_mb ();
memcpy (data_addr, (void *) peer_data_pointer, knt * pack_len);
goto FINISHED;
}
}
/* haven't found it, state is saved, bail out */
return BCOL_FN_STARTED;
} else if (0 < exchange_node->n_extra_sources) {
/* I am a proxy for someone */
src = exchange_node->rank_extra_sources_array[0];
peer_data_pointer = data_buffs[src].payload;
peer_ctl_pointer = data_buffs[src].ctl_struct;
/* calculate the offset */
for (i = 0, knt = 0 ; i < src ; ++i){
knt += list_connected[i];
}
/* probe for extra rank's arrival */
for (i = 0 ; i < cm->num_to_probe ; ++i) {
if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
opal_atomic_mb ();
/* copy it in */
memcpy ((void *) ((uintptr_t) data_addr + knt * pack_len),
(void *) ((uintptr_t) peer_data_pointer + knt * pack_len),
pack_len * list_connected[src]);
break;
}
}
if (i == cm->num_to_probe) {
return BCOL_FN_STARTED;
}
}
/* bump the ready flag to indicate extra node exchange complete */
++ready_flag;
*iteration = 0;
}
/* start the recursive k - ing phase */
for (i = *iteration ; i < pow_k ; ++i) {
/* I am ready at this level */
opal_atomic_wmb ();
my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
if (0 == *active_requests) {
/* flip some bits, if we don't have active requests from a previous visit */
CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[i],tree_order);
}
for (j = 0; j < (tree_order - 1); ++j) {
/* recv phase */
src = exchange_node->rank_exchanges[i][j];
if (src < 0) {
/* then not a valid rank, continue */
continue;
}
if (!(*active_requests&(1<<j))) {
/* then this peer hasn't been processed at this level */
peer_data_pointer = data_buffs[src].payload;
peer_ctl_pointer = data_buffs[src].ctl_struct;
recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
/* I am putting the probe loop as the innermost loop to achieve
* better temporal locality
*/
for (probe = 0 ; probe < cm->num_to_probe ; ++probe) {
if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
/* flip the request's bit */
*active_requests ^= (1<<j);
/* copy the data */
memcpy((void *)((unsigned char *) data_addr + recv_offset),
(void *)((unsigned char *) peer_data_pointer + recv_offset),
recv_len);
break;
}
}
}
}
if( max_requests == *active_requests ){
/* bump the ready flag */
ready_flag++;
/* reset the active requests for the next level */
*active_requests = 0;
/* calculate the number of active requests
* logically makes sense to do it here. We don't
* want to inadvertently flip a bit to zero that we
* set previously
*/
} else {
/* state is saved, hop out
*/
*status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
*iteration = i;
return BCOL_FN_STARTED;
}
}
/* bump the flag one more time for the extra rank */
ready_flag = flag_offset + 1 + pow_k + 2;
/* finish off the last piece, send the data back to the extra */
if( 0 < exchange_node->n_extra_sources ) {
/* simply announce my arrival */
opal_atomic_wmb ();
my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
}
FINISHED:
/* bump this up for others to see */
my_ctl_pointer->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
/* Register allgather functions to the BCOL function table,
* so they can be selected
*/
int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_ALLGATHER;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_k_nomial_allgather_init,
bcol_basesmuma_k_nomial_allgather_progress);
return OMPI_SUCCESS;
}


@@ -1,611 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "opal/include/opal_stdint.h"
#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"
static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args);
int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_ALLREDUCE;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1048576;
comm_attribs.data_src = DATA_SRC_KNOWN;
/* selection logic at the ml level specifies a
* request for a non-blocking algorithm
* however, these algorithms are blocking
* following what was done at the p2p level
* we will specify non-blocking, but beware,
* these algorithms are blocking and will not make use
* of the progress engine
*/
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000;
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
/* Set attributes for fanin fanout algorithm */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_allreduce_intra_fanin_fanout,
bcol_basesmuma_allreduce_intra_fanin_fanout_progress);
inv_attribs.bcol_msg_min = 20000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_allreduce_intra_fanin_fanout,
bcol_basesmuma_allreduce_intra_fanin_fanout_progress);
/* Differs only in comm size */
comm_attribs.data_src = DATA_SRC_UNKNOWN;
comm_attribs.waiting_semantics = BLOCKING;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 8;
/* Set attributes for recursive doubling algorithm */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_allreduce_intra_recursive_doubling,
NULL);
return OMPI_SUCCESS;
}
/*
* Small data fanin reduce
* ML buffers are used for both payload and control structures
* This functions works with hierarchical allreduce and
* progress engine
*/
static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node,
int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype,
volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift)
{
volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
int bcol_id = (int) bcol_module->super.bcol_id;
int64_t sequence_number = my_ctl_pointer->sequence_number;
int8_t ready_flag = my_ctl_pointer->ready_flag;
int group_size = bcol_module->colls_no_user_data.size_of_group;
if (LEAF_NODE != my_reduction_node->my_node_type) {
volatile char *child_data_pointer;
volatile void *child_rbuf;
/* for each child */
/* my_result_data = child_result_data (op) my_source_data */
for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) {
int child_rank = my_reduction_node->children_ranks[child] + process_shift;
if (group_size <= child_rank){
child_rank -= group_size;
}
child_ctl_pointer = data_buffs[child_rank].ctl_struct;
if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
*iteration = child;
return BCOL_FN_STARTED;
}
child_data_pointer = data_buffs[child_rank].payload;
child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id];
ompi_op_reduce(op, (void *)child_rbuf, (void *)rbuf, count, dtype);
} /* end child loop */
}
if (ROOT_NODE != my_reduction_node->my_node_type) {
opal_atomic_wmb ();
my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
}
/* done with this step. move on to fan out */
*iteration = -1;
return BCOL_FN_COMPLETE;
}
static int allreduce_fanout (mca_bcol_basesmuma_module_t *bcol_module, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer,
volatile void *my_data_pointer, int process_shift, volatile mca_bcol_basesmuma_payload_t *data_buffs,
int sequence_number, int group_size, int rbuf_offset, size_t pack_len)
{
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
int bcol_id = (int) bcol_module->super.bcol_id;
int8_t ready_flag = my_ctl_pointer->ready_flag + 1;
netpatterns_tree_node_t *my_fanout_read_tree;
volatile void *parent_data_pointer;
int my_fanout_parent, my_rank;
void *parent_rbuf, *rbuf;
my_rank = bcol_module->super.sbgp_partner_module->my_index;
my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_rank]);
if (ROOT_NODE != my_fanout_read_tree->my_node_type) {
my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
if (group_size <= my_fanout_parent) {
my_fanout_parent -= group_size;
}
rbuf = (void *)((char *) my_data_pointer + rbuf_offset);
/*
* Get parent payload data and control data.
* Get the pointer to the base address of the parent's payload buffer.
* Get the parent's control buffer.
*/
parent_data_pointer = data_buffs[my_fanout_parent].payload;
parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;
parent_rbuf = (void *) ((char *) parent_data_pointer + rbuf_offset);
/* Wait until parent signals that data is ready */
/* The order of conditions checked in this loop is important, as it can
* result in a race condition.
*/
if (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
return BCOL_FN_STARTED;
}
assert (parent_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] == ready_flag);
/* Copy the data from the parent's shared buffer into my buffer */
memcpy ((void *) rbuf, (const void*) parent_rbuf, pack_len);
}
if (LEAF_NODE != my_fanout_read_tree->my_node_type) {
opal_atomic_wmb ();
/* Signal to children that they may read the data from my shared buffer (bump the ready flag) */
my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
}
my_ctl_pointer->starting_flag_value[bcol_id] += 1;
return BCOL_FN_COMPLETE;
}
static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
int buff_idx = input_args->src_desc->buffer_index;
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
void *data_addr = (void *) input_args->src_desc->data_addr;
int my_node_index, my_rank, group_size, leading_dim, idx;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
int64_t sequence_number = input_args->sequence_num;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
struct ompi_datatype_t *dtype = input_args->dtype;
netpatterns_tree_node_t *my_reduction_node;
struct ompi_op_t *op = input_args->op;
volatile void *my_data_pointer;
int count = input_args->count;
int rc, process_shift;
ptrdiff_t lb, extent;
volatile void *rbuf;
/* get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
/* Align node index to around sbgp root */
process_shift = input_args->root;
my_node_index = my_rank - input_args->root;
if (0 > my_node_index ) {
my_node_index += group_size;
}
data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
/* Get control structure and payload buffer */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_data_pointer = (volatile char *) data_addr;
rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);
/***************************
* Fan into root phase
***************************/
my_reduction_node = &(bcol_module->reduction_tree[my_node_index]);
if (-1 != *iteration) {
rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer,
dtype, data_buffs, count, op, process_shift);
if (BCOL_FN_COMPLETE != rc) {
return rc;
}
}
/* there might be non-contig dtype - so compute the length with get_extent */
ompi_datatype_get_extent(dtype, &lb, &extent);
/***************************
* Fan out from root
***************************/
/* all nodes will have the result after fanout */
input_args->result_in_rbuf = true;
/* Signal that you are ready for fanout phase */
return allreduce_fanout (bcol_module, my_ctl_pointer, my_data_pointer, process_shift, data_buffs,
sequence_number, group_size, input_args->rbuf_offset, count * (size_t) extent);
}
/**
* Shared memory blocking allreduce.
*/
int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
/* local variables */
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
int buff_idx = input_args->src_desc->buffer_index;
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
void *data_addr = (void *) input_args->src_desc->data_addr;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
struct ompi_datatype_t *dtype = input_args->dtype;
int bcol_id = (int) bcol_module->super.bcol_id;
int rc, my_rank, leading_dim, idx;
volatile void *my_data_pointer;
volatile void *sbuf, *rbuf;
int8_t ready_flag;
/* get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
/* Get control structure */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_data_pointer = (volatile char *) data_addr;
rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);
sbuf = (volatile void *)((char *) my_data_pointer + input_args->sbuf_offset);
/* Setup resource recycling */
/* Set for multiple instances of bcols */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, input_args->sequence_num, bcol_id);
if (sbuf != rbuf) {
rc = ompi_datatype_copy_content_same_ddt (dtype, input_args->count, (char *)rbuf,
(char *)sbuf);
if( 0 != rc ) {
return OMPI_ERROR;
}
}
*iteration = 0;
my_ctl_pointer->ready_flag = ready_flag;
return bcol_basesmuma_allreduce_intra_fanin_fanout_progress (input_args, c_input_args);
}
/* this thing uses the old bcol private control structures */
int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
int my_rank,group_size,my_node_index;
int pair_rank, exchange, extra_rank, payload_len;
size_t dt_size;
int read_offset, write_offset;
volatile void *my_data_pointer;
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL,
*partner_ctl_pointer = NULL,
*extra_ctl_pointer = NULL;
volatile void *my_read_pointer, *my_write_pointer, *partner_read_pointer,
*extra_rank_readwrite_data_pointer,*extra_rank_read_data_pointer;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
int8_t ready_flag;
int sbuf_offset,rbuf_offset,flag_offset;
int root,count;
struct ompi_op_t *op;
int64_t sequence_number=input_args->sequence_num;
struct ompi_datatype_t *dtype;
int first_instance = 0;
int leading_dim,idx;
int buff_idx;
mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
/*volatile void **data_buffs;*/
volatile mca_bcol_basesmuma_payload_t *data_buffs;
netpatterns_pair_exchange_node_t *my_exchange_node;
/*
* Get addressing information
*/
buff_idx = input_args->src_desc->buffer_index;
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
/*
* Get SM control structures and payload buffers
*/
ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
bcol_module->colls_with_user_data.ctl_buffs+idx;
/*data_buffs = (volatile void **)
bcol_module->colls_with_user_data.data_buffs+idx;*/
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
/*
* Get control structure and payload buffer
*/
my_ctl_pointer = ctl_structs[my_rank];
if (my_ctl_pointer->sequence_number < sequence_number) {
first_instance=1;
}
my_data_pointer = data_buffs[my_rank].payload;
/*
* Align node index to around sbgp root
*/
root = input_args->root;
my_node_index = my_rank - root;
if (0 > my_node_index) {
my_node_index += group_size;
}
/*
* Get data from arguments
*/
sbuf_offset = input_args->sbuf_offset;
rbuf_offset = input_args->rbuf_offset;
op = input_args->op;
count = input_args->count;
dtype = input_args->dtype;
/*
* Get my node for the reduction tree
*/
my_exchange_node = &(bcol_module->recursive_doubling_tree);
if (first_instance) {
my_ctl_pointer->index = 1;
my_ctl_pointer->starting_flag_value = 0;
flag_offset = 0;
my_ctl_pointer->flag = -1;
/*
for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
my_ctl_pointer->flags[ALLREDUCE_FLAG] = -1;
}
*/
} else {
my_ctl_pointer->index++;
flag_offset = my_ctl_pointer->starting_flag_value;
}
/* signal that I have arrived */
/* opal_atomic_wmb (); */
my_ctl_pointer->sequence_number = sequence_number;
/* If this buffer is used more than once by an sm module in
* a given collective, we will need to distinguish between instances so
* that we pick up the right data.
*/
ready_flag = flag_offset + sequence_number + 1;
/*
* Set up pointers for using during recursive doubling phase
*/
read_offset = sbuf_offset;
write_offset = rbuf_offset;
fprintf(stderr,"read offset %d write offset %d\n",read_offset,write_offset);
my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);
/*
* When the group size is not a power of 2, the extra nodes' data is copied and
* reduced by partner exchange nodes.
* Extra nodes: nodes with rank at or above the nearest power of 2
* Exchange nodes: nodes with rank below the nearest power of 2 that
* partner with extra nodes during the reduction
* (a small standalone example classifying ranks for n = 6 follows this function)
*/
if (0 < my_exchange_node->n_extra_sources) {
/*
* Signal extra node that data is ready
*/
opal_atomic_wmb ();
my_ctl_pointer->flag = ready_flag;
if (EXCHANGE_NODE == my_exchange_node->node_type) {
extra_rank = my_exchange_node->rank_extra_source;
extra_ctl_pointer = ctl_structs[extra_rank];
extra_rank_readwrite_data_pointer = (void *) ((char *) data_buffs[extra_rank].payload +
read_offset);
/*
* Wait for data to get ready
*/
while (!((sequence_number == extra_ctl_pointer->sequence_number) &&
(extra_ctl_pointer->flag >= ready_flag))){
}
ompi_op_reduce(op,(void *)extra_rank_readwrite_data_pointer,
(void *)my_read_pointer, count, dtype);
}
}
/* -- Exchange node that reduces with an extra node --: signal to the extra node that its data has been read
* -- Exchange node that doesn't reduce data with an extra node --: this assignment
* is used so it can sync with other nodes during the exchange phase
* -- Extra node --: it can pass to the next phase
*/
ready_flag++;
/*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
my_ctl_pointer->flag = ready_flag;
/*
* Exchange data with all the nodes that are less than max_power_2
*/
for (exchange=0 ; exchange < my_exchange_node->n_exchanges ; exchange++) {
int tmp=0;
/*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
my_ctl_pointer->flag = ready_flag;
pair_rank=my_exchange_node->rank_exchanges[exchange];
partner_ctl_pointer = ctl_structs[pair_rank];
partner_read_pointer = (volatile void *) ((char *)data_buffs[pair_rank].payload + read_offset);
my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);
/*
* Wait for partner to be ready, so we can read
*/
/*
JSL ---- FIX ME !!!!! MAKE ME COMPLIANT WITH NEW BUFFERS
while (!IS_ALLREDUCE_PEER_READY(partner_ctl_pointer,
ready_flag, sequence_number)) {
}
*/
/*
* Perform reduction operation
*/
ompi_3buff_op_reduce(op,(void *)my_read_pointer, (void *)partner_read_pointer,
(void *)my_write_pointer, count, dtype);
/*
* Signal that I am done reading my partner's data
*/
ready_flag++;
/*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
my_ctl_pointer->flag = ready_flag;
while (ready_flag > partner_ctl_pointer->flag){
opal_progress();
}
/*
* Swap read and write offsets
*/
tmp = read_offset;
read_offset = write_offset;
write_offset = tmp;
}
/*
* Copy data in from the "extra" source, if need be
*/
if (0 < my_exchange_node->n_extra_sources) {
if (EXTRA_NODE == my_exchange_node->node_type) {
int extra_rank_read_offset=-1,my_write_offset=-1;
/* Offset the ready flag to sync with
* the exchange node, which might be going through exchange phases
* unlike the extra node
*/
ready_flag = ready_flag + my_exchange_node->log_2;
if (my_exchange_node->log_2%2) {
extra_rank_read_offset = rbuf_offset;
my_write_offset = rbuf_offset;
} else {
extra_rank_read_offset = sbuf_offset;
my_write_offset = sbuf_offset;
}
my_write_pointer = (volatile void*)((char *)my_data_pointer + my_write_offset);
extra_rank = my_exchange_node->rank_extra_source;
extra_ctl_pointer = ctl_structs[extra_rank];
extra_rank_read_data_pointer = (volatile void *) ((char *)data_buffs[extra_rank].payload +
extra_rank_read_offset);
/*
* Wait for the exchange node to be ready
*/
ompi_datatype_type_size(dtype, &dt_size);
payload_len = count*dt_size;
#if 0
fix me JSL !!!!!
while (!IS_DATA_READY(extra_ctl_pointer, ready_flag, sequence_number)){
}
#endif
memcpy((void *)my_write_pointer,(const void *)
extra_rank_read_data_pointer, payload_len);
ready_flag++;
/*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
my_ctl_pointer->flag = ready_flag;
} else {
/*
* Signal parent that data is ready
*/
opal_atomic_wmb ();
/*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
my_ctl_pointer->flag = ready_flag;
/* wait until child is done to move on - this buffer will
* be reused for the next stripe, so don't want to move
* on too quick.
*/
extra_rank = my_exchange_node->rank_extra_source;
extra_ctl_pointer = ctl_structs[extra_rank];
}
}
input_args->result_in_rbuf = my_exchange_node->log_2 & 1;
my_ctl_pointer->starting_flag_value += 1;
return BCOL_FN_COMPLETE;
}
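The extra/exchange classification described in the comment inside the function above can be illustrated with a small standalone sketch, separate from the removed sources, for a group of n = 6 ranks (the largest power of two not exceeding n is 4, so ranks 4 and 5 are extra); the exact pairing used by the netpatterns code may differ in detail:
#include <stdio.h>

int main(void)
{
    int n = 6, pow2 = 1;

    while (pow2 * 2 <= n) {
        pow2 *= 2;                  /* pow2 = 4 for n = 6 */
    }
    for (int r = 0; r < n; ++r) {
        if (r >= pow2) {
            printf("rank %d: EXTRA, proxied by rank %d\n", r, r - pow2);
        } else if (r < n - pow2) {
            printf("rank %d: EXCHANGE, reduces data from extra rank %d\n", r, r + pow2);
        } else {
            printf("rank %d: EXCHANGE, no extra partner\n", r);
        }
    }
    return 0;
}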


@@ -1,487 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"
#define __TEST_BLOCKING__ 1
#define __TEST_WAIT__ 0
#define __TEST_TEST__ 0
/* debug
* #include "opal/sys/timer.h"
*
* extern uint64_t timers[7];
* end debug */
/* debug */
/* end debug */
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_BCAST;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1048576;
comm_attribs.data_src = DATA_SRC_KNOWN;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_bcast_k_nomial_knownroot,
bcol_basesmuma_bcast_k_nomial_knownroot);
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_bcast_k_nomial_knownroot,
bcol_basesmuma_bcast_k_nomial_knownroot);
comm_attribs.data_src = DATA_SRC_UNKNOWN;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_bcast_k_nomial_anyroot,
bcol_basesmuma_bcast_k_nomial_anyroot);
comm_attribs.data_src = DATA_SRC_UNKNOWN;
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
#ifdef __PORTALS_AVAIL__
comm_attribs.waiting_semantics = BLOCKING;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_lmsg_scatter_allgather_portals_bcast,
bcol_basesmuma_lmsg_scatter_allgather_portals_bcast);
comm_attribs.waiting_semantics = NON_BLOCKING;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast,
bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast);
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast,
bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast);
#else
/*
if (super->use_hdl) {
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_hdl_zerocopy_bcast,
bcol_basesmuma_hdl_zerocopy_bcast);
} else { */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL);
/*
bcol_basesmuma_binary_scatter_allgather_segment,
bcol_basesmuma_binary_scatter_allgather_segment);
*/
/* } */
#endif
return OMPI_SUCCESS;
}
/* includes shared memory optimization */
/**
* Shared memory blocking broadcast - fanout, for small data buffers.
* This routine assumes that buf (the input buffer) is a single-writer,
* multi-reader (SWMR) shared memory buffer owned by the calling rank,
* which is the only rank that can write to this buffer.
* It is also assumed that the buffers are registered and fragmented
* at the ML level and that buf is sufficiently large to hold the data.
*
*
* @param buf - SWMR shared buffer within a sbgp that the
* executing rank can write to.
* @param count - the number of elements in the shared buffer.
* @param dtype - the datatype of a shared buffer element.
* @param root - the index within the sbgp of the root.
* @param module - basesmuma module.
*/
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int group_size, process_shift, my_node_index;
int my_rank;
int rc = OMPI_SUCCESS;
int my_fanout_parent;
int leading_dim, buff_idx, idx;
volatile int8_t ready_flag;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int root=input_args->root;
int64_t sequence_number=input_args->sequence_num;
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char* parent_data_pointer;
mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
netpatterns_tree_node_t* my_fanout_read_tree;
size_t pack_len = 0, dt_size;
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr );
#if 0
fprintf(stderr,"Entering sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
fflush(stderr);
#endif
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
pack_len=count*dt_size;
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Align node index to around sbgp root */
process_shift = root;
my_node_index = my_rank - root;
if(0 > my_node_index ) {
my_node_index += group_size;
}
/* get my node for the bcast tree */
my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
if(group_size <= my_fanout_parent){
my_fanout_parent -= group_size;
}
/* Set pointer to current proc ctrl region */
/*my_ctl_pointer = ctl_structs[my_rank]; */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* setup resource recycling */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/*
* Fan out from root
*/
if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
/* Root should only signal it is ready */
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
}else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
/*
* Get parent payload data and control data.
* Get the pointer to the base address of the parent's payload buffer.
* Get the parent's control buffer.
*/
parent_data_pointer = data_buffs[my_fanout_parent].payload;
parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;
/* Wait until parent signals that data is ready */
/* The order of conditions checked in this loop is important, as it can
* result in a race condition.
*/
while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){
opal_progress();
}
/* Copy the data from the parent's shared buffer into my buffer */
memcpy(data_addr, (void *)parent_data_pointer, pack_len);
if( 0 != rc ) {
return OMPI_ERROR;
}
}else{
input_args->result_in_rbuf = false;
/* Interior node */
/* Get parent payload data and control data */
parent_data_pointer = data_buffs[my_fanout_parent].payload;
parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;
/* Wait until parent signals that data is ready */
/* The order of conditions checked in this loop is important, as it can
* result in a race condition.
*/
while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){
opal_progress();
}
/* Copy the data from the parent's shared buffer into my buffer */
memcpy(data_addr, (void *)parent_data_pointer,pack_len);
/* Signal to children that they may read the data from my shared buffer */
opal_atomic_wmb ();
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
}
/* if I am the last instance of a basesmuma function in this collective,
* release the resources */
my_ctl_pointer->starting_flag_value[bcol_id]++;
return rc;
}
/* zero-copy large message communication methods */
#if 0
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int group_size, process_shift, my_node_index;
int my_rank, first_instance=0, flag_offset;
int rc = OMPI_SUCCESS;
int my_fanout_parent;
int leading_dim, buff_idx, idx;
volatile int64_t ready_flag;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int root=input_args->root;
int64_t sequence_number=input_args->sequence_num;
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
netpatterns_tree_node_t* my_fanout_read_tree;
size_t pack_len = 0, dt_size;
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr);
struct mca_hdl_base_descriptor_t *hdl_desc;
struct mca_hdl_base_segment_t *hdl_seg;
int ret, completed, ridx/*remote rank index*/;
bool status;
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer= NULL;
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer= NULL;
volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer= NULL;
struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0];
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
pack_len = count * dt_size;
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
bcol_module->colls_with_user_data.ctl_buffs+idx;
my_ctl_pointer = ctl_structs[my_rank];
/* Align node index to around sbgp root */
process_shift = root;
my_node_index = my_rank - root;
if(0 > my_node_index ) {
my_node_index += group_size;
}
/* get my node for the bcast tree */
my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
if(group_size <= my_fanout_parent){
my_fanout_parent -= group_size;
}
/* setup resource recycling */
if( my_ctl_pointer->sequence_number < sequence_number ) {
first_instance = 1;
}
if( first_instance ) {
/* Signal arrival */
my_ctl_pointer->flag = -1;
my_ctl_pointer->index = 1;
/* this does not need to use any flag values , so only need to
* set the value for subsequent values that may need this */
my_ctl_pointer->starting_flag_value = 0;
flag_offset = 0;
} else {
/* only one thread at a time will be making progress on this
* collective, so no need to make this atomic */
my_ctl_pointer->index++;
}
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value;
ready_flag = flag_offset + sequence_number + 1;
my_ctl_pointer->sequence_number = sequence_number;
hdl_desc = (mca_hdl_base_descriptor_t *)
malloc (sizeof (mca_hdl_base_descriptor_t) * 1);
/*prepare a hdl data segment*/
hdl_seg = (mca_hdl_base_segment_t*)
malloc ( sizeof (mca_hdl_base_segment_t) * 1);
hdl_seg->seg_addr.pval = input_args->sbuf;
hdl_seg->seg_len = pack_len;
hdl->endpoint->ready_flag = ready_flag;
hdl->endpoint->local_ctrl = my_ctl_pointer;
hdl->endpoint->sbgp_contextid =
bcol_module->super.sbgp_partner_module->group_comm->c_contextid;
/*
* Fan out from root
*/
if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
hdl_desc->des_src = hdl_seg;
hdl_desc->des_src_cnt = 1;
hdl_desc->isroot = true;
/* As per the general semantics, there may be multiple pairs of send/recv
 * along the topology tree */
for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
child_ctl_pointer =
ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
hdl->endpoint->remote_ctrl = child_ctl_pointer;
ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
if (ret != OMPI_SUCCESS) {
BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
goto exit_ERROR;
}
}
}else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
input_args->result_in_rbuf = false;
/*
* Get parent payload data and control data.
* Get the pointer to the base address of the parent's payload buffer.
* Get the parent's control buffer.
*/
parent_ctl_pointer = ctl_structs[my_fanout_parent];
hdl_desc->des_dst = hdl_seg;
hdl_desc->des_dst_cnt = 1;
hdl_desc->isroot = false;
hdl->endpoint->remote_ctrl = parent_ctl_pointer;
#if __TEST_BLOCKING__
ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif
#if __TEST_WAIT__
ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank));
#endif
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
goto exit_ERROR;
}
status = false;
#if __TEST_TEST__
while (!status) {
hdl->hdl_test(&hdl_desc, &completed, &status);
opal_progress();
BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
}
#endif
goto Release;
}else{
input_args->result_in_rbuf = false;
/* Interior node */
/* Get parent payload data and control data */
parent_ctl_pointer = ctl_structs[my_fanout_parent];
hdl_desc->des_dst = hdl_seg;
hdl_desc->des_dst_cnt = 1;
hdl_desc->isroot = false;
hdl->endpoint->remote_ctrl = parent_ctl_pointer;
ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(1, ("recv error on rank %d ........", my_rank));
goto exit_ERROR;
}
/* Signal to children that they may read the data from my shared buffer */
opal_atomic_wmb ();
hdl_desc->des_src = hdl_seg;
hdl_desc->des_src_cnt = 1;
for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
child_ctl_pointer =
ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
hdl->endpoint->remote_ctrl = child_ctl_pointer;
ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
if (ret != OMPI_SUCCESS) {
BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
goto exit_ERROR;
}
}
goto Release;
}
Release:
/* if I am the last instance of a basesmuma function in this collective,
 * release the resources */
if (IS_LAST_BCOL_FUNC(c_input_args)) {
rc = bcol_basesmuma_free_buff(
&(bcol_module->colls_with_user_data),
sequence_number);
}
my_ctl_pointer->starting_flag_value += 1;
return BCOL_FN_COMPLETE;
exit_ERROR:
return OMPI_ERROR;
}
#endif

View file

@ -1,895 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"
#include "bcol_basesmuma.h"
/* debug
* #include "opal/sys/timer.h"
*
* extern uint64_t timers[7];
* end debug */
/* debug */
#include <unistd.h>
/* end debug */
/* includes shared memory optimization */
#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src) \
do { \
int j; \
for( j = 0; j < n_src; j++) { \
parent_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \
parent_data_pointer = data_buffs[src_list[j]].payload; \
if( IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { \
src = src_list[j]; \
matched = 1; \
break; \
} \
} \
} while(0)
/*
#define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number) \
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[BCAST_FLAG] >= (my_flag) \
)? true : false )
*/
/*
#define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag) \
)? true : false )
*/
#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \
do { \
int j; \
for( j = 0; j < n_src; j++) { \
/* fprintf(stderr,"my_rank %d and %d\n",my_rank,1); */ \
if(src_list[j] != -1) { \
parent_ctl_pointer = ctl_structs[src_list[j]]; \
parent_data_pointer = (void *) data_buffs[src_list[j]].ctl_struct; \
/*fprintf(stderr,"my_rank %d ready flag %d partner flag %d and %d\n",my_rank,ready_flag,parent_ctl_pointer->flag,2); */ \
if( IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, flag_index, bcol_id)) { \
src = src_list[j]; \
matched = 1; \
index = j; \
/* fprintf(stderr,"found it from %d!\n",src);*/ \
break; \
} \
} \
} \
} while(0)
#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \
do { \
int relative_rank = (my_group_index >= group_root) ? my_group_index - group_root : \
my_group_index - group_root + group_size; \
radix_mask = 1; \
while (radix_mask < group_size) { \
if (relative_rank % (radix * radix_mask)) { \
data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \
if (data_src >= group_size) data_src -= group_size; \
break; \
} \
radix_mask *= radix; \
} \
} while (0)
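/*
 * Illustrative worked example for K_NOMIAL_DATA_SRC (comment added for
 * clarity, not part of the original source).  With the component default
 * radix = 3, group_size = 8 and group_root = 0:
 *   - my_group_index = 5: relative_rank = 5, 5 % 3 != 0, so
 *     data_src = (5/3)*3 + 0 = 3 (rank 5 reads from rank 3);
 *   - my_group_index = 6: relative_rank = 6, 6 % 3 == 0 but 6 % 9 != 0, so
 *     data_src = (6/9)*9 + 0 = 0 (rank 6 reads from the root).
 */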
int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
int i, matched = 0;
int group_size;
int my_rank;
int leading_dim,
buff_idx,
idx;
int count = input_args->count;
struct ompi_datatype_t* dtype = input_args->dtype;
int64_t sequence_number = input_args->sequence_num;
int radix =
mca_bcol_basesmuma_component.k_nomial_radix;
int radix_mask;
int16_t data_src = -1;
volatile int8_t ready_flag;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char* parent_data_pointer;
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
size_t pack_len = 0;
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
input_args->sbuf_offset);
#if 0
fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
fflush(stderr);
#endif
/* we will work only on packed data - so compute the length*/
BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot"));
pack_len = mca_bcol_base_get_buff_length(dtype, count);
/* Some hierarchical algorithms have data that is accumulated at each step;
 * this factor accounts for that.
 */
pack_len = pack_len*input_args->hier_factor;
buff_idx = input_args->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* setup resource recycling */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/* removing dependence on sequence number */
/* I believe this is resolved now with the signaling flags */
/*
ready_temp = 1 + (int8_t) flag_offset + (int8_t) bcol_id;
if( ready_temp >= my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {
ready_flag = ready_temp;
} else {
ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id];
}
opal_atomic_wmb ();
my_ctl_pointer->sequence_number = sequence_number;
*/
/* non-blocking broadcast algorithm */
/* If I am the root, then signal ready flag */
if(input_args->root_flag) {
BASESMUMA_VERBOSE(10,("I am the root of the data"));
/*
* signal ready flag
*/
opal_atomic_wmb ();
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
/* root is finished */
goto Release;
}
/* Calculate source of the data */
K_NOMIAL_DATA_SRC(radix, my_rank, group_size,
input_args->root_route->rank, data_src, radix_mask);
parent_ctl_pointer = data_buffs[data_src].ctl_struct;
parent_data_pointer = data_buffs[data_src].payload;
for( i = 0; i < cs->num_to_probe && 0 == matched; i++) {
if(IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, BCAST_FLAG, bcol_id)) {
matched = 1;
break;
}
}
/* If not matched, then hop out and put me on progress list */
if(0 == matched ) {
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
return BCOL_FN_NOT_STARTED;
}
/* else, we found our root within the group ... */
BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", data_src));
/* copy the data */
memcpy(data_addr, (void *) parent_data_pointer, pack_len);
/* set the memory barrier to ensure completion */
opal_atomic_wmb ();
/* signal that I am done */
my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
Release:
my_ctl_pointer->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
/**
* Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers.
* This routine assumes that buf (the input buffer) is a single writer
* multi reader (SWMR) shared memory buffer owned by the calling rank
 * which is the only rank that can write to this buffer.
* It is also assumed that the buffers are registered and fragmented
* at the ML level and that buf is sufficiently large to hold the data.
*
*
* @param buf - SWMR shared buffer within a sbgp that the
* executing rank can write to.
* @param count - the number of elements in the shared buffer.
* @param dtype - the datatype of a shared buffer element.
* @param root - the index within the sbgp of the root.
* @param module - basesmuma module.
*/
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
int i;
int group_size;
int my_rank;
int leading_dim, buff_idx, idx;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int64_t sequence_number=input_args->sequence_num;
int radix = cs->k_nomial_radix;
int radix_mask;
int relative_rank;
int pow_k_group_size;
volatile int8_t ready_flag;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile void* parent_data_pointer;
volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
size_t pack_len = 0;
void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
input_args->sbuf_offset);
#if 0
fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
fflush(stderr);
#endif
/* we will work only on packed data - so compute the length*/
pack_len = mca_bcol_base_get_buff_length(dtype, count);
buff_idx = input_args->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
/* get pow_k_levels and pow_k_group_size */
pow_k_group_size = bcol_module->pow_k;
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/* non-blocking broadcast algorithm */
/* If I am the root, then signal ready flag */
if(input_args->root_flag) {
BASESMUMA_VERBOSE(10,("I am the root of the data"));
/*
* set the radix_mask */
radix_mask = pow_k_group_size;
/* send to children */
opal_atomic_wmb ();
BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
radix,0,
my_rank,group_size, ready_flag);
/* root is finished */
goto Release;
}
/* If I am not the root, then poll on possible "senders'" control structs */
for( i = 0; i < cs->num_to_probe; i++) {
if( ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {
/* else, we found our root within the group ... */
parent_data_pointer = data_buffs[my_ctl_pointer->src].payload;
BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,my_ctl_pointer->src));
/* memcopy the data */
memcpy(data_addr, (void *) parent_data_pointer, pack_len);
/* compute my relative rank */
relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? my_rank -
my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src;
/* compute my radix mask */
radix_mask = 1;
while(radix_mask < group_size ){
if( 0 != relative_rank % (radix*radix_mask)) {
/* found it */
break;
}
radix_mask *= radix;
}
/* go one step back */
radix_mask /= radix;
/* send to children */
opal_atomic_wmb ();
BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
radix, relative_rank,
my_rank, group_size, ready_flag);
/* bail */
goto Release;
}
}
/* If not matched, then hop out and put me on progress list */
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
/*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/
return BCOL_FN_NOT_STARTED;
Release:
my_ctl_pointer->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
/* non-blocking binary scatter allgather anyroot algorithm for large data
* broadcast
*/
#if 0
/* prototype code for shared memory scatter/allgather algorithm. Signaling scheme
* works, should be used as a reference for other types of shared memory scatter/allgather
* algorithms.
*/
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int i, j;
int length;
int start;
int my_rank, parent_rank;
int partner;
int src = -1;
int matched = 0;
int group_size;
int first_instance=0;
int leading_dim, buff_idx, idx;
int64_t sequence_number=input_args->sequence_num;
int64_t ready_flag;
int64_t local_offset;
int flag_offset;
int pow_2, pow_2_levels;
int index = -1;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
mca_bcol_basesmuma_module_t *bcol_module =
(mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
/* use the old control structs for large messages,
* otherwise we will destroy the shared memory
* optimization
*/
mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* binomial fanout */
mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive double */
/* for now, we use the payload buffer for single fragment */
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile void *parent_data_pointer; /* binomial scatter */
volatile void *partner_data_pointer; /* recursive double */
uint32_t fragment_size; /* ml buffer size for now */
/* we will transfer the entire buffer,
* so start at the base address of the ml buffer
*/
void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr);
#if 0
fprintf(stderr,"AAA Entering nb-sm large msg broadcast input_args->frag_size %d \n",input_args->frag_size);
fflush(stderr);
#endif
buff_idx = input_args->src_desc->buffer_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
/* get the largest power of two that is smaller than
* or equal to the group size
*/
pow_2_levels = bcol_module->pow_2_levels;
pow_2 = bcol_module->pow_2;
/* get the fragment size
*/
/* still just the size of the entire buffer */
fragment_size = input_args->buffer_size;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
my_rank = bcol_module->super.sbgp_partner_module->my_index;
/* grab the control structs */
ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
bcol_module->colls_with_user_data.ctl_buffs+idx;
/* grab the data buffs */
data_buffs = (mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
my_ctl_pointer = ctl_structs[my_rank];
if(my_ctl_pointer->sequence_number < sequence_number) {
first_instance = 1;
}
if(first_instance) {
my_ctl_pointer->flag = -1;
my_ctl_pointer->index = 1;
my_ctl_pointer->starting_flag_value = 0;
flag_offset = 0;
} else {
my_ctl_pointer->index++;
}
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value;
ready_flag = flag_offset + sequence_number + 1;
my_ctl_pointer->sequence_number = sequence_number;
/* am I the root */
if(input_args->root_flag) {
/* if I've already been here, then
* hop down to the allgather
*/
if(ALLGATHER == my_ctl_pointer->status) {
goto Allgather;
}
BASESMUMA_VERBOSE(10,("I am the root of the data"));
/* debug print */
/*fprintf(stderr,"I am the root %d\n",my_rank);*/
/*
* signal ready flag
*/
/* set the offset into the buffer */
my_ctl_pointer->offset = 0;
/* how many children do I have */
my_ctl_pointer->n_sends = pow_2_levels;
/* my data length */
my_ctl_pointer->length = fragment_size;
/* important that these be set before my children
* see the ready flag raised
*/
opal_atomic_wmb ();
my_ctl_pointer->flag = ready_flag;
/* root is finished */
if( my_rank < pow_2 ) {
/* if I'm in the power of two group,
* then goto the allgather
*/
my_ctl_pointer->status = ALLGATHER;
goto Allgather;
} else {
/* if I'm not, then I'm done and release */
goto Release;
}
}
/* what phase am I participating in
*/
switch(my_ctl_pointer->status) {
case SCATTER:
goto Scatter;
break;
case ALLGATHER:
goto Allgather;
break;
case EXTRA_RANK:
goto Extra;
break;
default:
break;
}
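/*
 * Phase flow summary (comment added for clarity): ranks inside the largest
 * power-of-two subgroup run the binomial SCATTER phase and then the
 * recursive-doubling ALLGATHER phase; ranks at or above pow_2 take the
 * EXTRA_RANK path below and simply copy the fully assembled result from
 * their power-of-two partner.  On re-entry of this non-blocking routine the
 * saved status resumes the collective at the matching label.
 */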
Extra:
/* am I part of the non-power-of-2 group */
if( my_rank >= pow_2 ) {
/* find parent to copy from */
parent_rank = my_rank&(pow_2-1);
parent_ctl_pointer = ctl_structs[parent_rank];
/* start at the base */
parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct;
/* now, I need to do some arithmetic to
* arrive at the value everyone else does
* when they have completed the algorithm
*/
/* compute ready flag value to poll on */
ready_flag = ready_flag + pow_2_levels;
/* start to poll */
for( i = 0; i< cs->num_to_probe; i++) {
if(IS_LARGE_DATA_READY(parent_ctl_pointer,ready_flag, sequence_number)) {
/* copy the data and bail */
memcpy(data_addr,(void *)parent_data_pointer,fragment_size);
goto Release;
}
/*
else {
opal_progress();
}
*/
}
my_ctl_pointer->status = EXTRA_RANK;
/* hop out and put me onto a progress queue */
return BCOL_FN_NOT_STARTED;
}
Scatter:
/* on first entry, compute the list of possible sources */
if( NULL == my_ctl_pointer->src_ptr ) {
my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int)*(pow_2_levels+1));
for( i = 0; i < pow_2_levels; i++) {
my_ctl_pointer->src_ptr[i] = my_rank ^ (1<<i);
}
/* am I participating in the non-power of two */
if((my_rank+pow_2) < group_size) {
/* extra rank that I'm paired with */
my_ctl_pointer->src_ptr[i] = my_rank + pow_2;
} else {
/* no extra rank to worry about */
my_ctl_pointer->src_ptr[i] = -1;
}
}
/* If I am not the root, then poll on possible "senders'" control structs */
for( i = 0; i < cs->num_to_probe && 0 == matched; i++) {
/* Shared memory iprobe */
BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1,
my_rank, matched, src);
}
/* If not matched, then hop out and put me on progress list */
if(0 == matched ) {
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
my_ctl_pointer->status = SCATTER;
return BCOL_FN_NOT_STARTED;
} else if ( src >= pow_2 ){
/* If matched from an extra rank, then get the whole message from partner */
memcpy((void *) data_addr, (void *) parent_data_pointer,
parent_ctl_pointer->length);
/* now I am the pseudo-root in the power-of-two group */
my_ctl_pointer->offset = 0;
my_ctl_pointer->length = parent_ctl_pointer->length;
my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends;
/* set the memory barrier */
opal_atomic_wmb ();
/* fire the ready flag */
my_ctl_pointer->flag = ready_flag;
my_ctl_pointer->status = ALLGATHER;
/* go to the allgather */
goto Allgather;
}
/* we need to see whether this is really
* who we are looking for
*/
for( i = 0; i < parent_ctl_pointer->n_sends; i++) {
/* debug print */
/*
fprintf(stderr,"I am %d checking on a hit from %d with n_sends %d\n",my_rank,src,parent_ctl_pointer->n_sends);
fflush(stderr);
*/
/* end debug */
if( my_rank == (src^(1<<i))) {
/* we found our root within the group ... */
BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src));
/* this is who I've been looking for */
my_ctl_pointer->n_sends = i;
if ( i > 0) {
/* compute the size of the chunk to copy */
length = (parent_ctl_pointer->length)/
(1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends));
my_ctl_pointer->length = length;
my_ctl_pointer->offset =
parent_ctl_pointer->offset+length;
/*fprintf(stderr,"%d's offset %d and length %d \n",my_rank,my_ctl_pointer->offset,length);*/
/* now we can copy the data */
memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset),
(void *) ((uint64_t) parent_data_pointer+(uint64_t) parent_ctl_pointer->offset +
(uint64_t) length),
(size_t)length);
} else {
/* this "trick" takes care of the first level
 * of recursive doubling
*/
length = parent_ctl_pointer->length/
(1<<(parent_ctl_pointer->n_sends - 1));
my_ctl_pointer->length = length;
my_ctl_pointer->offset = parent_ctl_pointer->offset;
/*fprintf(stderr,"%d's offset %d and length %d\n",my_rank,my_ctl_pointer->offset,length);*/
/* now we can copy the data */
memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset),
(void *) ((uint64_t) parent_data_pointer+(uint64_t) my_ctl_pointer->offset),
(size_t)length);
}
/* set the memory barrier to ensure completion */
opal_atomic_wmb ();
/* signal that I am done */
my_ctl_pointer->flag = ready_flag;
/* set my status */
my_ctl_pointer->status = ALLGATHER;
/* time for allgather phase */
goto Allgather;
}
}
/* this is not who we are looking for,
* mark as false positive so we don't
* poll here again
*/
my_ctl_pointer->src_ptr[index] = -1;
/* probably we should jump out and put onto progress list */
my_ctl_pointer->status = SCATTER;
return BCOL_FN_NOT_STARTED;
Allgather:
/* zip it back up - we have already taken care of first level */
/* needed for non-blocking conditional */
matched = 0;
/* get my local_offset */
local_offset = my_ctl_pointer->offset;
/* bump the ready flag */
ready_flag++;
/* first level of zip up */
length = 2*fragment_size/pow_2;
/* first level of zip-up
* already includes first level of
* recursive doubling
*/
start = 1;
/* for non-blocking, check to see if I need to reset the state */
if(my_ctl_pointer->flag >= ready_flag) {
/* then reset the state */
ready_flag = my_ctl_pointer->flag;
start = my_ctl_pointer->start;
/* get the local offset */
local_offset = my_ctl_pointer->offset_zip;
/* compute the correct length */
length = length*(1<<(start - 1));
/* careful! skip over the opal_atomic_wmb () to avoid the
* cost on every re-entry
*/
goto Loop;
}
opal_atomic_wmb ();
/* I am ready, set the flag */
my_ctl_pointer->flag = ready_flag;
Loop:
for( i = start; i < pow_2_levels; i++) {
/* get my partner for this level */
partner = my_rank^(1<<i);
partner_ctl_pointer = ctl_structs[partner];
partner_data_pointer = (void *) data_buffs[partner].ctl_struct;
/* is data ready */
for( j = 0; j < cs->num_to_probe && matched == 0; j++) {
if(IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) {
/* debug prints
fprintf(stderr,"666 I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d partner_offset %d\n",
my_rank,sequence_number,partner, ready_flag,partner_ctl_pointer->flag,buff_idx,partner_ctl_pointer->offset);
*/
/* debug print */
#if 0
fprintf(stderr,"I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d \n",
my_rank,sequence_number,partner, ready_flag,parent_ctl_pointer->flag,buff_idx);
#endif
/* end debug prints */
assert(partner_ctl_pointer->flag >= ready_flag);
/* found it */
matched = 1;
/* only copy it, if you sit at a lower level in the tree */
if( my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends ) {
/* calculate the local offset based on partner's remote offset */
if( partner_ctl_pointer->offset < my_ctl_pointer->offset ) {
/* then I'm looking "up" the tree */
local_offset -= length;
/* debug print */
/*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/
/* end debug */
memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
(void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
length);
} else {
/* I'm looking "down" the tree */
local_offset += length;
/* debug print */
/*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/
/* end debug */
memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
(void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
length);
/* reset my local offset */
local_offset -= length;
}
}
/* bump the ready flag */
ready_flag++;
/* ensure completion */
opal_atomic_wmb ();
/* fire the flag for the next level */
my_ctl_pointer->flag = ready_flag;
/* double the length */
length *= 2;
}
}
/* check to see what kind of progress I've made */
if( 0 == matched ) {
/* save state, hop out and try again later */
my_ctl_pointer->start = i;
/* save the local offset */
my_ctl_pointer->offset_zip = local_offset;
/* put in progress queue */
return BCOL_FN_STARTED;
}
/* else, start next level of recursive doubling */
matched = 0;
}
/* cleanup */
if(NULL != my_ctl_pointer->src_ptr) {
free(my_ctl_pointer->src_ptr);
my_ctl_pointer->src_ptr = NULL;
}
Release:
/* If I am the last instance, release the resource */
/*
if( IS_LAST_BCOL_FUNC(c_input_args)) {
rc = bcol_basesmuma_free_buff(
&(bcol_module->colls_with_user_data),
sequence_number);
}
*/
my_ctl_pointer->starting_flag_value++;
my_ctl_pointer->status = FINISHED;
return BCOL_FN_COMPLETE;
}
#endif
#if 0
int mca_bcol_basesmuma_bcast_binomial_scatter_allgather(void *desc)
{
/* local variables */
int rc, n_frags_sent;
uint32_t stripe_number;
int count, count_processed;
size_t dt_size;
uint32_t n_data_segments_to_schedule;
ompi_datatype_t *dtype;
message_descriptor_t *message_descriptor;
mca_bcol_basesmuma_module_t *bcol_module;
int pipe_depth;
/* get the full message descriptor */
/* compute the number of fragments to send */
/* start to fill the pipeline */
return OMPI_SUCCESS;
}
#endif

View file

@ -1,486 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/comm/coll_ops.h"
#include "opal/dss/dss.h"
#include "bcol_basesmuma.h"
/*
* With support for nonblocking collectives, we don't have an upper
* limit on the number of outstanding collectives per communicator.
* Also, since we want to avoid communication to figure out which
* buffers other ranks in the group will use, we will rely on the
* fact that collective operations are called in the same order
* in each process, to assign a unique ID to each collective operation.
* We use this to create a static mapping from the index to the buffer
* that will be used. Also, because there is no limit to the number of
* outstanding collective operations, we use a generation index for each
* memory bank, so the collective will use the buffer only when the
* correct generation of the bank is ready for use.
*/
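/*
 * Illustrative mapping example (comment added for clarity; it assumes
 * SHIFT_DOWN is a right shift and the component defaults of 2 banks x 16
 * buffers per bank, i.e. mask = 31, log2_num_buffs_per_mem_bank = 4,
 * log2_number_of_buffs = 5): buff_id = 37 gives buffer index = 37 & 31 = 5,
 * memory bank = 5 >> 4 = 0, generation = 37 >> 5 = 1.  The index is only
 * handed out once that bank's bank_gen_counter has reached generation 1.
 */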
int bcol_basesmuma_get_buff_index( sm_buffer_mgmt *buff_block,
uint64_t buff_id )
{
/* local variables */
int memory_bank;
uint64_t generation;
int index=-1;
/* get the bank index that will be used */
memory_bank=buff_id& buff_block->mask;
memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank;
/* get the generation of the bank this maps to */
generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs);
/* check to see if the bank is available */
if( generation == buff_block->ctl_buffs_mgmt[memory_bank].
bank_gen_counter ) {
/* get the buffer index that will be returned */
index=buff_id & buff_block->mask;
/* no in-use counter increment, as the mapping is static, and
 * all we need to know is the number of collectives that complete */
} else {
/* progress communications so that resources can be freed up */
opal_progress();
}
/* return */
return index;
}
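/*
 * Minimal usage sketch (not part of the original source): the function above
 * returns -1 while the bank's generation counter lags behind the requested
 * generation, so a caller that must obtain a buffer would typically spin on
 * it; opal_progress() is already invoked inside on a generation mismatch.
 */
static inline int bcol_basesmuma_get_buff_index_blocking(sm_buffer_mgmt *buff_block,
                                                         uint64_t buff_id)
{
    int index = -1;
    while (-1 == index) {
        /* retry until the bank generation catches up and an index is returned */
        index = bcol_basesmuma_get_buff_index(buff_block, buff_id);
    }
    return index;
}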
/* release the shared memory buffers
* buf_id is the unique ID assigned to the particular buffer
*/
int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block,
uint64_t buff_id )
{
/* local variables */
int ret=OMPI_SUCCESS;
int memory_bank;
uint64_t generation;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/* get the bank index that will be used */
memory_bank=buff_id& buff_block->mask;
memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank;
/* get the generation of the bank this maps to */
generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs);
/* the generation counter should not change until all resources
* associated with this bank have been freed.
*/
assert(generation == buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter);
(void)generation; // silence compiler warning
/*
* increment counter of completed buffers
*/
OPAL_THREAD_ADD32(&(buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed),
1);
/*
* If I am the last to checkin - initiate resource recycling
*/
if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed ==
buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) {
/* Lock to ensure atomic recycling of resources */
OPAL_THREAD_LOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
/* make sure someone else did not already get to this */
if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed !=
buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) {
/* release lock and exit */
OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
} else {
sm_nbbar_desc_t *p_sm_nb_desc = NULL;
/* initiate the freeing of resources. Need to make sure the other
* ranks in the group are also done with their resources before this
* block is made available for use again.
* No one else will try to allocate from this block or free back to
* this block until the next generation counter has been incremented,
* so we will just reset the number of freed buffers to 0, so no one else
* will try to also initiate the recycling of these resources
*/
buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed=0;
/* Start the nonblocking barrier */
p_sm_nb_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
p_sm_nb_desc->coll_buff = buff_block;
bcol_basesmuma_rd_nb_barrier_init_admin(p_sm_nb_desc);
if( NB_BARRIER_DONE !=
buff_block->ctl_buffs_mgmt[memory_bank].
nb_barrier_desc.collective_phase) {
opal_list_t *list=&(cs->nb_admin_barriers);
opal_list_item_t *append_item;
/* put this onto the progression list */
OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex));
append_item=(opal_list_item_t *)
&(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
opal_list_append(list,append_item);
OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex));
/* progress communications so that resources can be freed up */
opal_progress();
} else {
/* mark the block as available */
(buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++;
}
/* get out of here */
OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
}
}
/* return */
return ret;
}
/*
* Allocate buffers for storing non-blocking collective descriptions, required
* for making code re-entrant
*
*/
static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
void *base_addr, uint32_t num_banks,
uint32_t num_buffers_per_bank,
uint32_t size_buffer,
uint32_t header_size,
int group_size,
int pow_k)
{
uint32_t i, j, ci;
mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL;
int k_nomial_radix = mca_bcol_basesmuma_component.k_nomial_radix;
int pow_k_val = (0 == pow_k) ? 1 : pow_k;
int num_to_alloc = (k_nomial_radix - 1) * pow_k_val * 2 + 1 ;
*desc = (mca_bcol_basesmuma_nb_coll_buff_desc_t *)calloc(num_banks * num_buffers_per_bank, sizeof(mca_bcol_basesmuma_nb_coll_buff_desc_t));
if (NULL == *desc) {
return OMPI_ERROR;
}
tmp_desc = *desc;
for (i = 0; i < num_banks; i++) {
for (j = 0; j < num_buffers_per_bank; j++) {
ci = i * num_buffers_per_bank + j;
tmp_desc[ci].bank_index = i;
tmp_desc[ci].buffer_index = j;
/* *2 is for gather session +1 for extra peer */
tmp_desc[ci].requests = (ompi_request_t **)
calloc(num_to_alloc, sizeof(ompi_request_t *));
tmp_desc[ci].data_addr = (void *)
((unsigned char*)base_addr + ci * size_buffer + header_size);
BASESMUMA_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr));
}
}
return OMPI_SUCCESS;
}
/*
* Free buffers for storing non-blocking collective descriptions.
*
*/
void cleanup_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
uint32_t num_banks,
uint32_t num_buffers_per_bank)
{
uint32_t ci;
if (NULL != *desc) {
for (ci=0; ci<num_banks*num_buffers_per_bank; ci++) {
if (NULL != ((*desc)[ci]).requests) {
free(((*desc)[ci]).requests);
((*desc))[ci].requests = NULL;
}
}
free(*desc);
*desc = NULL;
}
}
#if 1
/* New init function used for new control scheme where we put the control
* struct at the top of the payload buffer
*/
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
uint32_t data_offset,
mca_bcol_base_module_t *bcol_module,
void *reg_data)
{
/* assumption here is that the block has been registered with
* sm bcol hence has been mapped by each process, need to be
* sure that memory is mapped amongst sm peers
*/
/* local variables */
int ret = OMPI_SUCCESS, i, j;
sm_buffer_mgmt *pload_mgmt;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
bcol_basesmuma_registration_data_t *sm_reg_data =
(bcol_basesmuma_registration_data_t *) reg_data;
mca_bcol_basesmuma_module_t *sm_bcol =
(mca_bcol_basesmuma_module_t *) bcol_module;
mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
size_t malloc_size;
bcol_basesmuma_smcm_file_t input_file;
int leading_dim,loop_limit,buf_id;
unsigned char *base_ptr;
mca_bcol_basesmuma_module_t *sm_bcol_module=
(mca_bcol_basesmuma_module_t *)bcol_module;
int my_idx, array_id;
mca_bcol_basesmuma_header_t *ctl_ptr;
void **results_array=NULL, *mem_offset;
mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem;
/* first, we get a pointer to the payload buffer management struct */
pload_mgmt = &(sm_bcol->colls_with_user_data);
/* go ahead and get the header size that is cached on the payload block
*/
sm_bcol->total_header_size = data_offset;
/* allocate memory for pointers to mine and my peers' payload buffers
* difference here is that now we use our new data struct
*/
malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank*
pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t);
pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
if( !pload_mgmt->data_buffs) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
/* allocate some memory to hold the offsets */
results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
if (NULL == results_array) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
/* setup the input file for the shared memory connection manager */
input_file.file_name = sm_reg_data->file_name;
input_file.size = sm_reg_data->size;
input_file.size_ctl_structure = 0;
input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
input_file.mpool_size = sm_reg_data->size;
/* call the connection manager and map my shared memory peers' file
*/
ret = bcol_basesmuma_smcm_allgather_connection(
sm_bcol,
sm_bcol->super.sbgp_partner_module,
&(cs->sm_connections_list),
&(sm_bcol->payload_backing_files_info),
sm_bcol->super.sbgp_partner_module->group_comm,
input_file, cs->payload_base_fname,
false);
if( OMPI_SUCCESS != ret ) {
goto exit_ERROR;
}
/* now we exchange offset info - don't assume symmetric virtual memory
*/
mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
(uintptr_t) cs->sm_payload_structs->data_addr);
/* call into the exchange offsets function */
ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
sm_bcol_module->super.sbgp_partner_module->my_index,
sm_bcol_module->super.sbgp_partner_module->group_size,
sm_bcol_module->super.sbgp_partner_module->group_list,
sm_bcol_module->super.sbgp_partner_module->group_comm);
if( OMPI_SUCCESS != ret ) {
goto exit_ERROR;
}
/* convert memory offset to virtual address in current rank */
leading_dim = pload_mgmt->size_of_group;
loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank;
for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {
/* get the base pointer */
int array_id=SM_ARRAY_INDEX(leading_dim,0,i);
if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
/* me */
base_ptr=cs->sm_payload_structs->map_addr;
} else {
base_ptr=sm_bcol_module->payload_backing_files_info[i]->
sm_mmap->map_addr;
}
/* first, set the pointer to the control struct */
pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
(uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
/* second, calculate where to set the data pointer */
pload_mgmt->data_buffs[array_id].payload=(void *)
(uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
(uint64_t)(uintptr_t) data_offset);
for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
/* now, play the same game as above
*
* first, set the control struct's position */
pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
(uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
(uint64_t)(uintptr_t)ml_block->size_buffer));
/* second, set the payload pointer */
pload_mgmt->data_buffs[array_id].payload =(void *)
(uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
(uint64_t)(uintptr_t) data_offset);
}
}
/* done with the index array */
free (results_array);
results_array = NULL;
/* initialize my control structures!! */
my_idx = sm_bcol_module->super.sbgp_partner_module->my_index;
leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size;
for( buf_id = 0; buf_id < loop_limit; buf_id++){
array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;
/* initialize the data structures */
for( j = 0; j < SM_BCOLS_MAX; j++){
for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
ctl_ptr->flags[i][j] = -1;
}
}
ctl_ptr->sequence_number = -1;
ctl_ptr->src = -1;
}
/* setup the data structures needed for releasing the payload
* buffers back to the ml level
*/
for( i=0 ; i < (int) ml_block->num_banks ; i++ ) {
sm_bcol->colls_with_user_data.
ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor=
ml_block;
}
ml_mem->num_banks = ml_block->num_banks;
ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
ml_mem->size_buffer = ml_block->size_buffer;
/* pointer to ml level descriptor */
ml_mem->ml_mem_desc = ml_block;
if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
ml_block->block->base_addr,
ml_mem->num_banks,
ml_mem->num_buffers_per_bank,
ml_mem->size_buffer,
data_offset,
sm_bcol_module->super.sbgp_partner_module->group_size,
sm_bcol_module->pow_k)) {
BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n"));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
exit_ERROR:
if (NULL != results_array) {
free(results_array);
}
return ret;
}
#endif
/* Basesmuma interface function used for buffer release */
#if 0
/* gvm
* A collective operation calls this routine to release the payload buffer.
* All processes in the shared memory sub-group of a bcol should call the non-blocking
* barrier on the last payload buffer of a memory bank. On the completion
* of the non-blocking barrier, the ML callback is called which is responsible
* for recycling the memory bank.
*/
mca_bcol_basesmuma_module_t *sm_bcol_module
int bcol_basesmuma_free_payload_buff(
struct mca_bcol_base_memory_block_desc_t *block,
sm_buffer_mgmt *ctl_mgmt,
uint64_t buff_id)
{
/* local variables */
int ret = OMPI_SUCCESS;
memory_bank = BANK_FROM_BUFFER_IDX(buff_id);
ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed++;
OPAL_THREAD_ADD32(&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed),1);
if (ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed == block->size_buffers_bank){
/* start non-blocking barrier */
bcol_basesmuma_rd_nb_barrier_init_admin(
&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].nb_barrier_desc));
if (NB_BARRIER_DONE !=
ctl_mgmt->ctl_buffs_mgmt[memory_bank].
nb_barrier_desc.collective_phase){
/* progress the barrier */
opal_progress();
}
else{
/* free the buffer - i.e. initiate callback to ml level */
block->ml_release_cb(block,memory_bank);
}
}
return ret;
}
#endif

View file

@ -1,380 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/align.h"
#include "bcol_basesmuma.h"
/*
* Public string showing the coll ompi_sm V2 component version number
*/
const char *mca_bcol_basesmuma_component_version_string =
"Open MPI bcol - basesmuma collective MCA component version " OMPI_VERSION;
/*
* Local functions
*/
static int basesmuma_register(void);
static int basesmuma_open(void);
static int basesmuma_close(void);
static int mca_bcol_basesmuma_deregister_ctl_sm(
mca_bcol_basesmuma_component_t *bcol_component);
static inline int mca_bcol_basesmuma_param_register_int(
const char* param_name, int default_value, int *storage)
{
*storage = default_value;
return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name,
NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
}
static inline int mca_bcol_basesmuma_param_register_bool(
const char* param_name, bool default_value, bool *storage)
{
*storage = default_value;
return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name,
NULL, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
}
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component = {
/* First, fill in the super */
{
/* First, the mca_component_t struct containing meta
information about the component itself */
.bcol_version = {
MCA_BCOL_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "basesmuma",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = basesmuma_open,
.mca_close_component = basesmuma_close,
.mca_register_component_params = basesmuma_register,
},
/* Initialization / querying functions */
.collm_init_query = mca_bcol_basesmuma_init_query,
.collm_comm_query = mca_bcol_basesmuma_comm_query,
.init_done = false,
.need_ordering = false,
.priority = 0, /* (default) priority */
},
};
/*
* Register the component
*/
static int basesmuma_register(void)
{
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/* set component priority */
mca_bcol_basesmuma_param_register_int("priority", 90, &cs->super.priority);
/* Number of memory banks */
mca_bcol_basesmuma_param_register_int("basesmuma_num_ctl_banks", 2,
&cs->basesmuma_num_mem_banks);
/* Number of regions per memory bank */
mca_bcol_basesmuma_param_register_int("basesmuma_num_buffs_per_bank", 16,
&cs->basesmuma_num_regions_per_bank);
/* number of polling loops to allow pending resources to
* complete their work
*/
mca_bcol_basesmuma_param_register_int("n_poll_loops", 4, &cs->n_poll_loops);
/* Number of groups supported */
mca_bcol_basesmuma_param_register_int("n_groups_supported", 100,
&cs->n_groups_supported);
/* order of fanin tree */
mca_bcol_basesmuma_param_register_int("radix_fanin", 2, &cs->radix_fanin);
/* order of fanout tree */
mca_bcol_basesmuma_param_register_int("radix_fanout", 2, &cs->radix_fanout);
/* order of read tree */
mca_bcol_basesmuma_param_register_int("radix_read_tree", 3,
&cs->radix_read_tree);
/* order of reduction fanout tree */
mca_bcol_basesmuma_param_register_int("order_reduction_tree", 2,
&cs->order_reduction_tree);
/* k-nomial radix */
mca_bcol_basesmuma_param_register_int("k_nomial_radix", 3, &cs->k_nomial_radix);
/* number of polling loops for non-blocking algorithms */
mca_bcol_basesmuma_param_register_int("num_to_probe", 10, &cs->num_to_probe);
/* radix of the k-ary scatter tree */
mca_bcol_basesmuma_param_register_int("scatter_kary_radix", 4,
&cs->scatter_kary_radix);
/* register parameters controlling message fragmentation */
mca_bcol_basesmuma_param_register_int("min_frag_size", getpagesize(),
&cs->super.min_frag_size);
mca_bcol_basesmuma_param_register_int("max_frag_size", FRAG_SIZE_NO_LIMIT,
&cs->super.max_frag_size);
/* by default use pre-registered shared memory segments */
/* RLG NOTE: When we have a systematic way to handle single memory
* copy semantics, we need to update this logic
*/
mca_bcol_basesmuma_param_register_bool("can_use_user_buffers", false,
&cs->super.can_use_user_buffers);
mca_bcol_basesmuma_param_register_int("verbose", 0, &cs->verbose);
return OMPI_SUCCESS;
}
/*
* Open the component
*/
static int basesmuma_open(void)
{
/* local variables */
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
int ret = OMPI_SUCCESS;
opal_mutex_t *mutex_ptr;
int dummy;
/*
* Make sure that the number of banks is a power of 2
*/
cs->basesmuma_num_mem_banks=
ompi_roundup_to_power_radix(2,cs->basesmuma_num_mem_banks, &dummy);
if ( 0 == cs->basesmuma_num_mem_banks ) {
ret=OMPI_ERROR;
goto exit_ERROR;
}
/*
 * Make sure that the number of buffers is a power of 2
*/
cs->basesmuma_num_regions_per_bank=
ompi_roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank, &dummy);
if ( 0 == cs->basesmuma_num_regions_per_bank ) {
ret=OMPI_ERROR;
goto exit_ERROR;
}
/* Portals initialization */
cs->portals_init = false;
cs->portals_info = NULL;
/*
* initialization
*/
cs->sm_ctl_structs=NULL;
OBJ_CONSTRUCT(&(cs->sm_connections_list),opal_list_t);
OBJ_CONSTRUCT(&(cs->nb_admin_barriers),opal_list_t);
mutex_ptr= &(cs->nb_admin_barriers_mutex);
OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);
/* Control structures object construct
*/
OBJ_CONSTRUCT(&(cs->ctl_structures), opal_list_t);
/* shared memory has not been registered yet */
cs->mpool_inited = false;
/* initialize base file names */
cs->clt_base_fname="sm_ctl_mem_";
cs->payload_base_fname="sm_payload_mem_";
/* initialize the size of the shared memory scratch region */
cs->my_scratch_shared_memory_size=getpagesize();
cs->my_scratch_shared_memory=NULL;
cs->scratch_offset_from_base_ctl_file=0;
/*
 * register the progress function
*/
ret=opal_progress_register(bcol_basesmuma_progress);
if (MPI_SUCCESS != ret) {
opal_output(ompi_bcol_base_framework.framework_output, "failed to register the progress function");
}
return ret;
exit_ERROR:
return ret;
}
/*
* release the control structure backing file
*/
static int mca_bcol_basesmuma_deregister_ctl_sm(mca_bcol_basesmuma_component_t *bcol_component)
{
if (NULL != bcol_component->sm_ctl_structs) {
OBJ_RELEASE(bcol_component->sm_ctl_structs);
}
return OMPI_SUCCESS;
}
/*
* Close the component
*/
static int basesmuma_close(void)
{
int ret;
bcol_basesmuma_registration_data_t *net_ctx;
bcol_base_network_context_t *net_reg;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/* gvm Leak FIX */
OPAL_LIST_DESTRUCT (&cs->ctl_structures);
/* deregister the progress function */
ret=opal_progress_unregister(bcol_basesmuma_progress);
if (MPI_SUCCESS != ret) {
opal_output(ompi_bcol_base_framework.framework_output, "failed to unregister the progress function");
}
/* remove the control structure backing file */
ret=mca_bcol_basesmuma_deregister_ctl_sm(&mca_bcol_basesmuma_component);
if (MPI_SUCCESS != ret) {
opal_output(ompi_bcol_base_framework.framework_output, "failed to remove control structure backing file");
}
/* remove the network contexts - only one network context defined for
* this component.
*/
/* file_name is returned by asprintf, so we need to free the resource */
if(mca_bcol_basesmuma_component.super.network_contexts ) {
net_reg=(bcol_base_network_context_t *)
mca_bcol_basesmuma_component.super.network_contexts[0];
if(net_reg) {
net_ctx=(bcol_basesmuma_registration_data_t *)net_reg->context_data;
if( net_ctx) {
if(net_ctx->file_name) {
free(net_ctx->file_name);
}
free(net_ctx);
}
free(net_reg);
}
free(mca_bcol_basesmuma_component.super.network_contexts);
mca_bcol_basesmuma_component.super.network_contexts=NULL;
}
/* normal return */
return OMPI_SUCCESS;
}
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
bool enable_mpi_threads)
{
/* done */
return OMPI_SUCCESS;
}
/* This routine is used to allocate shared memory for the shared
* memory control regions.
*/
int mca_bcol_basesmuma_allocate_sm_ctl_memory(mca_bcol_basesmuma_component_t *cs)
{
/* local variables */
int name_length, ret = OMPI_SUCCESS;
size_t ctl_length;
char *name;
size_t page_size = getpagesize ();
/* set the file name */
name_length=asprintf(&name,
"%s"OPAL_PATH_SEP"%s""%0d",
ompi_process_info.job_session_dir,
cs->clt_base_fname,
(int)getpid());
if( 0 > name_length ) {
return OMPI_ERROR;
}
/* make sure name is not too long */
if ( OPAL_PATH_MAX < (name_length-1) ) {
free (name);
return OMPI_ERROR;
}
/* compute segment length */
ctl_length=(cs->basesmuma_num_mem_banks*
cs->basesmuma_num_regions_per_bank+cs->basesmuma_num_mem_banks)
*sizeof(mca_bcol_basesmuma_ctl_struct_t)*cs->n_groups_supported;
/* need two banks of memory per group - for algorithms that have
* user payload, and those that don't
*/
ctl_length*=2;
/* add space for internal library management purposes */
ctl_length+=cs->my_scratch_shared_memory_size;
/* round up to multiple of page size */
ctl_length = OPAL_ALIGN(ctl_length, page_size, size_t);
/* allocate the shared file */
cs->sm_ctl_structs=bcol_basesmuma_smcm_mem_reg (NULL, ctl_length, getpagesize(), name);
if( !cs->sm_ctl_structs) {
opal_output (ompi_bcol_base_framework.framework_output,
"In mca_bcol_basesmuma_allocate_sm_ctl_memory failed to allocathe backing file %s\n", name);
ret = OMPI_ERR_OUT_OF_RESOURCE;
}
/* free the memory allocated by asprintf for the file name -
* in mca_base_smcm_mem_reg this name is copied into a new
* memory location */
free (name);
/* successful return */
return ret;
}
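/*
 * Worked size example (comment added for clarity, using the component
 * defaults registered above: 2 banks, 16 buffers per bank, 100 groups):
 * (2*16 + 2) * sizeof(mca_bcol_basesmuma_ctl_struct_t) * 100 = 3400 control
 * structs, doubled to 6800 to cover both the user-data and no-user-data
 * banks, plus one page of scratch space, then rounded up to a multiple of
 * the page size.
 */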

View file

@ -1,218 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* Recursive doubling blocking barrier */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"
/********************************************************************************/
/********************************** New Fan-In **********************************/
/********************************************************************************/
static int bcol_basesmuma_fanin_new(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int64_t sequence_number;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
int i, child_rank, idx, n_children, probe,
my_rank = bcol_module->super.sbgp_partner_module->my_index,
leading_dim = bcol_module->colls_no_user_data.size_of_group;
int8_t ready_flag;
int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
int buff_index = input_args->buffer_index;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests);
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
int matched = 0;
volatile mca_bcol_basesmuma_payload_t *ctl_structs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl;
volatile mca_bcol_basesmuma_header_t *child_ctl;
netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);
/* Figure out - what instance of the basesmuma bcol I am */
sequence_number = input_args->sequence_num;
idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
my_ctl = ctl_structs[my_rank].ctl_struct;
/* Init the header */
BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);
/* Cache num of children value in a local variable */
n_children = my_tree_node->n_children;
/* initialize the active requests */
*active_requests = 0;
/* create a bit map for children */
for( i = 0; i < n_children; i++){
*active_requests ^= (1<<i);
}
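/*
 * Illustrative example (comment added for clarity): with n_children = 3 the
 * loop above leaves active_requests = 0b111; each child that is observed
 * ready clears its bit below, and the fan-in completes once the mask
 * reaches 0.
 */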
/* Wait until my children arrive */
for (i = 0; i < n_children; ++i) {
matched = 0;
/* Get child ctl struct */
child_rank = my_tree_node->children_ranks[i];
child_ctl = ctl_structs[child_rank].ctl_struct;
/* I'm sacrificing cache for concurrency */
for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
if(IS_PEER_READY(child_ctl, ready_flag, sequence_number,BARRIER_FANIN_FLAG, bcol_id)) {
matched = 1;
/* flip the bit */
*active_requests ^= (1<<i);
}
}
}
if(0 == *active_requests ) {
if(ROOT_NODE != my_tree_node->my_node_type){
/* I have no more active requests,
signal my parent */
my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag;
}
} else {
return BCOL_FN_STARTED;
}
my_ctl->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
static int bcol_basesmuma_fanin_new_progress(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int64_t sequence_number;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
int i, child_rank, flag_offset, idx, n_children, probe,
my_rank = bcol_module->super.sbgp_partner_module->my_index,
leading_dim = bcol_module->colls_no_user_data.size_of_group;
int8_t ready_flag;
int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
int buff_index = input_args->buffer_index;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests);
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
int matched = 0;
volatile mca_bcol_basesmuma_payload_t *ctl_structs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl;
volatile mca_bcol_basesmuma_header_t *child_ctl;
netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);
sequence_number = input_args->sequence_num;
idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
my_ctl = ctl_structs[my_rank].ctl_struct;
flag_offset = my_ctl->starting_flag_value[bcol_id];
ready_flag = flag_offset + 1;
my_ctl->sequence_number = sequence_number;
/* Cache num of children value in a local variable */
n_children = my_tree_node->n_children;
/* Wait until my children arrive */
for (i = 0; i < n_children; ++i) {
matched = 0;
/* Get child ctl struct */
if ( 1 == ((*active_requests >> i)&1) ) {
child_rank = my_tree_node->children_ranks[i];
child_ctl = ctl_structs[child_rank].ctl_struct;
/* I'm sacrificing cache for concurrency */
for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
if(IS_PEER_READY(child_ctl, ready_flag, sequence_number, BARRIER_FANIN_FLAG,bcol_id)) {
matched = 1;
/* flip the bit */
*active_requests ^= (1<<i);
}
}
}
}
if(0 == *active_requests ){
if(ROOT_NODE != my_tree_node->my_node_type){
/* If I am not the root of the fanin tree,
then signal my parent */
my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag;
}
} else {
return BCOL_FN_STARTED;
}
my_ctl->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
BASESMUMA_VERBOSE(10, ("Basesmuma Fan-In register.\n"));
comm_attribs.bcoll_type = BCOL_FANIN;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
bcol_basesmuma_fanin_new,
bcol_basesmuma_fanin_new_progress);
return OMPI_SUCCESS;
}
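
The fan-in above tracks its outstanding children with a one-bit-per-child mask kept in active_requests: every bit is set when the collective starts, each bit is flipped off as the corresponding child's ready flag is observed, and the fan-in is complete when the mask reaches zero. A minimal standalone sketch of that bookkeeping (hypothetical names, not part of the bcol API):

#include <stdio.h>

int main(void)
{
    int n_children = 3;
    unsigned int pending = 0;

    /* mark every child as outstanding, as the fan-in does at start-up */
    for (int i = 0; i < n_children; i++) {
        pending ^= (1u << i);
    }

    /* pretend children 1 and 2 were observed ready on this progress pass */
    int arrived[] = { 1, 2 };
    for (int k = 0; k < 2; k++) {
        pending ^= (1u << arrived[k]);  /* flip the bit for an arrived child */
    }

    if (0 == pending) {
        printf("all children arrived, signal the parent\n");
    } else {
        printf("still waiting, pending mask = 0x%x\n", pending);
    }
    return 0;
}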

View file

@ -1,123 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* Recursive doubling blocking barrier */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"
/***********************************************************************************/
/*********************************** New Fan-Out ***********************************/
/***********************************************************************************/
static int bcol_basesmuma_fanout_new(
bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int64_t sequence_number;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
int idx, probe,
my_rank = bcol_module->super.sbgp_partner_module->my_index,
leading_dim = bcol_module->colls_no_user_data.size_of_group;
int8_t ready_flag;
int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
int buff_index = input_args->buffer_index;
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
volatile mca_bcol_basesmuma_payload_t *ctl_structs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl;
volatile mca_bcol_basesmuma_header_t *parent_ctl;
netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);
/* Figure out what instance of the basesmuma bcol I am */
sequence_number = input_args->sequence_num;
idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
my_ctl = ctl_structs[my_rank].ctl_struct;
/* init the header */
BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);
/* Wait on my parent to arrive */
if (my_tree_node->n_parents) {
parent_ctl = ctl_structs[my_tree_node->parent_rank].ctl_struct;
for( probe = 0; probe < cm->num_to_probe; probe++){
if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number, BARRIER_FANOUT_FLAG, bcol_id)) {
/* signal my children */
my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
/* bump the starting flag */
my_ctl->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
}
} else {
/* I am the root of the fanout */
my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
/* bump the starting flag */
my_ctl->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
return BCOL_FN_STARTED;
}
int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
BASESMUMA_VERBOSE(10, ("Basesmuma Fan-Out register.\n"));
comm_attribs.bcoll_type = BCOL_FANOUT;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
bcol_basesmuma_fanout_new,
bcol_basesmuma_fanout_new);
return OMPI_SUCCESS;
}
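
Both the fan-in and the fan-out decide whether a peer has arrived with the same kind of test: the peer must be working on my sequence number, and its published flag for this phase must have reached the value I am waiting for (the large-message IS_SG_DATA_READY macro later in this commit spells out the same predicate). A simplified sketch with hypothetical types, not the real bcol control structures:

#include <stdint.h>
#include <stdio.h>

typedef struct {
    volatile int64_t sequence_number;  /* which collective instance the peer is in */
    volatile int64_t flag;             /* monotonically increasing phase marker */
} peer_ctl_t;

static int peer_is_ready(const peer_ctl_t *peer, int64_t my_flag, int64_t my_sequence)
{
    return peer->sequence_number == my_sequence && peer->flag >= my_flag;
}

int main(void)
{
    peer_ctl_t peer = { .sequence_number = 42, .flag = 3 };
    printf("ready for flag 3? %d\n", peer_is_ready(&peer, 3, 42));  /* 1 */
    printf("ready for flag 4? %d\n", peer_is_ready(&peer, 4, 42));  /* 0: peer not there yet */
    return 0;
}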

File diff not shown because it is too large.

File diff not shown because it is too large.

View file

@ -1,626 +0,0 @@
#ifdef __PORTALS_AVAIL__
#define __PORTALS_ENABLE__
#include <unistd.h>
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"
#include "bcol_basesmuma_portals.h"
#include "bcol_basesmuma.h"
#if 0
struct scatter_allgather_nb_bcast_state_t
{
/* local variables */
uint64_t length;
int my_rank, src, matched;
int *src_list;
int group_size;
int64_t ready_flag;
int pow_2, pow_2_levels;
int src_list_index;
uint64_t fragment_size; /* user buffer size */
/* Input argument variables */
void *my_userbuf;
int64_t sequence_number;
/* Extra source variables */
bool secondary_root;
int partner , extra_partner;
/* Scatter Allgather offsets */
uint64_t local_sg_offset , global_sg_offset , partner_offset ;
/* Portals messaging relevant variables */
ptl_handle_eq_t allgather_eq_h;
ptl_handle_eq_t read_eq;
ptl_event_t allgather_event;
bool msg_posted;
/* OMPI module and component variables */
mca_bcol_basesmuma_component_t *cs;
mca_bcol_basesmuma_module_t *bcol_module;
/* Control structure and payload variables */
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */
volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */
int phase;
};
typedef struct scatter_allgather_nb_bcast_state_t sg_state_t;
#endif
bool blocked_post = false;
#define IS_SG_DATA_READY(peer, my_flag, my_sequence_number) \
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[BCAST_FLAG] >= (my_flag) \
)? true : false )
#define SG_LARGE_MSG_PROBE(src_list, n_src, src_list_index, matched, \
src, data_buffs, data_src_ctl_pointer, \
data_src_lmsg_ctl_pointer, ready_flag, \
sequence_number) \
do { \
int j; \
for( j = 0; j < n_src; j++) { \
if(src_list[j] != -1) { \
data_src_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \
data_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) \
data_buffs[src_list[j]].payload; \
if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \
src = src_list[j]; \
matched = 1; \
src_list_index = j; \
break; \
} \
} \
} \
} while(0)
#define SG_LARGE_MSG_NB_PROBE(src_list, n_src, src_list_index, matched, \
src, ctl_structs, data_src_ctl_pointer, \
ready_flag, sequence_number) \
do { \
int j; \
for( j = 0; j < n_src; j++) { \
if(src_list[j] != -1) { \
data_src_ctl_pointer = ctl_structs[src_list[j]]; \
if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \
src = src_list[j]; \
matched = 1; \
src_list_index = j; \
break; \
} \
} \
} \
} while(0)
static inline __opal_attribute_always_inline__
int wait_for_peers(int my_rank, int npeers, volatile mca_bcol_basesmuma_payload_t *data_buffs,
int flag_value, int sn)
{
int *peers_list = NULL;
int counter = 0, diter = 0;
volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer = NULL;
peers_list = (int *)malloc(sizeof(int) * npeers);
for (diter = 0; diter < npeers; diter++ ){
peers_list[diter] = my_rank ^ (1<<diter);
assert(peers_list[diter] != -1);
}
counter = 0;
while (counter < npeers) {
for (diter = 0; diter < npeers; diter++){
if (-1 != peers_list[diter]) {
peer_ctl_pointer = data_buffs[peers_list[diter]].ctl_struct;
if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
counter++;
peers_list[diter] = -1;
}
}
}
opal_progress();
}
free(peers_list);
return 0;
}
static inline __opal_attribute_always_inline__
int wait_for_peers_nb(int my_rank, int npeers,
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs,
volatile int flag_value, int sn)
{
int *peers_list = NULL;
int counter = 0, diter = 0;
volatile mca_bcol_basesmuma_ctl_struct_t *peer_ctl_pointer = NULL;
peers_list = (int *)malloc(sizeof(int) * npeers);
for (diter = 0; diter < npeers; diter++ ){
peers_list[diter] = my_rank ^ (1<<diter);
assert(peers_list[diter] != -1);
}
counter = 0;
while (counter < npeers) {
for (diter = 0; diter < npeers; diter++){
if (-1 != peers_list[diter]) {
peer_ctl_pointer = ctl_structs[peers_list[diter]];
if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
counter++;
peers_list[diter] = -1;
}
}
}
opal_progress();
}
free(peers_list);
return 0;
}
static inline __opal_attribute_always_inline__
int wait_for_post_complete_nb(int my_rank, int npeers,
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs,
int flag_value, int sn)
{
/* int *peers_list = NULL; */
int peers_list[MAX_SM_GROUP_SIZE];
int counter = 0, diter = 0;
volatile mca_bcol_basesmuma_ctl_struct_t *peer_ctl_pointer = NULL;
/* peers_list = (int *)malloc(sizeof(int) * npeers); */
assert(npeers < MAX_SM_GROUP_SIZE);
for (diter = 0; diter < npeers; diter++ ){
peers_list[diter] = my_rank ^ (1<<diter);
assert(peers_list[diter] != -1);
}
counter = 0;
for (diter = 0; diter < npeers; diter++){
peer_ctl_pointer = ctl_structs[peers_list[diter]];
if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
counter++;
}
}
/* free(peers_list); */
return counter;
}
static inline __opal_attribute_always_inline__
int sg_large_msg_probe(sg_state_t *sg_state)
{
int j,n_src = sg_state->pow_2_levels+1;
for( j = 0; j < n_src; j++) {
if(sg_state->src_list[j] != -1) {
sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src_list[j]];
BASESMUMA_VERBOSE(5,("Parent %d ctl pointer (parent=%p, my ctl=%p) flag %ld",
sg_state->src_list[j], (void *) sg_state->parent_ctl_pointer,
(void *) sg_state->my_ctl_pointer,
(long) sg_state->parent_ctl_pointer->flag));
if (IS_SG_DATA_READY(sg_state->parent_ctl_pointer,
sg_state->ready_flag, sg_state->sequence_number)) {
sg_state->src = sg_state->src_list[j];
sg_state->matched = 1;
sg_state->src_list_index = j;
break;
}
}
}
return 0;
}
/*
* I will post the message for all of my children
*/
static inline __opal_attribute_always_inline__
int sm_portals_root_scatter(sg_state_t *sg_state)
{
int extra_src_posts = -1, scatter_posts = -1, allgather_posts = -1,
total_msg_posts = -1;
BASESMUMA_VERBOSE(10,("I am the root of the data"));
sg_state->my_ctl_pointer->offset = 0;
sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels;
sg_state->my_ctl_pointer->length = sg_state->fragment_size;
extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 1: 0;
scatter_posts = sg_state->my_ctl_pointer->n_sends;
allgather_posts = sg_state->pow_2_levels - 1;
total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ;
if ( total_msg_posts <= 0) {
BASESMUMA_VERBOSE(10,("No need to post the data "));
return OMPI_SUCCESS;
}
mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
&sg_state->my_ctl_pointer->portals_buf_addr,
sg_state->my_userbuf, sg_state->fragment_size,
PTL_EQ_NONE,
total_msg_posts,
blocked_post,
PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE |
PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
/*
mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
&sg_state->my_ctl_pointer->portals_buf_addr,
sg_state->my_userbuf, sg_state->fragment_size,
sg_state->allgather_eq_h,
total_msg_posts,
blocked_post,
PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE |
PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
*/
sg_state->msg_posted = true ;
/*
opal_atomic_wmb();
*/
sg_state->my_ctl_pointer->flag = sg_state->ready_flag;
return OMPI_SUCCESS;
}
/*
* I'm the root, but my rank is outside the power-of-two group, so copy to the
* partner who will act as the secondary root
*/
static inline __opal_attribute_always_inline__
int sm_portals_extra_root_scatter(sg_state_t *sg_state)
{
int scatter_partner = -1;
volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL;
int total_msg_posts = 1;
if ( total_msg_posts <= 0) {
BASESMUMA_VERBOSE(10,("No need to post the data "));
}
else {
mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
&sg_state->my_ctl_pointer->portals_buf_addr,
sg_state->my_userbuf, sg_state->fragment_size,
PTL_EQ_NONE,
total_msg_posts,
blocked_post,
PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET
| PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
sg_state->msg_posted = true ;
}
opal_atomic_wmb();
sg_state->my_ctl_pointer->flag = sg_state->ready_flag;
scatter_partner = sg_state->my_rank - sg_state->pow_2;
scatter_partner_ctl_pointer =
sg_state->ctl_structs[scatter_partner];
while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, sg_state->ready_flag,
sg_state->sequence_number)){
opal_progress();
}
return OMPI_SUCCESS;
}
/*
* Gets the message from the partner (rank beyond the power-of-two group) and
* posts the message, acting as the root
*/
static inline __opal_attribute_always_inline__
int sm_portals_secondary_root_scatter(sg_state_t *sg_state)
{
volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL;
int scatter_posts, allgather_posts, extra_src_posts, total_msg_posts;
sg_state->secondary_root = true;
BASESMUMA_VERBOSE(10,("I am the secondary root for the data"));
sg_state->my_ctl_pointer->offset = 0;
sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels;
sg_state->my_ctl_pointer->length = sg_state->fragment_size;
extra_src_ctl_pointer = sg_state->ctl_structs[sg_state->src];
mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
sg_state->read_eq,
&sg_state->my_ctl_pointer->portals_buf_addr,
&extra_src_ctl_pointer->portals_buf_addr, 0,
0, sg_state->fragment_size);
extra_src_posts = 0;
scatter_posts = sg_state->my_ctl_pointer->n_sends;
allgather_posts = sg_state->pow_2_levels - 1;
total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ;
if (total_msg_posts > 0) {
mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
&sg_state->my_ctl_pointer->portals_buf_addr,
sg_state->my_userbuf, sg_state->fragment_size,
PTL_EQ_NONE,
total_msg_posts,
blocked_post,
PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET
| PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
sg_state->msg_posted = true ;
}
opal_atomic_wmb();
sg_state->my_ctl_pointer->flag = sg_state->ready_flag;
return OMPI_SUCCESS;
}
/*
* Internode Scatter: Get data from my parent and post for my children
*/
static inline __opal_attribute_always_inline__
int sm_portals_internode_scatter(sg_state_t *sg_state)
{
int scatter_posts, allgather_posts, extra_src_posts,
total_msg_posts;
uint64_t local_offset, remote_offset;
/* compute the size of the chunk to copy */
sg_state->length = (sg_state->parent_ctl_pointer->length)/
(1<<(sg_state->parent_ctl_pointer->n_sends - sg_state->my_ctl_pointer->n_sends));
sg_state->my_ctl_pointer->length = sg_state->length;
sg_state->my_ctl_pointer->offset =
sg_state->parent_ctl_pointer->offset + sg_state->length;
local_offset = sg_state->my_ctl_pointer->offset;
remote_offset = sg_state->parent_ctl_pointer->offset +
sg_state->length;
mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
sg_state->read_eq,
&sg_state->my_ctl_pointer->portals_buf_addr,
&sg_state->parent_ctl_pointer->portals_buf_addr,local_offset,
remote_offset,sg_state->length);
/* Now post the message for other children to read */
extra_src_posts = (sg_state->my_rank + sg_state->pow_2 <
sg_state->group_size ) ? 1: 0;
scatter_posts = sg_state->my_ctl_pointer->n_sends;
allgather_posts = sg_state->pow_2_levels - 1;
total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ;
if (total_msg_posts > 0) {
mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr,
sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length,
PTL_EQ_NONE,
total_msg_posts,
blocked_post,
PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE
| PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
sg_state->msg_posted = true;
}
/*
opal_atomic_wmb();
*/
sg_state->my_ctl_pointer->flag = sg_state->ready_flag;
return OMPI_SUCCESS;
}
/*
* Bcast's Allgather Phase:
* Combines data from all processes using recursive doubling algorithm
*/
static inline __opal_attribute_always_inline__
int sm_portals_bcasts_allgather_phase(sg_state_t *sg_state)
{
int ag_loop, partner;
volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive double */
for( ag_loop = 1; ag_loop < sg_state->pow_2_levels; ag_loop++) {
/* get my partner for this level */
partner = sg_state->my_rank^(1<<ag_loop);
partner_ctl_pointer = sg_state->ctl_structs[partner];
/* Block until partner is at this level of recursive-doubling stage */
while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag,
sg_state->sequence_number)) {
opal_progress();
}
assert(partner_ctl_pointer->flag >= sg_state->ready_flag);
if (partner_ctl_pointer->offset < sg_state->my_ctl_pointer->offset) {
sg_state->global_sg_offset -= sg_state->length;
sg_state->local_sg_offset = sg_state->global_sg_offset;
} else {
sg_state->local_sg_offset = sg_state->global_sg_offset + sg_state->length;
}
BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d",
partner, sg_state->length));
mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
sg_state->read_eq,
&sg_state->my_ctl_pointer->portals_buf_addr,
&partner_ctl_pointer->portals_buf_addr,sg_state->local_sg_offset,
sg_state->local_sg_offset, sg_state->length);
sg_state->ready_flag++;
opal_atomic_wmb();
sg_state->my_ctl_pointer->flag = sg_state->ready_flag;
/* Block until partner is at this level of recursive-doubling stage */
while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag,
sg_state->sequence_number)) {
opal_progress();
}
/* double the length */
sg_state->length *= 2;
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int init_sm_group_info(sg_state_t *sg_state, int buff_idx)
{
int idx, leading_dim;
int first_instance=0;
int flag_offset;
/* Get addressing information */
sg_state->group_size = sg_state->bcol_module->colls_no_user_data.size_of_group;
leading_dim = sg_state->bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
BASESMUMA_VERBOSE(1,("My buffer idx %d group size %d, leading dim %d, idx %d",
buff_idx,sg_state->group_size,leading_dim,idx));
/* grab the ctl buffs */
sg_state->ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
sg_state->bcol_module->colls_with_user_data.ctl_buffs+idx;
sg_state->my_rank = sg_state->bcol_module->super.sbgp_partner_module->my_index;
sg_state->my_ctl_pointer = sg_state->ctl_structs[sg_state->my_rank];
if (sg_state->my_ctl_pointer->sequence_number < sg_state->sequence_number) {
first_instance = 1;
}
if(first_instance) {
sg_state->my_ctl_pointer->flag = -1;
sg_state->my_ctl_pointer->index = 1;
sg_state->my_ctl_pointer->starting_flag_value = 0;
flag_offset = 0;
} else {
sg_state->my_ctl_pointer->index++;
}
/* For bcast we should have only one entry to this bcol
assert(sg_state->my_ctl_pointer->flag == -1);
*/
/* increment the starting flag by one and return */
flag_offset = sg_state->my_ctl_pointer->starting_flag_value;
sg_state->ready_flag = flag_offset + sg_state->sequence_number + 1;
sg_state->my_ctl_pointer->sequence_number = sg_state->sequence_number;
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int init_sm_portals_sg_info(sg_state_t *sg_state)
{
/* Get portals info*/
mca_bcol_basesmuma_portal_proc_info_t *portals_info;
int rc = OMPI_SUCCESS;
int sg_matchbits;
portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)sg_state->cs->portals_info;
sg_matchbits = sg_state->sequence_number ;
/* Construct my portal buffer address and copy to payload buffer */
mca_bcol_basesmuma_construct_portal_address(&sg_state->my_ctl_pointer->portals_buf_addr,
portals_info->portal_id.nid,
portals_info->portal_id.pid,
sg_matchbits,
sg_state->bcol_module->super.sbgp_partner_module->group_comm->c_contextid);
sg_state->my_ctl_pointer->portals_buf_addr.userbuf = sg_state->my_userbuf;
sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length = sg_state->fragment_size;
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int compute_src_from_root(int group_root, int my_group_rank, int pow2, int
group_size)
{
int root, relative_rank, src, i;
if (group_root < pow2) {
root = group_root;
} else {
/* the source of the data is an extra node;
the real root is represented by some rank from
the pow2 group */
root = group_root - pow2;
/* shortcut for the case when my rank is root for the group */
if (my_group_rank == root) {
return group_root;
}
}
relative_rank = (my_group_rank - root) < 0 ? my_group_rank - root + pow2 :
my_group_rank - root;
for (i = 1; i < pow2; i<<=1) {
if (relative_rank & i) {
src = my_group_rank ^ i;
if (src >= pow2)
src -= pow2;
return src;
}
}
return -1;
}
int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
#endif
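
The scatter-allgather broadcast above finishes with a recursive-doubling allgather: at level L each rank exchanges with the partner my_rank XOR (1 << L), and the amount of data it holds doubles after every exchange. A standalone sketch of that schedule, with the rank, level count, and fragment size chosen only for illustration:

#include <stdio.h>

int main(void)
{
    int my_rank = 5;              /* assumed rank within the power-of-two subgroup */
    int pow_2_levels = 4;         /* assumed: 16-rank group, log2(16) levels */
    unsigned long length = 4096;  /* assumed size of the scattered piece */

    for (int level = 1; level < pow_2_levels; level++) {
        int partner = my_rank ^ (1 << level);
        printf("level %d: fetch %lu bytes from rank %d\n", level, length, partner);
        length *= 2;  /* each exchange doubles the contiguous region this rank holds */
    }
    return 0;
}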

View file

@ -1,452 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
/* #define __PORTALS_AVAIL__ */
#ifdef __PORTALS_AVAIL__
#define __PORTALS_ENABLE__
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"
#include "bcol_basesmuma_portals.h"
/* debug */
#include <unistd.h>
/* end debug */
/**
* Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers.
* This routine assumes that buf (the input buffer) is a single-writer
* multi-reader (SWMR) shared memory buffer owned by the calling rank,
* which is the only rank that can write to this buffer.
* It is also assumed that the buffers are registered and fragmented
* at the ML level and that buf is sufficiently large to hold the data.
*
*
* @param buf - SWMR shared buffer within a sbgp that the
* executing rank can write to.
* @param count - the number of elements in the shared buffer.
* @param dtype - the datatype of a shared buffer element.
* @param root - the index within the sbgp of the root.
* @param module - basesmuma module.
*/
int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
#if 0
/* local variables */
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
int i, matched = 0;
int src=-1;
int group_size;
int my_rank, first_instance=0, flag_offset;
int rc = OMPI_SUCCESS;
int leading_dim, buff_idx, idx;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int64_t sequence_number=input_args->sequence_num;
volatile int64_t ready_flag;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char* parent_data_pointer;
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
void *userbuf = (void *)((unsigned char *)input_args->userbuf);
size_t pack_len = 0, dt_size;
struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL;
struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL;
mca_bcol_basesmuma_portal_proc_info_t *portals_info;
portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info;
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
pack_len=count*dt_size;
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload;
/* setup resource recycling */
if( my_ctl_pointer->sequence_number < sequence_number ) {
first_instance=1;
}
if( first_instance ) {
/* Signal arrival */
my_ctl_pointer->flag = -1;
my_ctl_pointer->index=1;
/* this does not need to use any flag values , so only need to
* set the value for subsequent values that may need this */
my_ctl_pointer->starting_flag_value=0;
flag_offset=0;
} else {
/* only one thread at a time will be making progress on this
* collective, so no need to make this atomic */
my_ctl_pointer->index++;
}
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value;
ready_flag = flag_offset + sequence_number + 1;
my_ctl_pointer->sequence_number = sequence_number;
/* Construct my portal buffer address and copy to payload buffer */
mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer,
portals_info->portal_id.nid,
portals_info->portal_id.pid,
sequence_number,
bcol_module->super.sbgp_partner_module->group_comm->c_contextid);
/* non-blocking broadcast algorithm */
/* If I am the root, then signal ready flag */
if(input_args->root_flag) {
ptl_handle_eq_t eq_h;
ptl_event_t event;
int ret;
BASESMUMA_VERBOSE(10,("I am the root of the data"));
/* create an event queue for the incoming buffer */
ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h);
if (ret != PTL_OK) {
fprintf(stderr, "PtlEQAlloc() failed: %d \n",ret);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Post the message using portal copy */
mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
pack_len, eq_h, my_lmsg_ctl_pointer->nsends);
/*
* signal ready flag
*/
my_ctl_pointer->flag = ready_flag;
/* wait for a response from the client */
mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
&event, my_lmsg_ctl_pointer->nsends);
/* free the event queue */
ret = PtlEQFree(eq_h);
if (ret != PTL_OK) {
fprintf(stderr, "PtlEQFree() failed: %d )\n",ret);
}
/* root is finished */
goto Release;
}
/* If I am not the root, then poll on possible "senders'" control structs */
for( i = 0; i < cs->num_to_probe && 0 == matched; i++) {
/* Shared memory iprobe */
/*
BCOL_BASESMUMA_SM_PROBE(bcol_module->src, bcol_module->src_size,
my_rank, matched, src);
*/
do {
int j, n_src, my_index;
n_src = bcol_module->src_size;
for( j = 0; j < n_src; j++) {
parent_ctl_pointer = data_buffs[bcol_module->src[j]].ctl_struct;
parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *)
data_buffs[bcol_module->src[j]].payload;
if (IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) {
src = bcol_module->src[j];
matched = 1;
break;
}
}
} while(0);
}
/* If not matched, then hop out and put me on progress list */
if(0 == matched ) {
BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
return BCOL_FN_NOT_STARTED;
}
/* else, we found our root within the group ... */
BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src));
/* receive the data from sender */
/* get the data buff */
/* taken care of in the macro */
/*parent_data_pointer = data_buffs[src].payload;*/
/* copy the data */
mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len);
/* set the memory barrier to ensure completion */
opal_atomic_wmb ();
/* signal that I am done */
my_ctl_pointer->flag = ready_flag;
/* am I the last one? If so, release buffer */
Release:
my_ctl_pointer->starting_flag_value++;
return BCOL_FN_COMPLETE;
#endif
}
#if 0
#define BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index, \
my_group_index, group_size,sm_data_buffs,sender_ready_flag, \
num_pending_sends) \
{ \
int k, rc; \
int dst; \
int comm_dst; \
volatile mca_bcol_basesmuma_header_t *recv_ctl_pointer = NULL; \
volatile mca_bcol_basesmuma_portal_buf_addr_t *recv_lmsg_ctl_pointer = NULL; \
\
num_pending_sends = 0; \
while(radix_mask > 0) { \
/* For each level of tree, do sends */ \
for (k = 1; \
k < radix && my_relative_index + radix_mask * k < group_size; \
++k) { \
\
dst = my_group_index + radix_mask * k; \
if (dst >= group_size) { \
dst -= group_size; \
} \
/* Signal the children to get data */ \
recv_ctl_pointer = data_buffs[dst].ctl; \
recv_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) \
data_buffs[dst].payload; \
recv_lmsg_ctl_pointer->src_index = my_group_index; \
recv_lmsg_ctl_pointer->flag = sender_ready_flag; \
++num_pending_sends; \
} \
radix_mask /= radix; \
} \
\
}
int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
mca_bcol_basesmuma_module_t* bcol_module=
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
int i, matched = 0;
int src=-1;
int group_size;
int my_rank, first_instance=0, flag_offset;
int rc = OMPI_SUCCESS;
int leading_dim, buff_idx, idx;
int count=input_args->count;
struct ompi_datatype_t* dtype=input_args->dtype;
int64_t sequence_number=input_args->sequence_num;
volatile int64_t ready_flag;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char* parent_data_pointer;
volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
void *userbuf = (void *)((unsigned char *)input_args->userbuf);
size_t pack_len = 0, dt_size;
struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL;
struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL;
mca_bcol_basesmuma_portal_proc_info_t *portals_info;
portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info;
/* we will work only on packed data - so compute the length*/
ompi_datatype_type_size(dtype, &dt_size);
pack_len=count*dt_size;
buff_idx = input_args->src_desc->buffer_index;
/* Get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
group_size = bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload;
/* setup resource recycling */
if( my_ctl_pointer->sequence_number < sequence_number ) {
first_instance=1;
}
if( first_instance ) {
/* Signal arrival */
my_ctl_pointer->flag = -1;
my_ctl_pointer->index=1;
/* this does not need to use any flag values , so only need to
* set the value for subsequent values that may need this */
my_ctl_pointer->starting_flag_value=0;
flag_offset=0;
} else {
/* only one thread at a time will be making progress on this
* collective, so no need to make this atomic */
my_ctl_pointer->index++;
}
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value;
ready_flag = flag_offset + sequence_number + 1;
my_ctl_pointer->sequence_number = sequence_number;
/* Construct my portal buffer address and copy to payload buffer */
mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer,
portals_info->portal_id.nid,
portals_info->portal_id.pid,
sequence_number,
bcol_module->super.sbgp_partner_module->group_comm->c_contextid);
my_lmsg_ctl_pointer->userbuf = userbuf;
my_lmsg_ctl_pointer->userbuf_length = pack_len;
/* create an event queue */
ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h);
/* non-blocking broadcast algorithm */
/* If I am the root, then signal ready flag */
if(input_args->root_flag) {
ptl_handle_eq_t eq_h;
ptl_event_t event;
int ret;
int root_radix_mask = sm_module->pow_knum;
BASESMUMA_VERBOSE(10,("I am the root of the data"));
if (ret != PTL_OK) {
fprintf(stderr, "PtlEQAlloc() failed: %d \n",ret);
return OMPI_ERR_OUT_OF_RESOURCE;
}
BASESMUMA_K_NOMIAL_SEND_SIGNAL(root_radix_mask, radix, 0,
my_rank, group_size, data_buffs, ready_flag, nsends) ;
mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
pack_len, eq_h, nsends);
/* wait for a response from the client */
mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
&event, nsends);
/* root is finished */
goto Release;
}
/* I'm not the root, so wait until someone posts data and
* compute where to get the data from */
while (my_ctl_pointer->flag != ready_flag) ;
my_data_source_index = lmsg_ctl_pointer->src_index;
parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *)
data_buffs[my_data_source_index].payload;
mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len);
/* I am done getting data, should I send the data to someone */
my_relative_index = (my_rank - my_data_source_index) < 0 ? my_rank -
my_data_source_index + group_size : my_rank - my_data_source_index;
/*
* 2. Locate myself in the tree:
* calculate number of radix steps that we should to take
*/
radix_mask = 1;
while (radix_mask < group_size) {
if (0 != my_relative_index % (radix * radix_mask)) {
/* I found my level in tree */
break;
}
radix_mask *= radix;
}
/* go one step back */
radix_mask /=radix;
BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index,
my_rank, group_size,data_buffs,ready_flag,nsends)
mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
pack_len, eq_h, nsends);
/* wait for children to read */
mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
&event, nsends);
Release:
/* free the event queue */
ret = PtlEQFree(eq_h);
if (ret != PTL_OK) {
fprintf(stderr, "PtlEQFree() failed: %d )\n",ret);
}
my_ctl_pointer->starting_flag_value++;
return BCOL_FN_COMPLETE;
}
#endif
#endif
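
The BASESMUMA_K_NOMIAL_SEND_SIGNAL macro above walks the k-nomial tree from the largest radix power down, signalling up to radix-1 ranks at each level, each offset from the sender by the current radix mask. A rough standalone sketch of which ranks get signalled (hypothetical demo, not part of the bcol code):

#include <stdio.h>

static void demo_knomial_targets(int my_index, int my_relative_index,
                                 int radix, int radix_mask, int group_size)
{
    while (radix_mask > 0) {
        for (int k = 1; k < radix && my_relative_index + radix_mask * k < group_size; ++k) {
            int dst = (my_index + radix_mask * k) % group_size;  /* wrap like the macro does */
            printf("signal rank %d (mask %d)\n", dst, radix_mask);
        }
        radix_mask /= radix;
    }
}

int main(void)
{
    /* e.g. the root of an 8-rank radix-2 tree signals ranks 4, 2, 1 */
    demo_knomial_targets(0 /* my_index */, 0 /* relative index */, 2, 8, 8);
    return 0;
}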

View file

@ -1,101 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "bcol_basesmuma.h"
/* Shared memory registration function: Calls into the "shared memory
connection manager" (aka - smcm) and registers a chunk of memory by
opening and mmaping a file.
@input:
void *context_data - shared memory specific data needed by the registration
function.
void *base - pointer to the base memory address.
size_t size - size of the memory chunk to be registered with sm.
void **reg_desc - registration data is cached here.
@output:
returns OMPI_SUCCESS on successful registration.
returns OMPI_ERROR on failure.
*/
int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size,
void **reg_desc)
{
/* local variables */
int ret = OMPI_SUCCESS;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
bcol_basesmuma_registration_data_t *sm_reg =
(bcol_basesmuma_registration_data_t*) context_data;
/* cache some info on sm_reg aka "context_data", you'll need it later */
sm_reg->base_addr = base;
sm_reg->size = size;
/* call into the shared memory registration function in smcm
* we need to be sure that the memory is page aligned in order
* to "map_fixed"
*/
sm_reg->sm_mmap = bcol_basesmuma_smcm_mem_reg(base, size,
sm_reg->data_seg_alignment,
sm_reg->file_name);
if(NULL == sm_reg->sm_mmap) {
opal_output (ompi_bcol_base_framework.framework_output, "Bcol_basesmuma memory registration error");
return OMPI_ERROR;
}
/* don't let other communicators re-register me! */
cs->mpool_inited = true;
/* alias back to component */
cs->sm_payload_structs = sm_reg->sm_mmap;
return ret;
}
/* Shared memory deregistration function - deregisters memory by munmapping it and removing the
shared memory file.
Basic steps (please let me know if this is incompatible with your notion of deregistration
or if it causes problems on cleanup):
1. munmap the shared memory file.
2. set the base pointer to the mmaped memory to NULL.
3. permanently remove the shared memory file from the directory.
*/
int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg)
{
/* local variables */
bcol_basesmuma_registration_data_t *sm_reg =
(bcol_basesmuma_registration_data_t*) context_data;
if (sm_reg->sm_mmap) {
OBJ_RELEASE(sm_reg->sm_mmap);
}
/* set the pointer to NULL */
sm_reg->base_addr = NULL;
return OMPI_SUCCESS;
}
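
The smcm routine called above, bcol_basesmuma_smcm_mem_reg, is not part of this hunk; as a rough illustration of the open-and-mmap pattern the comment describes, here is a minimal POSIX sketch with a hypothetical backing-file name and only basic error handling:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    const char *file_name = "/tmp/bcol_demo_payload";  /* hypothetical path */
    size_t size = 1 << 20;                             /* 1 MiB backing region */

    int fd = open(file_name, O_CREAT | O_RDWR, 0600);
    if (fd < 0) { perror("open"); return 1; }

    /* size the backing file, then map it shared so other local ranks can attach */
    if (ftruncate(fd, (off_t) size) != 0) { perror("ftruncate"); return 1; }
    void *base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (MAP_FAILED == base) { perror("mmap"); return 1; }

    printf("mapped %zu bytes at %p backed by %s\n", size, base, file_name);

    munmap(base, size);
    close(fd);
    unlink(file_name);
    return 0;
}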

View file

@ -1,687 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/util/show_help.h"
#include "opal/align.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h"
#include "bcol_basesmuma.h"
#include "bcol_basesmuma_utils.h"
#ifdef __PORTALS_AVAIL__
#include "bcol_basesmuma_portals.h"
#endif
/*
* Local functions
*/
static int alloc_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module)
{
int rc = OMPI_SUCCESS, i = 0;
netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree;
int n_exchanges = k_node->n_exchanges;
/* Precalculate the allreduce offsets */
if (0 < k_node->n_exchanges) {
sm_module->reduce_offsets = (int **)malloc(n_exchanges * sizeof(int*));
if (!sm_module->reduce_offsets) {
rc = OMPI_ERROR;
return rc;
}
for (i=0; i < n_exchanges ; i++) {
sm_module->reduce_offsets[i] = (int *)malloc (sizeof(int) * NOFFSETS);
if (!sm_module->reduce_offsets[i]){
rc = OMPI_ERROR;
return rc;
}
}
}
return rc;
}
static int free_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module)
{
int rc = OMPI_SUCCESS, i = 0;
netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree;
int n_exchanges = k_node->n_exchanges;
if (sm_module->reduce_offsets) {
for (i=0; i < n_exchanges; i++) {
free (sm_module->reduce_offsets[i]);
}
free(sm_module->reduce_offsets);
}
return rc;
}
static void
mca_bcol_basesmuma_module_construct(mca_bcol_basesmuma_module_t *module)
{
/* initialize all values to 0 */
memset((void*)((uintptr_t) module + sizeof (module->super)), 0, sizeof (*module) - sizeof (module->super));
module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_basesmuma_component;
module->super.list_n_connected = NULL;
module->super.hier_scather_offset = 0;
}
static void
mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
{
/* local variables */
mca_sbgp_base_module_t *sbgp_module = sm_module->super.sbgp_partner_module;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/*
* release allocated resources
*/
/* ...but not until you're sure you have no outstanding collectives */
while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) {
opal_progress();
}
#ifdef __PORTALS_AVAIL__
/* Remove portals bcast specific resources */
if ( PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) {
BASESMUMA_VERBOSE(10,("PtlEQFree() failed: )"));
}
#endif
/* Remove Lmsg Reduce Offsets Array */
free_lmsg_reduce_offsets_array(sm_module);
/* collective topology data */
if( sm_module->fanout_read_tree) {
for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->fanout_read_tree[i].n_children ) {
free(sm_module->fanout_read_tree[i].children_ranks);
sm_module->fanout_read_tree[i].children_ranks=NULL;
}
}
free(sm_module->fanout_read_tree);
sm_module->fanout_read_tree=NULL;
}
/* gvm Leak FIX: Reduction_tree[].children_ranks has
* to be removed. I don't know how to get the size (which is
* the size of the subgroup) of the reduction_tree array
*/
if( sm_module->reduction_tree) {
for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->reduction_tree[i].n_children ) {
free(sm_module->reduction_tree[i].children_ranks);
sm_module->reduction_tree[i].children_ranks=NULL;
}
}
free(sm_module->reduction_tree);
sm_module->reduction_tree=NULL;
}
/* gvm Leak FIX */
if (sm_module->fanout_node.children_ranks){
free(sm_module->fanout_node.children_ranks);
sm_module->fanout_node.children_ranks = NULL;
}
if (sm_module->fanin_node.children_ranks){
free(sm_module->fanin_node.children_ranks);
sm_module->fanin_node.children_ranks = NULL;
}
/* colls_no_user_data resources */
if(sm_module->colls_no_user_data.ctl_buffs_mgmt){
free(sm_module->colls_no_user_data.ctl_buffs_mgmt);
sm_module->colls_no_user_data.ctl_buffs_mgmt=NULL;
}
if(sm_module->colls_no_user_data.ctl_buffs){
free(sm_module->colls_no_user_data.ctl_buffs);
sm_module->colls_no_user_data.ctl_buffs=NULL;
}
/* return control */
opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->no_userdata_ctl);
/* colls_with_user_data resources */
/*
*debug print */
/*
fprintf(stderr,"AAA colls_with_user_data.ctl_buffs %p \n",
sm_module->colls_with_user_data.ctl_buffs_mgmt);
end debug */
if(sm_module->colls_with_user_data.ctl_buffs_mgmt){
free(sm_module->colls_with_user_data.ctl_buffs_mgmt);
sm_module->colls_with_user_data.ctl_buffs_mgmt=NULL;
}
if(sm_module->colls_with_user_data.ctl_buffs){
free(sm_module->colls_with_user_data.ctl_buffs);
sm_module->colls_with_user_data.ctl_buffs=NULL;
}
if(sm_module->shared_memory_scratch_space) {
free(sm_module->shared_memory_scratch_space);
sm_module->shared_memory_scratch_space=NULL;
}
/* return control */
opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->userdata_ctl);
#if 1
if(sm_module->scatter_kary_tree) {
for (int i = 0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
if(0 < sm_module->scatter_kary_tree[i].n_children) {
free(sm_module->scatter_kary_tree[i].children_ranks);
sm_module->scatter_kary_tree[i].children_ranks=NULL;
}
}
free(sm_module->scatter_kary_tree);
}
#endif
if(NULL != sm_module->super.list_n_connected ){
free(sm_module->super.list_n_connected);
sm_module->super.list_n_connected = NULL;
}
cleanup_nb_coll_buff_desc(&sm_module->ml_mem.nb_coll_desc,
sm_module->ml_mem.num_banks,
sm_module->ml_mem.num_buffers_per_bank);
for (int i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++){
/* gvm FIX: Go through the list and destroy each item */
/* Destroy the function table object for each bcol type list */
OPAL_LIST_DESTRUCT((&sm_module->super.bcol_fns_table[i]));
}
if (NULL != sm_module->payload_backing_files_info) {
bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list,
&sm_module->payload_backing_files_info);
}
if (NULL != sm_module->ctl_backing_files_info) {
bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list,
&sm_module->ctl_backing_files_info);
}
if (NULL != sm_module->ml_mem.bank_release_counter) {
free(sm_module->ml_mem.bank_release_counter);
sm_module->ml_mem.bank_release_counter = NULL;
}
if (NULL != sm_module->colls_with_user_data.data_buffs) {
free((void *)sm_module->colls_with_user_data.data_buffs);
sm_module->colls_with_user_data.data_buffs = NULL;
}
/* free the k-nomial allgather tree here */
netpatterns_cleanup_recursive_knomial_allgather_tree_node(&sm_module->knomial_allgather_tree);
netpatterns_cleanup_recursive_doubling_tree_node(&sm_module->recursive_doubling_tree);
netpatterns_cleanup_recursive_knomial_tree_node(&sm_module->knomial_exchange_tree);
/* done */
}
static void bcol_basesmuma_set_small_msg_thresholds(struct mca_bcol_base_module_t *super)
{
mca_bcol_basesmuma_module_t *basesmuma_module =
(mca_bcol_basesmuma_module_t *) super;
size_t basesmuma_offset = bcol_basesmuma_data_offset_calc(basesmuma_module);
/* Set the Allreduce threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_ALLREDUCE] =
basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
/* Set the Bcast threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_BCAST] =
basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
/* Set the Gather threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_GATHER] =
(basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);
/* Set the ALLgather threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_ALLGATHER] =
(basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);
/* Set the Reduce threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_REDUCE] =
basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
/* Set the Scatter threshold, for Basesmuma it equals to ML buffer size - data offset */
super->small_message_thresholds[BCOL_SCATTER] =
basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
}
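
Every threshold above is the ML payload buffer size minus the basesmuma data offset, with the gather and allgather thresholds further divided by the communicator size. A back-of-the-envelope sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
    size_t buffer_size = 65536;   /* assumed ML payload buffer size */
    size_t data_offset = 512;     /* assumed basesmuma data offset */
    int    comm_size   = 16;      /* assumed ranks in the shared-memory group */

    size_t usable = buffer_size - data_offset;
    printf("bcast/allreduce/reduce/scatter threshold: %zu bytes\n", usable);
    printf("gather/allgather threshold: %zu bytes\n", usable / comm_size);
    return 0;
}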
/* setup memory management and collective routines */
static void load_func(mca_bcol_base_module_t *super)
{
int fnc;
/* Loading memory management and collective functions */
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
super->bcol_function_table[fnc] = NULL;
}
/*super->bcol_function_table[BCOL_BARRIER] = bcol_basesmuma_recursive_double_barrier;*/
#ifdef __PORTALS_AVAIL__
super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_lmsg_scatter_allgather_portals_bcast;
/* super->bcol_function_table[BCOL_BCAST] =
bcol_basesmuma_lmsg_bcast_k_nomial_anyroot; */
#endif
/*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;*/
/*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_binary_scatter_allgather_segment;*/
/*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast_k_nomial_anyroot;*/
super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;
#ifdef __PORTALS_AVAIL__
super->bcol_function_table[BCOL_BCAST] =
bcol_basesmuma_lmsg_scatter_allgather_portals_bcast;
#endif
/* super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_fanin_fanout; */
super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_recursive_doubling;
super->bcol_function_table[BCOL_REDUCE] = bcol_basesmuma_reduce_intra_fanin_old;
/* memory management */
super->bcol_memory_init = bcol_basesmuma_bank_init_opti;
super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree;
/* Set thresholds */
super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds;
}
static void load_func_with_choices(mca_bcol_base_module_t *super)
{
int fnc;
/* Loading memory management and collective functions */
for (fnc=0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
super->bcol_function_init_table[fnc] = NULL;
}
super->bcol_function_init_table[BCOL_FANIN] = bcol_basesmuma_fanin_init;
super->bcol_function_init_table[BCOL_FANOUT] = bcol_basesmuma_fanout_init;
super->bcol_function_init_table[BCOL_BARRIER] = bcol_basesmuma_barrier_init;
super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init;
super->bcol_function_init_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_init;
super->bcol_function_init_table[BCOL_REDUCE] = bcol_basesmuma_reduce_init;
super->bcol_function_init_table[BCOL_GATHER] = bcol_basesmuma_gather_init;
super->bcol_function_init_table[BCOL_ALLGATHER] = bcol_basesmuma_allgather_init;
super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init;
/* memory management */
super->bcol_memory_init = bcol_basesmuma_bank_init_opti;
super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree;
}
static int load_recursive_knomial_info(mca_bcol_basesmuma_module_t
*sm_module)
{
int rc = OMPI_SUCCESS;
rc = netpatterns_setup_recursive_knomial_tree_node(sm_module->super.sbgp_partner_module->group_size,
sm_module->super.sbgp_partner_module->my_index,
mca_bcol_basesmuma_component.k_nomial_radix,
&sm_module->knomial_exchange_tree);
return rc;
}
int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super)
{
mca_bcol_basesmuma_module_t *sm_module = (mca_bcol_basesmuma_module_t *) super;
return netpatterns_setup_recursive_knomial_allgather_tree_node(sm_module->super.sbgp_partner_module->group_size,
sm_module->super.sbgp_partner_module->my_index,
mca_bcol_basesmuma_component.k_nomial_radix,
super->list_n_connected,
&sm_module->knomial_allgather_tree);
}
/* query to see if the module is available for use on the given
* communicator, and if so, what it's priority is. This is where
* the backing shared-memory file is created.
*/
mca_bcol_base_module_t **
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules)
{
/* local variables */
mca_bcol_base_module_t **sm_modules = NULL;
mca_bcol_basesmuma_module_t *sm_module;
bcol_basesmuma_registration_data_t *sm_reg_data;
int ret, my_rank, name_length;
char *name;
int i;
int bcast_radix;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/*mca_base_component_list_item_t *hdl_cli = NULL;*/
/*int hdl_num;*/
/* at this point I think there is only a single shared
memory bcol that we need to be concerned with */
/* No group, no modules */
if (OPAL_UNLIKELY(NULL == module)) {
return NULL;
}
/* allocate and initialize an sm_bcol module */
sm_module = OBJ_NEW(mca_bcol_basesmuma_module_t);
/* set the subgroup */
sm_module->super.sbgp_partner_module=module;
(*num_modules)=1;
cs->super.n_net_contexts = *num_modules;
sm_module->reduction_tree = NULL;
sm_module->fanout_read_tree = NULL;
ret=netpatterns_setup_recursive_doubling_tree_node(
module->group_size,module->my_index,
&(sm_module->recursive_doubling_tree));
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "Error setting up recursive_doubling_tree \n");
return NULL;
}
/* setup the fanin tree - this is used only as part of a hierarchical
* barrier, so will set this up with rank 0 as the root */
my_rank=module->my_index;
ret=netpatterns_setup_narray_tree(cs->radix_fanin,
my_rank,module->group_size,&(sm_module->fanin_node));
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanin tree \n");
return NULL;
}
/* setup the fanout tree - this is used only as part of a hierarchical
* barrier, so will set this up with rank 0 as the root */
ret=netpatterns_setup_narray_tree(cs->radix_fanout,
my_rank,module->group_size,&(sm_module->fanout_node));
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanout tree \n");
return NULL;
}
/*
* Setup the broadcast tree - this is used only as part of a hierarchical
* bcast, so will set this up with rank 0 as the root.
*/
/* set the radix of the bcast tree */
bcast_radix = cs->radix_read_tree;
/* initialize fan-out read tree */
sm_module->fanout_read_tree=(netpatterns_tree_node_t*) malloc(
sizeof(netpatterns_tree_node_t)*module->group_size);
if( NULL == sm_module->fanout_read_tree ) {
goto Error;
}
for(i = 0; i < module->group_size; i++){
ret = netpatterns_setup_narray_tree(bcast_radix,
i, module->group_size, &(sm_module->fanout_read_tree[i]));
if(OMPI_SUCCESS != ret) {
goto Error;
}
}
ret = load_recursive_knomial_info(sm_module);
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree"));
goto Error;
}
/* Allocate offsets array for lmsg reduce */
ret = alloc_lmsg_reduce_offsets_array(sm_module);
if (OMPI_SUCCESS != ret) {
BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array"));
goto Error;
}
/* initialize reduction tree */
sm_module->reduction_tree=(netpatterns_tree_node_t *) malloc(
sizeof(netpatterns_tree_node_t )*module->group_size);
if( NULL == sm_module->reduction_tree ) {
goto Error;
}
ret=netpatterns_setup_multinomial_tree(
cs->order_reduction_tree,module->group_size,
sm_module->reduction_tree);
if( MPI_SUCCESS != ret ) {
goto Error;
}
/* get largest power of k for given group size */
sm_module->pow_k_levels = pow_sm_k(cs->k_nomial_radix,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->pow_k));
/* get largest power of 2 for a given group size
* used in scatter allgather
*/
sm_module->pow_2_levels = pow_sm_k(2,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->pow_2));
/*
* setup scatter data
*/
sm_module->scatter_kary_radix=cs->scatter_kary_radix;
sm_module->scatter_kary_tree=NULL;
ret=netpatterns_setup_narray_tree_contigous_ranks(
sm_module->scatter_kary_radix,
sm_module->super.sbgp_partner_module->group_size,
&(sm_module->scatter_kary_tree));
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "In mca_bcol_basesmuma_comm_query, scatter k-ary tree setup failed \n");
return NULL;
}
/* setup the module shared memory management */
ret=base_bcol_basesmuma_setup_library_buffers(sm_module, cs);
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_basesmuma_setup_library_buffers, the mpool was not successfully set up!\n");
return NULL;
}
/* setup the collectives and memory management */
/* check to see whether or not the mpool has been inited */
/* allocate some space for the network contexts */
if(!cs->mpool_inited) {
/* if it's empty, then fill it for first time */
cs->super.network_contexts = (bcol_base_network_context_t **)
malloc((cs->super.n_net_contexts)*
sizeof(bcol_base_network_context_t *));
/* you need to do some basic setup - define the file name,
* set data seg alignment and size of cntl structure in sm
* file.
*/
/* give the payload sm file a name */
name_length=asprintf(&name,
"%s"OPAL_PATH_SEP"0%s%0d",
ompi_process_info.job_session_dir,
cs->payload_base_fname,
(int)getpid());
if( 0 > name_length ) {
opal_output (ompi_bcol_base_framework.framework_output, "Failed to assign the shared memory payload file a name\n");
return NULL;
}
/* make sure name is not too long */
if ( OPAL_PATH_MAX < (name_length-1) ) {
opal_output (ompi_bcol_base_framework.framework_output, "Shared memory file name is too long!\n");
return NULL;
}
/* set the name and alignment characteristics */
sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc(
sizeof(bcol_basesmuma_registration_data_t));
sm_reg_data->file_name = name;
sm_reg_data->data_seg_alignment = getpagesize();
sm_reg_data->size_ctl_structure = 0;
cs->super.network_contexts[0] = (bcol_base_network_context_t *)
malloc(sizeof(bcol_base_network_context_t));
cs->super.network_contexts[0]->context_data =
(void *) sm_reg_data;
cs->super.network_contexts[0]->
register_memory_fn = mca_bcol_basesmuma_register_sm;
cs->super.network_contexts[0]->
deregister_memory_fn = mca_bcol_basesmuma_deregister_sm;
sm_module->super.network_context = cs->super.network_contexts[0];
} else {
sm_module->super.network_context = cs->super.network_contexts[0];
}
/* Set the header size */
sm_module->super.header_size = sizeof(mca_bcol_basesmuma_header_t);
/*initialize the hdl module if it's to be enabled*/
#if 0
if (module->use_hdl) {
sm_module->super.use_hdl = module->use_hdl;
hdl_cli = (mca_base_component_list_item_t *)
opal_list_get_first(&mca_hdl_base_components_in_use);
sm_module->hdl_module = ((mca_hdl_base_component_t*)
hdl_cli->cli_component)->hdl_comm_query(sm_module, &hdl_num);
if (1 != hdl_num || sm_module->hdl_module == NULL) {
ML_ERROR(("hdl modules are not successfully initialized!\n"));
goto Error;
}
} else {
sm_module->hdl_module = NULL;
}
#else
sm_module->hdl_module = NULL;
#endif
/* collective setup */
load_func(&(sm_module->super));
load_func_with_choices(&(sm_module->super));
/*
* This initializes all collective algorithms
*/
ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super));
if (OMPI_SUCCESS != ret) {
goto Error;
}
sm_module->super.supported_mode = 0;
/* NTH: this is not set anywhere on the trunk as of 08/13/13 */
#if 0
if (module->use_hdl) {
sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
}
#endif
/* Initializes portals library required for basesmuma large message */
#ifdef __PORTALS_AVAIL__
/* Enable zero copy mode */
sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
ret = mca_bcol_basesmuma_portals_init(cs);
if (OMPI_SUCCESS != ret) {
return NULL;
}
sm_module->sg_state.phase = INIT;
ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q,
PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq);
if (ret != PTL_OK) {
BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d",ret));
return NULL;
}
#endif
/* blocking recursive double barrier test */
/*
{
opal_output (ompi_bcol_base_framework.framework_output, "BBB About to hit the barrier test\n");
int rc;
bcol_function_args_t bogus;
rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super));
rc = bcol_basesmuma_recursive_double_barrier(
&bogus, &(sm_module->super));
}
*/
/* in this case we only expect a single network context.
in the future we should loop over the contexts */
sm_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *));
if( !sm_modules ) {
opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_masesmuma_setup_library_buffers failed to allocate memory for sm_modules\n");
return NULL;
}
sm_modules[0] = &(sm_module->super);
return sm_modules;
Error:
/* cleanup */
if( sm_module->reduction_tree ) {
free(sm_module->reduction_tree);
sm_module->reduction_tree=NULL;
}
return NULL;
}
OBJ_CLASS_INSTANCE(mca_bcol_basesmuma_module_t,
mca_bcol_base_module_t,
mca_bcol_basesmuma_module_construct,
mca_bcol_basesmuma_module_destruct);

View file

@ -1,74 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "bcol_basesmuma.h"
/* the progress function to be called from the opal progress function
*/
int bcol_basesmuma_progress(void)
{
/* local variables */
volatile int32_t *cntr;
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
/* check to see if release of memory blocks needs to be done */
if( opal_list_get_size(&(cs->nb_admin_barriers)) ) {
sm_nbbar_desc_t *item_ptr;
opal_list_t *list=&(cs->nb_admin_barriers);
/* process only if the list is non-empty */
if( !OPAL_THREAD_TRYLOCK(&cs->nb_admin_barriers_mutex)) {
for (item_ptr = (sm_nbbar_desc_t*) opal_list_get_first(list);
item_ptr != (sm_nbbar_desc_t*) opal_list_get_end(list);
item_ptr = (sm_nbbar_desc_t*) opal_list_get_next(item_ptr) )
{
bcol_basesmuma_rd_nb_barrier_progress_admin(item_ptr);
/* check to see if the barrier is complete */
if( NB_BARRIER_DONE == item_ptr->collective_phase ) {
/* barrier is complete - remove from the list. No need
* to put it on another list, as it is part of the memory
* bank control structure, and will be picked up
* again when needed.
*/
int index=
item_ptr->pool_index;
/* old way - ctl_struct specific */
/*
volatile uint64_t *cntr= (volatile uint64_t *)
&(item_ptr->sm_module->colls_no_user_data.
ctl_buffs_mgmt[index].bank_gen_counter);
*/
cntr= (volatile int32_t *) &(item_ptr->coll_buff->
ctl_buffs_mgmt[index].bank_gen_counter);
item_ptr=(sm_nbbar_desc_t*)opal_list_remove_item((opal_list_t *)list,
( opal_list_item_t *)item_ptr);
/* increment the generation number */
OPAL_THREAD_ADD32(cntr,1);
}
}
OPAL_THREAD_UNLOCK(&cs->nb_admin_barriers_mutex);
}
}
return OMPI_SUCCESS;
}
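/*
 * Illustrative sketch (assumption, not part of the removed file): the shape of
 * the progress loop above - walk a list of pending nonblocking barrier
 * descriptors, advance each one, and when one completes unlink it and bump the
 * generation counter of its memory bank. Plain C with a hypothetical
 * singly-linked list standing in for opal_list_t:
 */
#include <stdbool.h>
#include <stdio.h>

struct pending_desc {
    struct pending_desc *next;
    int remaining_steps;          /* stand-in for the barrier phases */
    int *bank_gen_counter;        /* counter to bump on completion */
};

static bool advance(struct pending_desc *d)
{
    if (d->remaining_steps > 0) {
        d->remaining_steps--;
    }
    return 0 == d->remaining_steps;
}

static void progress_sketch(struct pending_desc **head)
{
    struct pending_desc **link = head;
    while (NULL != *link) {
        struct pending_desc *d = *link;
        if (advance(d)) {
            *link = d->next;              /* unlink the finished descriptor */
            (*d->bank_gen_counter)++;     /* release the bank for reuse */
        } else {
            link = &d->next;
        }
    }
}

int main(void)
{
    int gen = 0;
    struct pending_desc b = { NULL, 2, &gen };
    struct pending_desc a = { &b, 1, &gen };
    struct pending_desc *head = &a;
    while (NULL != head) {
        progress_sketch(&head);
    }
    printf("generation counter = %d\n", gen);   /* prints 2 */
    return 0;
}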

View file

@ -1,218 +0,0 @@
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* Recursive doubling blocking barrier */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/sys/atomic.h"
#include "bcol_basesmuma.h"
#if 0
int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange, flag_to_set;
int pair_rank, flag_offset;
mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
netpatterns_pair_exchange_node_t *my_exchange_node;
int extra_rank, my_rank, pow_2;
volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl;
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl;
int64_t sequence_number;
bool found;
int buff_index, first_instance=0;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
#if 0
fprintf(stderr,"Entering the sm rd barrier\n");
fflush(stderr);
#endif
/* get the pointer to the segment of control structures */
my_exchange_node=&(bcol_module->recursive_doubling_tree);
my_rank=bcol_module->super.sbgp_partner_module->my_index;
pow_2=bcol_module->super.sbgp_partner_module->pow_2;
/* figure out what instance of the basesmuma bcol I am */
leading_dim=bcol_module->colls_no_user_data.size_of_group;
sequence_number=input_args->sequence_num - c_input_args->bcol_module->squence_number_offset;
buff_index=sequence_number & (bcol_module->colls_no_user_data.mask);
idx=SM_ARRAY_INDEX(leading_dim,buff_index,0);
ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **)
bcol_module->colls_no_user_data.ctl_buffs+idx;
my_ctl=ctl_structs[my_rank];
if( my_ctl->sequence_number < sequence_number ) {
first_instance=1;
}
/* get the pool index */
if( first_instance ) {
idx = -1;
while( idx == -1 ) {
idx=bcol_basesmuma_get_buff_index(
&(bcol_module->colls_no_user_data),sequence_number);
}
if( -1 == idx ){
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
my_ctl->index=1;
/* this does not need to use any flag values, so we only need to
* set the value for subsequent calls that may need it */
my_ctl->starting_flag_value=0;
flag_offset=0;
} else {
/* only one thread at a time will be making progress on this
* collective, so no need to make this atomic */
my_ctl->index++;
flag_offset=my_ctl->starting_flag_value;
}
/* signal that I have arrived */
my_ctl->flag = -1;
/* don't need to set this flag anymore */
my_ctl->sequence_number = sequence_number;
/* opal_atomic_wmb ();*/
if(0 < my_exchange_node->n_extra_sources) {
if (EXCHANGE_NODE == my_exchange_node->node_type) {
volatile int64_t *partner_sn;
int cnt=0;
/* I will participate in the exchange - wait for signal from extra
** process */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=(volatile mca_bcol_basesmuma_ctl_struct_t *)ctl_structs[extra_rank];
/*partner_ctl=ctl_structs[extra_rank];*/
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( !found )
{
if( *partner_sn >= sequence_number ) {
found=true;
}
cnt++;
if( cnt == 1000 ) {
opal_progress();
cnt=0;
}
}
} else {
/* Nothing to do, already registered that I am here */
}
}
for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) {
volatile int64_t *partner_sn;
volatile int *partner_flag;
int cnt=0;
/* rank of exchange partner */
pair_rank = my_rank ^ ( 1 SHIFT_UP exchange );
partner_ctl=ctl_structs[pair_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
partner_flag=(volatile int *)&(partner_ctl->flag);
/* signal that I am at iteration exchange of the algorithm */
flag_to_set=flag_offset+exchange;
my_ctl->flag = flag_to_set;
/* check to see if the partner has arrived */
/* spin n iterations until partner registers */
found=false;
while( !found )
{
if( (*partner_sn > sequence_number) ||
( *partner_sn == sequence_number &&
*partner_flag >= flag_to_set ) ) {
found=true;
} else {
cnt++;
if( cnt == 1000 ) {
opal_progress();
cnt=0;
}
}
}
}
if(0 < my_exchange_node->n_extra_sources) {
if ( EXTRA_NODE == my_exchange_node->node_type ) {
int cnt=0;
/* I will not participate in the exchange -
* wait for signal from extra partner */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=ctl_structs[extra_rank];
flag_to_set=flag_offset+my_exchange_node->log_2;
/* spin n iterations until partner registers */
found=false;
while( !found )
{
if (IS_PEER_READY(partner_ctl, flag_to_set, sequence_number)){
found=true;
} else {
cnt++;
if( cnt == 1000 ) {
opal_progress();
cnt=0;
}
}
}
} else {
/* signal the extra rank that I am done with the recursive
* doubling phase.
*/
flag_to_set=flag_offset+my_exchange_node->log_2;
my_ctl->flag = flag_to_set;
}
}
/* if I am the last instance of a basesmuma function in this collective,
* release the resources */
if (IS_LAST_BCOL_FUNC(c_input_args)){
idx=bcol_basesmuma_free_buff(
&(bcol_module->colls_no_user_data),
sequence_number);
} else {
/* increment flag value - so next sm collective in the hierarchy
* will not collide with the current one, as they share the
* control structure */
my_ctl->starting_flag_value+=(my_exchange_node->log_2+1);
}
/* return */
return ret;
}
#endif
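/*
 * Illustrative sketch (assumption, not part of the removed file): the pairing
 * rule the recursive-doubling barrier above relies on. For a power-of-two
 * group, rank r exchanges with r ^ (1 << exchange) at step "exchange", so
 * after log2(n) steps every rank has transitively heard from every other rank.
 */
#include <stdio.h>

int main(void)
{
    const int n = 8;                       /* power-of-two group size */
    for (int rank = 0; rank < n; rank++) {
        printf("rank %d exchanges with:", rank);
        for (int exchange = 0; (1 << exchange) < n; exchange++) {
            printf(" %d", rank ^ (1 << exchange));
        }
        printf("\n");
    }
    return 0;
}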

View file

@ -1,462 +0,0 @@
/*
* Copyright (c) 2009-2012 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
/* we need make cleanup with all these includes START */
#include <unistd.h>
#include <sys/types.h>
#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_basesmuma.h"
#include "opal/sys/atomic.h"
#include "ompi/patterns/net/netpatterns.h"
#include "ompi/mca/bcol/base/base.h"
/*
* Initialize nonblocking barrier. This is code specific for handling
* the recycling of data, and uses only a single set of control buffers.
* It also assumes that for a given process, only a single outstanding
* barrier operation will occur for a given control structure,
* with the sequence number being used for potential overlap in time
* between successive barrier calls on different processes.
*/
int bcol_basesmuma_rd_nb_barrier_init_admin(
sm_nbbar_desc_t *sm_desc)
{
/* local variables */
int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
int pair_rank;
mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
netpatterns_pair_exchange_node_t *my_exchange_node;
int extra_rank, my_rank;
mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
int64_t bank_genaration;
bool found;
int pool_index=sm_desc->pool_index;
mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;
/* get the pointer to the segment of control structures */
idx=sm_desc->coll_buff->number_of_buffs+pool_index;
leading_dim=sm_desc->coll_buff->size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,idx,0);
ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **)
sm_desc->coll_buff->ctl_buffs+idx;
bank_genaration= sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;
my_exchange_node=&(bcol_module->recursive_doubling_tree);
my_rank=bcol_module->super.sbgp_partner_module->my_index;
my_ctl=ctl_structs[my_rank];
/* debug print */
/*
{
int ii;
for(ii = 0; ii < 6; ii++) {
fprintf(stderr,"UUU ctl_struct[%d] := %p\n",ii,
bcol_module->colls_no_user_data.ctl_buffs[ii]);
fflush(stderr);
}
}
*/
/* end debug */
/* signal that I have arrived */
my_ctl->flag = -1;
opal_atomic_wmb ();
/* don't need to set this flag anymore */
my_ctl->sequence_number = bank_genaration;
if(0 < my_exchange_node->n_extra_sources) {
if (EXCHANGE_NODE == my_exchange_node->node_type) {
volatile int64_t *partner_sn;
/* I will participate in the exchange - wait for signal from extra
** process */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=ctl_structs[extra_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
if( *partner_sn >= bank_genaration ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_PRE_PHASE;
return OMPI_SUCCESS;
}
} else {
/* Nothing to do, already registered that I am here */
}
}
for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) {
volatile int64_t *partner_sn;
volatile int *partner_flag;
/* rank of exchange partner */
pair_rank = my_rank ^ ( 1 SHIFT_UP exchange );
partner_ctl=ctl_structs[pair_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
partner_flag=(volatile int *)&(partner_ctl->flag);
/* signal that I am at iteration exchange of the algorithm */
my_ctl->flag = exchange;
/* check to see if the partner has arrived */
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
if( (*partner_sn > bank_genaration) ||
( *partner_sn == bank_genaration &&
*partner_flag >= exchange ) ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
sm_desc->recursive_dbl_iteration=exchange;
return OMPI_SUCCESS;
}
}
if(0 < my_exchange_node->n_extra_sources) {
if ( EXTRA_NODE == my_exchange_node->node_type ) {
volatile int64_t *partner_sn;
volatile int *partner_flag;
/* I will not participate in the exchange -
* wait for signal from extra partner */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=ctl_structs[extra_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
partner_flag=(volatile int *)&(partner_ctl->flag);
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
if( (*partner_sn > bank_genaration) ||
( (*partner_sn == bank_genaration) &&
(*partner_flag == (my_exchange_node->log_2)) ) ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_POST_PHASE;
return OMPI_SUCCESS;
}
} else {
/* signal the extra rank that I am done with the recursive
* doubling phase.
*/
my_ctl->flag = my_exchange_node->n_exchanges;
}
}
/* set the barrier as complete */
sm_desc->collective_phase=NB_BARRIER_DONE;
/* return */
return ret;
}
/* admin nonblocking barrier - progress function */
int bcol_basesmuma_rd_nb_barrier_progress_admin(
sm_nbbar_desc_t *sm_desc)
{
/* local variables */
int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
int pair_rank, start_index, restart_phase;
mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
netpatterns_pair_exchange_node_t *my_exchange_node;
int extra_rank, my_rank;
mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
int64_t bank_genaration;
int pool_index=sm_desc->pool_index;
bool found;
mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;
/* get the pointer to the segment of control structures */
idx = sm_desc->coll_buff->number_of_buffs+pool_index;
leading_dim = sm_desc->coll_buff->size_of_group;
idx = SM_ARRAY_INDEX(leading_dim,idx,0);
ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
sm_desc->coll_buff->ctl_buffs+idx;
bank_genaration = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;
my_exchange_node=&(bcol_module->recursive_doubling_tree);
my_rank=bcol_module->super.sbgp_partner_module->my_index;
my_ctl=ctl_structs[my_rank];
/* check to make sure that this should be progressed */
if( ( sm_desc->collective_phase == NB_BARRIER_INACTIVE ) ||
( sm_desc->collective_phase == NB_BARRIER_DONE ) )
{
return OMPI_SUCCESS;
}
/* set the restart up - and jump to the correct place in the algorithm */
restart_phase=sm_desc->collective_phase;
if ( NB_PRE_PHASE == restart_phase ) {
start_index=0;
} else if ( NB_RECURSIVE_DOUBLING == restart_phase ) {
start_index=sm_desc->recursive_dbl_iteration;
goto Exchange_phase;
} else {
goto Post_phase;
}
if(0 < my_exchange_node->n_extra_sources) {
if (EXCHANGE_NODE == my_exchange_node->node_type) {
volatile int64_t *partner_sn;
/* I will participate in the exchange - wait for signal from extra
** process */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=ctl_structs[extra_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
/* spin n iterations until partner registers */
loop_cnt=0;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
found=false;
if( *partner_sn >= bank_genaration ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_PRE_PHASE;
return OMPI_SUCCESS;
}
} else {
/* Nothing to do, already registered that I am here */
}
}
Exchange_phase:
for(exchange = start_index;
exchange < my_exchange_node->n_exchanges; exchange++) {
volatile int64_t *partner_sn;
volatile int *partner_flag;
/* rank of exchange partner */
pair_rank = my_rank ^ ( 1 SHIFT_UP exchange );
partner_ctl=ctl_structs[pair_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
partner_flag=(volatile int *)&(partner_ctl->flag);
/* signal that I am at iteration exchange of the algorithm */
my_ctl->flag = exchange;
/* check to see if the partner has arrived */
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
if( (*partner_sn > bank_genaration) ||
( (*partner_sn == bank_genaration) &&
(*partner_flag >= exchange) ) ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
sm_desc->recursive_dbl_iteration=exchange;
return OMPI_SUCCESS;
}
}
Post_phase:
if(0 < my_exchange_node->n_extra_sources) {
if ( EXTRA_NODE == my_exchange_node->node_type ) {
volatile int64_t *partner_sn;
volatile int *partner_flag;
/* I will not participate in the exchange -
* wait for signal from extra partner */
extra_rank = my_exchange_node->rank_extra_source;
partner_ctl=ctl_structs[extra_rank];
partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
partner_flag=(volatile int *)&(partner_ctl->flag);
/* spin n iterations until partner registers */
loop_cnt=0;
found=false;
while( loop_cnt < bcol_module->super.n_poll_loops )
{
if( (*partner_sn > bank_genaration) ||
( *partner_sn == bank_genaration &&
*partner_flag == (my_exchange_node->log_2) ) ) {
found=true;
break;
}
loop_cnt++;
}
if( !found ) {
/* set restart parameters */
sm_desc->collective_phase=NB_POST_PHASE;
return OMPI_SUCCESS;
}
} else {
/* signal the extra rank that I am done with the recursive
* doubling phase.
*/
my_ctl->flag = my_exchange_node->n_exchanges;
}
}
/* set the barrier as complete */
sm_desc->collective_phase=NB_BARRIER_DONE;
/* return */
return ret;
}
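/*
 * Illustrative sketch (hypothetical names, not part of the removed file): the
 * restart discipline the two admin-barrier routines above follow. Each phase
 * polls a bounded number of times; if the peer has not arrived, the current
 * phase is saved in the descriptor and control returns to the caller, which
 * re-enters through the progress routine later instead of blocking.
 */
#include <stdbool.h>
#include <stdio.h>

enum nb_phase { NB_PRE, NB_EXCHANGE, NB_DONE };

struct nb_desc {
    enum nb_phase phase;
    int iteration;
};

static bool peer_arrived(int iteration)
{
    (void) iteration;
    return false;                  /* pretend the peer never shows up */
}

static int nb_step(struct nb_desc *desc, int n_poll_loops)
{
    for (int loop = 0; loop < n_poll_loops; loop++) {
        if (peer_arrived(desc->iteration)) {
            desc->phase = NB_DONE;
            return 0;
        }
    }
    desc->phase = NB_EXCHANGE;     /* remember where to resume */
    return 1;                      /* not done yet - caller retries later */
}

int main(void)
{
    struct nb_desc desc = { NB_PRE, 0 };
    printf("needs restart: %d, saved phase: %d\n",
           nb_step(&desc, 4), (int) desc.phase);
    return 0;
}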
static int bcol_basesmuma_memsync(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
int rc;
int memory_bank = input_args->root;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data);
sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
sm_desc->coll_buff = buff_block;
/*
printf("XXX SYNC call\n");
*/
rc = bcol_basesmuma_rd_nb_barrier_init_admin(
sm_desc);
if (OMPI_SUCCESS != rc) {
return rc;
}
if (NB_BARRIER_DONE != sm_desc->collective_phase) {
mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
opal_list_t *list=&(cs->nb_admin_barriers);
opal_list_item_t *append_item;
/* put this onto the progression list */
OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex));
append_item=(opal_list_item_t *)
&(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
opal_list_append(list,append_item);
OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex));
/* progress communications so that resources can be freed up */
return BCOL_FN_STARTED;
}
/* Done - bump the counter */
(buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++;
/*
printf("XXX SYNC call done \n");
*/
return BCOL_FN_COMPLETE;
}
static int bcol_basesmuma_memsync_progress(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
int memory_bank = input_args->root;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data);
sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
/* I do not have to do anything, since the
progress is done by the basesmuma progress engine */
if (NB_BARRIER_DONE != sm_desc->collective_phase) {
return BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_SYNC;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
bcol_basesmuma_memsync,
bcol_basesmuma_memsync_progress);
return OMPI_SUCCESS;
}

View file

@ -1,382 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/include/opal_stdint.h"
#include "bcol_basesmuma.h"
#include "bcol_basesmuma_reduce.h"
/**
* gvm - Shared memory reduce
*/
static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_REDUCE;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1048576;
comm_attribs.data_src = DATA_SRC_KNOWN;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000;
inv_attribs.datatype_bitmap = 0x11111111;
inv_attribs.op_types_bitmap = 0x11111111;
/* Set attributes for fanin fanout algorithm */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_reduce_intra_fanin,
bcol_basesmuma_reduce_intra_fanin_progress);
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL);
return OMPI_SUCCESS;
}
/*
* Small data fanin reduce
* ML buffers are used for both payload and control structures
* This functions works with hierarchical allreduce and
* progress engine
*/
static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node,
int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype,
volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) {
volatile mca_bcol_basesmuma_header_t * child_ctl_pointer;
int bcol_id = (int) bcol_module->super.bcol_id;
int64_t sequence_number = my_ctl_pointer->sequence_number;
int8_t ready_flag = my_ctl_pointer->ready_flag;
int group_size = bcol_module->colls_no_user_data.size_of_group;
if (LEAF_NODE != my_reduction_node->my_node_type) {
volatile char *child_data_pointer;
volatile void *child_rbuf;
/* for each child */
/* my_result_data = child_result_data (op) my_source_data */
for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) {
int child_rank = my_reduction_node->children_ranks[child] + process_shift;
if (group_size <= child_rank){
child_rank -= group_size;
}
child_ctl_pointer = data_buffs[child_rank].ctl_struct;
child_data_pointer = data_buffs[child_rank].payload;
if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) {
*iteration = child;
return BCOL_FN_STARTED;
}
child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id];
ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count, dtype);
} /* end child loop */
}
if (ROOT_NODE != my_reduction_node->my_node_type) {
opal_atomic_wmb ();
my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag;
}
return BCOL_FN_COMPLETE;
}
static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
netpatterns_tree_node_t *my_reduction_node;
int my_rank, my_node_index;
struct ompi_datatype_t *dtype = input_args->dtype;
int leading_dim, idx;
/* Buffer index */
int buff_idx = input_args->src_desc->buffer_index;
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
void *data_addr = (void *)input_args->src_desc->data_addr;
volatile void *rbuf;
/* get addressing information */
my_rank = bcol_module->super.sbgp_partner_module->my_index;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs + idx;
/* Get control structure and payload buffer */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_node_index = my_rank - input_args->root;
if (0 > my_node_index) {
int group_size = bcol_module->colls_no_user_data.size_of_group;
my_node_index += group_size;
}
my_reduction_node = bcol_module->reduction_tree + my_node_index;
rbuf = (volatile void *)((uintptr_t) data_addr + input_args->rbuf_offset);
return reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype,
data_buffs, input_args->count, input_args->op, input_args->root);
}
int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int rc=BCOL_FN_COMPLETE;
int my_rank,group_size,my_node_index;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
netpatterns_tree_node_t *my_reduction_node;
volatile int8_t ready_flag;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile void *sbuf,*rbuf;
int sbuf_offset,rbuf_offset;
int root,count;
int64_t sequence_number=input_args->sequence_num;
struct ompi_datatype_t *dtype;
int leading_dim,idx;
/* Buffer index */
int buff_idx = input_args->src_desc->buffer_index;
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char * my_data_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
void *data_addr = (void *)input_args->src_desc->data_addr;
#if 0
fprintf(stderr,"777 entering sm reduce \n");
#endif
/* get addressing information */
my_rank=bcol_module->super.sbgp_partner_module->my_index;
group_size=bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* fprintf(stderr,"AAA the devil!!\n"); */
/* Get control structure and payload buffer */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_data_pointer = (volatile char *)data_addr;
/* Align node index to around sbgp root */
root = input_args->root;
my_node_index = my_rank - root;
if (0 > my_node_index) {
my_node_index += group_size;
}
/* get arguments */
sbuf_offset = input_args->sbuf_offset;
rbuf_offset = input_args->rbuf_offset;
sbuf = (volatile void *)(my_data_pointer + sbuf_offset);
data_buffs[my_rank].payload = (void*)sbuf;
rbuf = (volatile void *)(my_data_pointer + rbuf_offset);
count = input_args->count;
dtype = input_args->dtype;
/* Cache my rbuf_offset */
my_ctl_pointer->roffsets[bcol_id] = rbuf_offset;
/* get my node for the reduction tree */
my_reduction_node=&(bcol_module->reduction_tree[my_node_index]);
/* init the header */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type);
/* set starting point for progress loop */
*iteration = 0;
my_ctl_pointer->ready_flag = ready_flag;
if (sbuf != rbuf) {
rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
(char *)sbuf);
if( 0 != rc ) {
return OMPI_ERROR;
}
}
rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype,
data_buffs, count, input_args->op, root);
/* Flag value if other bcols are called */
my_ctl_pointer->starting_flag_value[bcol_id]++;
/* Recycle payload buffers */
return rc;
}
/* Small data fanin reduce
* Uses SM buffer (backed by SM file) for both control structures and
* payload
*
* NTH: How does this differ from the new one? Can we replace this
* with a call to the new init and then a call to the new progress until
* complete?
*/
int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args)
{
/* local variables */
int rc=OMPI_SUCCESS;
int my_rank,group_size,process_shift,my_node_index;
int n_children,child;
mca_bcol_basesmuma_module_t* bcol_module =
(mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
netpatterns_tree_node_t *my_reduction_node;
volatile int8_t ready_flag;
volatile void *sbuf,*rbuf;
int sbuf_offset,rbuf_offset;
int root,count;
struct ompi_op_t *op;
int64_t sequence_number=input_args->sequence_num;
struct ompi_datatype_t *dtype;
int leading_dim,idx;
int buff_idx;
int child_rank;
int bcol_id = (int) bcol_module->super.bcol_id;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
volatile char * my_data_pointer;
volatile char * child_data_pointer;
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t * child_ctl_pointer;
#if 0
fprintf(stderr,"Entering fanin reduce \n");
#endif
/* Buffer index */
buff_idx = input_args->src_desc->buffer_index;
/* get addressing information */
my_rank=bcol_module->super.sbgp_partner_module->my_index;
group_size=bcol_module->colls_no_user_data.size_of_group;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
/*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **)
bcol_module->colls_with_user_data.ctl_buffs+idx;*/
data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Get control structure and payload buffer */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
my_data_pointer = (volatile char *) data_buffs[my_rank].payload;
/* Align node index to around sbgp root */
root = input_args->root;
process_shift = root;
my_node_index = my_rank - root;
if (0 > my_node_index ) {
my_node_index += group_size;
}
/* get arguments */
sbuf_offset = input_args->sbuf_offset;
rbuf_offset = input_args->rbuf_offset;
sbuf = (volatile void *)(my_data_pointer + sbuf_offset);
rbuf = (volatile void *)(my_data_pointer + rbuf_offset);
op = input_args->op;
count = input_args->count;
dtype = input_args->dtype;
/* get my node for the reduction tree */
my_reduction_node=&(bcol_module->reduction_tree[my_node_index]);
n_children=my_reduction_node->n_children;
/* init the header */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type);
rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
(char *)sbuf);
if (0 != rc) {
return OMPI_ERROR;
}
if (LEAF_NODE != my_reduction_node->my_node_type) {
volatile void *child_rbuf;
/* for each child */
/* my_result_data = child_result_data (op) my_source_data */
for (child = 0 ; child < n_children ; ++child) {
child_rank = my_reduction_node->children_ranks[child];
child_rank += process_shift;
/* wrap around */
if( group_size <= child_rank ){
child_rank-=group_size;
}
/*child_ctl_pointer = ctl_structs[child_rank];*/
child_ctl_pointer = data_buffs[child_rank].ctl_struct;
child_data_pointer = data_buffs[child_rank].payload;
child_rbuf = child_data_pointer + rbuf_offset;
/* wait until the child's data is ready for use */
while (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) {
opal_progress();
}
/* apply collective operation */
ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count,dtype);
} /* end child loop */
}
if (ROOT_NODE != my_reduction_node->my_node_type) {
opal_atomic_wmb ();
my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag;
}
my_ctl_pointer->starting_flag_value[bcol_id]++;
return rc;
}
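/*
 * Illustrative sketch (assumption, not part of the removed file): a sequential
 * model of the fan-in reduction implemented above. Children are combined into
 * their parent's buffer; after the sweep the root (rank 0 here) holds the
 * reduction over the whole group. A heap-style parent formula stands in for
 * the netpatterns reduction tree.
 */
#include <stdio.h>

#define RADIX 2
#define GROUP 7

int main(void)
{
    int result[GROUP];
    for (int r = 0; r < GROUP; r++) {
        result[r] = r + 1;                 /* each rank's own contribution */
    }
    /* walk from the bottom of the tree up so every child is final before
     * its parent reads it */
    for (int r = GROUP - 1; r > 0; r--) {
        int parent = (r - 1) / RADIX;
        result[parent] += result[r];
    }
    printf("root holds %d (expected %d)\n", result[0], GROUP * (GROUP + 1) / 2);
    return 0;
}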

View file

@ -1,92 +0,0 @@
#ifndef __BASESMUMA_REDUCE_H_
#define __BASESMUMA_REDUCE_H_
#include "ompi_config.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"
#include <unistd.h>
enum {
BLOCK_OFFSET = 0,
LOCAL_REDUCE_SEG_OFFSET,
BLOCK_COUNT,
SEG_SIZE,
NOFFSETS
};
int compute_knomial_reduce_offsets(int group_index, int count, struct
ompi_datatype_t *dtype,int k_radix,int n_exchanges,
int **offsets);
int compute_knomial_reduce_offsets_reverse(int group_index, int count, struct
ompi_datatype_t *dtype,int k_radix,int n_exchanges,
int **offsets);
int bcol_basesmuma_lmsg_reduce_recursivek_scatter_reduce(mca_bcol_basesmuma_module_t *sm_module,
const int buffer_index, void *sbuf,
void *rbuf,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype,
const int relative_group_index,
const int padded_start_byte,
volatile int8_t ready_flag,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
int bcol_basesmuma_lmsg_reduce_knomial_gather(mca_bcol_basesmuma_module_t *basesmuma_module,
const int buffer_index,
void *sbuf,void *rbuf, int count, struct
ompi_datatype_t *dtype,
const int my_group_index,
const int padded_start_byte,
volatile int8_t rflag,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
int bcol_basesmuma_lmsg_reduce_extra_root(mca_bcol_basesmuma_module_t *sm_module,
const int buffer_index, void *sbuf,
void *rbuf,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype,
const int relative_group_index,
const int padded_start_byte,
volatile int8_t rflag,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
int bcol_basesmuma_lmsg_reduce_extra_non_root(mca_bcol_basesmuma_module_t *sm_module,
const int buffer_index, void *sbuf,
void *rbuf,
int root,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype,
const int relative_group_index,
const int group_size,
const int padded_start_byte,
volatile int8_t rflag,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
int bcol_basesmuma_lmsg_reduce(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
int bcol_basesmuma_lmsg_reduce_extra(bcol_function_args_t *input_args,
mca_bcol_base_function_t *c_input_args);
void basesmuma_reduce_recv(int my_group_index, int peer,
void *recv_buffer,
int recv_size,
volatile int8_t ready_flag_val,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
void basesmuma_reduce_send(int my_group_index,
int peer,
void *send_buffer,
int snd_size,
int send_offset,
volatile int8_t ready_flag_val,
volatile mca_bcol_basesmuma_payload_t *data_buffs);
#endif

View file

@ -1,442 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
/*
#define IS_BARRIER_READY(peer, my_flag, my_sequence_number)\
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[BARRIER_RKING_FLAG][bcol_id] >= (my_flag) \
)? true : false )
*/
#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \
do{ \
for( j = 0; j < (tree_order - 1); j++){ \
if( 0 > peers[j] ) { \
/* set the bit */ \
*active_requests ^= (1<<j); \
} \
} \
}while(0)
/*
* Recursive K-ing barrier
*/
/*
*
* Recursive k-ing algorithm
* Example: k=3, n=9
*
*
* Number of exchange steps = log_k(n)
* Number of steps in exchange step = k (radix)
*
*/
int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
int flag_offset = 0;
volatile int8_t ready_flag;
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
uint32_t buffer_index = input_args->buffer_index;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
int leading_dim, buff_idx, idx;
int bcol_id = (int) bcol_module->super.bcol_id;
int i, j, probe;
int src;
int pow_k, tree_order;
int max_requests = 0; /* important to initialize this */
bool matched;
int64_t sequence_number=input_args->sequence_num;
int my_rank = bcol_module->super.sbgp_partner_module->my_index;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0
fprintf(stderr,"entering sm barrier sn = %d buff index = %d\n",sequence_number,input_args->buffer_index);
#endif
/* initialize the iteration counter */
buff_idx = input_args->buffer_index;
leading_dim = bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
/* Set pointer to current proc ctrl region */
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* init the header */
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/* initialize these */
*iteration = 0;
*active_requests = 0;
*status = 0;
/* k-nomial parameters */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/* calculate the maximum number of requests
* at each level each rank communicates with
* at most (k - 1) peers
* so if we set k - 1 bit fields in "max_requests", then
* we have max_request == 2^(k - 1) -1
*/
for(i = 0; i < (tree_order - 1); i++){
max_requests ^= (1<<i);
}
/* let's begin the collective, starting with extra ranks and their
* respective proxies
*/
if( EXTRA_NODE == exchange_node->node_type ) {
/* then I will signal to my proxy rank*/
my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
ready_flag = flag_offset + 1 + pow_k + 2;
/* now, poll for completion */
src = exchange_node->rank_extra_sources_array[0];
peer_ctl_pointer = data_buffs[src].ctl_struct;
for( i = 0; i < cm->num_to_probe ; i++ ) {
if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
goto FINISHED;
}
}
/* cache state and bail */
*iteration = -1;
return BCOL_FN_STARTED;
}else if ( 0 < exchange_node->n_extra_sources ) {
/* I am a proxy for someone */
src = exchange_node->rank_extra_sources_array[0];
peer_ctl_pointer = data_buffs[src].ctl_struct;
/* probe for extra rank's arrival */
for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) {
if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
/* copy it in */
matched = true;
break;
}
}
if (!matched) {
*status = ready_flag;
*iteration = -1;
return BCOL_FN_STARTED;
}
}
/* bump the ready flag */
ready_flag++;
/* we start the recursive k - ing phase */
for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
/* announce my arrival */
my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
/* calculate the number of active requests */
CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);
/* Now post the recv's */
for( j = 0; j < (tree_order - 1); j++ ) {
/* recv phase */
src = exchange_node->rank_exchanges[*iteration][j];
if( src < 0 ) {
/* then not a valid rank, continue */
continue;
}
peer_ctl_pointer = data_buffs[src].ctl_struct;
if( !(*active_requests&(1<<j))) {
/* then the bit hasn't been set, thus this peer
* hasn't been processed at this level
* I am putting the probe loop as the innermost loop to achieve
* better temporal locality; this comes at a cost to asynchronicity
* but should get better cache performance
*/
for( probe = 0; probe < cm->num_to_probe ; probe++){
if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
/* set this request's bit */
*active_requests ^= (1<<j);
break;
}
}
}
}
if( max_requests == *active_requests ){
/* bump the ready flag */
ready_flag++;
/*reset the active requests */
*active_requests = 0;
} else {
/* cache the state and hop out
* only the iteration needs to be tracked
*/
*status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
return BCOL_FN_STARTED;
}
}
/* bump the flag one more time for the extra rank */
ready_flag = flag_offset + 1 + pow_k + 2;
/* finish off the last piece, send the data back to the extra */
if( 0 < exchange_node->n_extra_sources ) {
/* simply announce my arrival */
my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
}
FINISHED:
my_ctl_pointer->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
/* allgather progress function */
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
int flag_offset;
volatile int8_t ready_flag;
mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
uint32_t buffer_index = input_args->buffer_index;
int *active_requests =
&(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
int *iter = iteration; /* double alias */
int leading_dim, idx, buff_idx;
int i, j, probe;
int src;
int max_requests = 0; /* critical to set this */
int pow_k, tree_order;
int bcol_id = (int) bcol_module->super.bcol_id;
bool matched;
int64_t sequence_number=input_args->sequence_num;
int my_rank = bcol_module->super.sbgp_partner_module->my_index;
volatile mca_bcol_basesmuma_payload_t *data_buffs;
/* control structures */
volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0
fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank,
*active_requests,*iter,*status);
#endif
buff_idx = buffer_index;
leading_dim=bcol_module->colls_no_user_data.size_of_group;
idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
bcol_module->colls_with_user_data.data_buffs+idx;
my_ctl_pointer = data_buffs[my_rank].ctl_struct;
/* increment the starting flag by one and return */
flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
ready_flag = *status;
/* k-nomial parameters */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/* calculate the maximum number of requests
* at each level each rank communicates with
* at most (k - 1) peers
* so if we set k - 1 bit fields in "max_requests", then
* we have max_request == 2^(k - 1) -1
*/
for(i = 0; i < (tree_order - 1); i++){
max_requests ^= (1<<i);
}
/* let's begin the collective, starting with extra ranks and their
* respective proxies
*/
if( EXTRA_NODE == exchange_node->node_type ) {
/* If I'm in here, then I must be looking for data */
ready_flag = flag_offset + 1 + pow_k + 2;
src = exchange_node->rank_extra_sources_array[0];
peer_ctl_pointer = data_buffs[src].ctl_struct;
for( i = 0; i < cm->num_to_probe ; i++ ) {
if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
goto FINISHED;
}
}
/* haven't found it, state is cached, bail out */
return BCOL_FN_STARTED;
}else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) {
/* I am a proxy for someone */
src = exchange_node->rank_extra_sources_array[0];
peer_ctl_pointer = data_buffs[src].ctl_struct;
/* probe for extra rank's arrival */
for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) {
if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
matched = true;
/* bump the flag */
ready_flag++;
*iteration = 0;
break;
}
}
if (!matched) {
return BCOL_FN_STARTED;
}
}
/* start the recursive k - ing phase */
for( *iter=*iteration; *iter < pow_k; (*iter)++) {
/* I am ready at this level */
my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
if( 0 == *active_requests ) {
/* flip some bits, if we don't have active requests from a previous visit */
CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order);
}
for( j = 0; j < (tree_order - 1); j++ ) {
/* recv phase */
src = exchange_node->rank_exchanges[*iter][j];
if( src < 0 ) {
/* then not a valid rank, continue
*/
continue;
}
peer_ctl_pointer = data_buffs[src].ctl_struct;
if( !(*active_requests&(1<<j))){
/* I am putting the probe loop as the innermost loop to achieve
* better temporal locality
*/
for( probe = 0; probe < cm->num_to_probe ; probe++){
if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
/* flip the request's bit */
*active_requests ^= (1<<j);
break;
}
}
}
}
if( max_requests == *active_requests ){
/* bump the ready flag */
ready_flag++;
/* reset the active requests for the next level */
*active_requests = 0;
/* calculate the number of active requests
* logically makes sense to do it here. We don't
* want to inadvertently flip a bit to zero that we
* set previously
*/
} else {
/* state is saved, hop out
*/
*status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
return BCOL_FN_STARTED;
}
}
/* bump the flag one more time for the extra rank */
ready_flag = flag_offset + 1 + pow_k + 2;
/* finish off the last piece, send the data back to the extra */
if( 0 < exchange_node->n_extra_sources ) {
/* simply announce my arrival */
my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
}
FINISHED:
my_ctl_pointer->starting_flag_value[bcol_id]++;
return BCOL_FN_COMPLETE;
}
/* Register k-nomial barrier functions to the BCOL function table,
* so they can be selected
*/
int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_BARRIER;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_basesmuma_k_nomial_barrier_init,
bcol_basesmuma_k_nomial_barrier_progress);
return OMPI_SUCCESS;
}
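/*
 * Illustrative sketch (assumption, not part of the removed file): the
 * bookkeeping the recursive k-ing barrier above depends on. With radix k and a
 * power-of-k group of size n there are log_k(n) levels, each rank waits on
 * k-1 peers per level, and arrival is tracked in a (k-1)-bit mask whose
 * "all set" value is 2^(k-1) - 1.
 */
#include <stdio.h>

int main(void)
{
    const int k = 3, n = 9;
    int levels = 0;
    for (int span = 1; span < n; span *= k) {
        levels++;
    }
    int max_requests = 0;
    for (int i = 0; i < k - 1; i++) {
        max_requests |= (1 << i);          /* set k-1 bits */
    }
    printf("k=%d n=%d -> %d levels, %d peers/level, full mask 0x%x\n",
           k, n, levels, k - 1, max_requests);
    return 0;
}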

View file

@ -1,588 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/comm/coll_ops.h"
#include "opal/class/opal_object.h"
#include "opal/dss/dss.h"
#include "bcol_basesmuma.h"
int base_bcol_basesmuma_setup_ctl_struct(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs,
sm_buffer_mgmt *ctl_mgmt);
/* this is the new one, uses the pml allgather */
int base_bcol_basesmuma_exchange_offsets(
mca_bcol_basesmuma_module_t *sm_bcol_module,
void **result_array, uint64_t mem_offset, int loop_limit,
int leading_dim)
{
int ret=OMPI_SUCCESS,i;
int count;
int index_in_group;
char *send_buff;
char *recv_buff;
uint64_t rem_mem_offset;
/* malloc some memory */
count = sizeof(uint64_t) + sizeof(int);
send_buff = (char *) malloc(count);
recv_buff = (char *) malloc(count *
sm_bcol_module->super.sbgp_partner_module->group_size);
/* exchange the base pointer for the control structures - gather
* everyone else's information.
*/
/* pack the offset of the allocated region */
memcpy((void *) send_buff, (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index), sizeof(int));
memcpy((void *) (send_buff+ sizeof(int)), (void *) &(mem_offset), sizeof(uint64_t));
/* get the offsets from all procs, so can setup the control data
* structures.
*/
ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count,
MPI_BYTE,
sm_bcol_module->super.sbgp_partner_module->my_index,
sm_bcol_module->super.sbgp_partner_module->group_size,
sm_bcol_module->super.sbgp_partner_module->group_list,
sm_bcol_module->super.sbgp_partner_module->group_comm);
if( OMPI_SUCCESS != ret ) {
goto exit_ERROR;
}
/* get the control structure offsets within the shared memory
* region and populate the control structures - we do not assume
* any symmetry in memory layout of each process
*/
/* loop over the procs in the group */
for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
int array_id;
/* get this peer's index in the group */
memcpy((void *) &index_in_group, (void *) (recv_buff + i*count) , sizeof(int));
/* get the offset */
memcpy((void *) &rem_mem_offset, (void *) (recv_buff + i*count + sizeof(int)), sizeof(uint64_t));
array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;
}
exit_ERROR:
/* clean up */
if( NULL != send_buff ) {
free(send_buff);
send_buff = NULL;
}
if( NULL != recv_buff ) {
free(recv_buff);
recv_buff = NULL;
}
return ret;
}
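/*
 * Illustrative sketch (not part of the removed file): the fixed buffer layout
 * the offset exchange above relies on - an int group index followed by a
 * 64-bit offset, copied byte-wise with memcpy so no struct padding leaks into
 * the buffer that comm_allgather_pml() moves around.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[sizeof(int) + sizeof(uint64_t)];
    int my_index = 3;
    uint64_t mem_offset = 0x1000;

    /* pack */
    memcpy(buf, &my_index, sizeof(int));
    memcpy(buf + sizeof(int), &mem_offset, sizeof(uint64_t));

    /* unpack (what each rank does for every peer's entry) */
    int index_in_group;
    uint64_t rem_mem_offset;
    memcpy(&index_in_group, buf, sizeof(int));
    memcpy(&rem_mem_offset, buf + sizeof(int), sizeof(uint64_t));

    printf("peer %d placed its block at offset 0x%" PRIx64 "\n",
           index_in_group, rem_mem_offset);
    return 0;
}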
#if 0
int base_bcol_basesmuma_exchange_offsets(
mca_bcol_basesmuma_module_t *sm_bcol_module,
void **result_array, uint64_t mem_offset, int loop_limit,
int leading_dim)
{
int ret=OMPI_SUCCESS,i,dummy;
int index_in_group, pcnt;
opal_list_t peers;
ompi_namelist_t *peer;
ompi_proc_t *proc_temp, *my_id;
opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
uint64_t rem_mem_offset;
/* exchange the base pointer for the control structures - gather
* everyone else's information.
*/
/* get list of procs that will participate in the communication */
OBJ_CONSTRUCT(&peers, opal_list_t);
for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
/* get the proc info */
proc_temp = ompi_comm_peer_lookup(
sm_bcol_module->super.sbgp_partner_module->group_comm,
sm_bcol_module->super.sbgp_partner_module->group_list[i]);
peer = OBJ_NEW(ompi_namelist_t);
peer->name.jobid = proc_temp->proc_name.jobid;
peer->name.vpid = proc_temp->proc_name.vpid;
opal_list_append(&peers,&peer->super); /* this is with the new field called "super" in ompi_namelist_t struct */
}
/* pack up the data into the allgather send buffer */
if (NULL == send_buffer || NULL == recv_buffer) {
opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n");
ret = OMPI_ERROR;
goto exit_ERROR;
}
/* get my proc information */
my_id = ompi_proc_local();
/* pack my information */
ret = opal_dss.pack(send_buffer,
&(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32);
if (OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n");
goto exit_ERROR;
}
/* pack the offset of the allocated region */
ret = opal_dss.pack(send_buffer,&(mem_offset),1,OPAL_UINT64);
if (OMPI_SUCCESS != ret) {
goto exit_ERROR;
}
/* get the offsets from all procs, so can setup the control data
* structures.
*/
if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret);
goto exit_ERROR;
}
/* unpack the dummy */
pcnt=1;
ret = opal_dss.unpack(recv_buffer,&dummy, &pcnt, OPAL_INT32);
if (OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret);
goto exit_ERROR;
}
/* get the control structure offsets within the shared memory
* region and populate the control structures - we do not assume
* any symmetry in memory layout of each process
*/
/* loop over the procs in the group */
for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
int array_id;
pcnt=1;
ret = opal_dss.unpack(recv_buffer,&index_in_group, &pcnt, OPAL_UINT32);
if (OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret);
goto exit_ERROR;
}
/* get the offset */
pcnt=1;
ret = opal_dss.unpack(recv_buffer,&rem_mem_offset, &pcnt, OPAL_UINT64);
if (OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret);
goto exit_ERROR;
}
array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
result_array[array_id]=(void *)rem_mem_offset;
}
/* clean up */
peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
while( NULL !=peer) {
OBJ_RELEASE(peer);
peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
}
OBJ_DESTRUCT(&peers);
if( send_buffer ) {
OBJ_RELEASE(send_buffer);
}
if( recv_buffer ) {
OBJ_RELEASE(recv_buffer);
}
return ret;
exit_ERROR:
/* free peer list */
peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
while( NULL !=peer) {
OBJ_RELEASE(peer);
peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
}
OBJ_DESTRUCT(&peers);
if( send_buffer ) {
OBJ_RELEASE(send_buffer);
}
if( recv_buffer ) {
OBJ_RELEASE(recv_buffer);
}
return ret;
}
#endif
static int base_bcol_basesmuma_exchange_ctl_params(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs,
sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
{
int ret=OMPI_SUCCESS,i,loop_limit;
int leading_dim, buf_id;
void *mem_offset;
unsigned char *base_ptr;
mca_bcol_basesmuma_ctl_struct_t *ctl_ptr;
/* data block base offset in the mapped file */
mem_offset = (void *)((uintptr_t)data_blk->data -
(uintptr_t)cs->sm_ctl_structs->data_addr);
/* number of buffers in data block */
loop_limit=cs->basesmuma_num_mem_banks+ctl_mgmt->number_of_buffs;
leading_dim=ctl_mgmt->size_of_group;
ret=comm_allgather_pml(&mem_offset, ctl_mgmt->ctl_buffs, sizeof(void *),
MPI_BYTE, sm_bcol_module->super.sbgp_partner_module->my_index,
sm_bcol_module->super.sbgp_partner_module->group_size,
sm_bcol_module->super.sbgp_partner_module->group_list,
sm_bcol_module->super.sbgp_partner_module->group_comm);
if( OMPI_SUCCESS != ret ) {
goto exit_ERROR;
}
#if 0
ret=base_bcol_basesmuma_exchange_offsets( sm_bcol_module,
(void **)ctl_mgmt->ctl_buffs, mem_offset, loop_limit, leading_dim);
if( OMPI_SUCCESS != ret ) {
goto exit_ERROR;
}
#endif
/* convert memory offset to virtual address in current rank */
for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {
/* get the base pointer */
int array_id=SM_ARRAY_INDEX(leading_dim,0,i);
if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
/* me */
base_ptr=cs->sm_ctl_structs->map_addr;
} else {
base_ptr=sm_bcol_module->ctl_backing_files_info[i]->sm_mmap->map_addr;
}
ctl_mgmt->ctl_buffs[array_id]=(void *)
(uintptr_t)(((uint64_t)(uintptr_t)ctl_mgmt->ctl_buffs[array_id])+(uint64_t)(uintptr_t)base_ptr);
for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
ctl_mgmt->ctl_buffs[array_id]=(void *) (uintptr_t)((uint64_t)(uintptr_t)(ctl_mgmt->ctl_buffs[array_id_m1])+
(uint64_t)(uintptr_t)sizeof(mca_bcol_basesmuma_ctl_struct_t));
}
}
    /* initialize my control structures */
for( buf_id = 0 ; buf_id < loop_limit ; buf_id++ ) {
int my_idx=sm_bcol_module->super.sbgp_partner_module->my_index;
int array_id=SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
ctl_ptr = (mca_bcol_basesmuma_ctl_struct_t *)
ctl_mgmt->ctl_buffs[array_id];
        /* initialize the data structures - RLG: this is only one of the data
         * structures that need to be initialized; more are missing */
ctl_ptr->sequence_number=-1;
ctl_ptr->flag=-1;
ctl_ptr->index=0;
ctl_ptr->src_ptr = NULL;
}
return ret;
exit_ERROR:
return ret;
}
static int base_bcol_basesmuma_setup_ctl (mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs)
{
    const int my_index = sm_bcol_module->super.sbgp_partner_module->my_index;
bcol_basesmuma_smcm_file_t input_file;
int ret;
/* exchange remote addressing information if it has not already been done */
if (NULL == sm_bcol_module->ctl_backing_files_info) {
input_file.file_name=cs->sm_ctl_structs->map_path;
input_file.size=cs->sm_ctl_structs->map_size;
input_file.size_ctl_structure=0;
input_file.data_seg_alignment=BASESMUMA_CACHE_LINE_SIZE;
input_file.mpool_size=cs->sm_ctl_structs->map_size;
ret = bcol_basesmuma_smcm_allgather_connection(sm_bcol_module,
sm_bcol_module->super.sbgp_partner_module,
&(cs->sm_connections_list),
&(sm_bcol_module->ctl_backing_files_info),
sm_bcol_module->super.sbgp_partner_module->group_comm,
input_file, cs->clt_base_fname,
false);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
    /* fill in the pointers to the other ranks' scratch shared memory */
if (NULL == sm_bcol_module->shared_memory_scratch_space) {
sm_bcol_module->shared_memory_scratch_space =
calloc (sm_bcol_module->super.sbgp_partner_module->group_size, sizeof (void *));
if (!sm_bcol_module->shared_memory_scratch_space) {
opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for shared_memory_scratch_space.");
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (int i = 0 ; i < sm_bcol_module->super.sbgp_partner_module->group_size ; ++i) {
if (i == my_index) {
/* local file data is not cached in this list */
continue;
}
sm_bcol_module->shared_memory_scratch_space[i] =
(void *)((intptr_t) sm_bcol_module->ctl_backing_files_info[i]->sm_mmap +
cs->scratch_offset_from_base_ctl_file);
}
sm_bcol_module->shared_memory_scratch_space[my_index] =
(void *)((intptr_t) cs->sm_ctl_structs->map_addr + cs->scratch_offset_from_base_ctl_file);
}
return OMPI_SUCCESS;
}
int base_bcol_basesmuma_setup_ctl_struct(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs,
sm_buffer_mgmt *ctl_mgmt)
{
int n_ctl, n_levels;
int n_ctl_structs;
size_t malloc_size;
/*
     * set up my no-user-data control structures
*/
/* number of banks and regions per bank are already a power of 2 */
n_ctl_structs=cs->basesmuma_num_mem_banks*
cs->basesmuma_num_regions_per_bank;
/* initialize the control structure management struct -
* for collectives without user data
*---------------------------------------------------------------
*/
ctl_mgmt->number_of_buffs=n_ctl_structs;
ctl_mgmt->num_mem_banks=
cs->basesmuma_num_mem_banks;
ctl_mgmt->num_buffs_per_mem_bank=
cs->basesmuma_num_regions_per_bank;
ctl_mgmt->size_of_group=
sm_bcol_module->super.sbgp_partner_module->group_size;
ompi_roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank,&n_levels);
ctl_mgmt->log2_num_buffs_per_mem_bank=n_levels;
ompi_roundup_to_power_radix(2,n_ctl_structs,&n_levels);
ctl_mgmt->log2_number_of_buffs=n_levels;
ctl_mgmt->mask=n_ctl_structs-1;
sm_bcol_module->super.n_poll_loops=cs->n_poll_loops;
malloc_size=
(ctl_mgmt->number_of_buffs +
ctl_mgmt->num_mem_banks ) *
ctl_mgmt->size_of_group *
sizeof(void *);
ctl_mgmt->ctl_buffs = malloc(malloc_size);
if (!ctl_mgmt->ctl_buffs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
     * set up the no-data buffer management data
*/
n_ctl = ctl_mgmt->num_mem_banks;
ctl_mgmt->ctl_buffs_mgmt = (mem_bank_management_t *) calloc (n_ctl, sizeof (mem_bank_management_t));
if (!ctl_mgmt->ctl_buffs_mgmt) {
opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for ctl_buffs_mgmt");
free (ctl_mgmt->ctl_buffs);
ctl_mgmt->ctl_buffs = NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* initialize each individual element */
for (int i = 0 ; i < n_ctl ; ++i) {
opal_list_item_t *item;
opal_mutex_t *mutex_ptr;
ctl_mgmt->ctl_buffs_mgmt[i].available_buffers=
ctl_mgmt->num_buffs_per_mem_bank;
ctl_mgmt->ctl_buffs_mgmt[i].number_of_buffers=
ctl_mgmt->num_buffs_per_mem_bank;
mutex_ptr = &(ctl_mgmt->ctl_buffs_mgmt[i].mutex);
OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);
ctl_mgmt->ctl_buffs_mgmt[i].index_shared_mem_ctl_structs=i;
item = (opal_list_item_t *)&(ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc);
OBJ_CONSTRUCT(item, opal_list_item_t);
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.sm_module =
sm_bcol_module;
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.pool_index = i;
/* get the sm_buffer_mgmt pointer for the control structures */
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.coll_buff = ctl_mgmt;
}
return OMPI_SUCCESS;
}
/*
* this function initializes the internal scratch buffers and control
 * structures that will be used by the module. It also initializes
* the payload buffer management structures.
*/
int base_bcol_basesmuma_setup_library_buffers(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs)
{
int ret=OMPI_SUCCESS,i;
int n_ctl_structs;
size_t ctl_segement_size,total_memory;
int max_elements;
unsigned char *data_ptr;
/* */
/* setup the control struct memory */
if(!cs->sm_ctl_structs) {
ret = mca_bcol_basesmuma_allocate_sm_ctl_memory(cs);
if(OMPI_SUCCESS != ret) {
opal_output (ompi_bcol_base_framework.framework_output, "In bcol_comm_query mca_bcol_basesmuma_allocate_sm_ctl_memory failed\n");
return ret;
}
/*
* put the memory onto the free list - we have worried about
* alignment in the mpool allocation, and assume that the
         * ctl structures have the appropriate size to maintain alignment
*/
/* figure out segment size */
n_ctl_structs=cs->basesmuma_num_mem_banks*
cs->basesmuma_num_regions_per_bank;
/* add memory for the control structure used for recycling the banks */
n_ctl_structs+=cs->basesmuma_num_mem_banks;
ctl_segement_size=n_ctl_structs*
sizeof(mca_bcol_basesmuma_ctl_struct_t);
total_memory=cs->sm_ctl_structs->map_size - (
(char *)(cs->sm_ctl_structs->data_addr)-
(char *)(cs->sm_ctl_structs->map_addr));
total_memory-=cs->my_scratch_shared_memory_size;
max_elements=total_memory/ctl_segement_size;
/* populate the free list */
data_ptr=cs->sm_ctl_structs->data_addr;
for( i=0 ; i < max_elements ; i++ ) {
list_data_t *item = OBJ_NEW(list_data_t);
if( !item ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
item->data=(void *)data_ptr;
opal_list_append(&(cs->ctl_structures),(opal_list_item_t *)item);
data_ptr+=ctl_segement_size;
}
/* set the scratch memory pointer and offset */
cs->my_scratch_shared_memory=(char *)data_ptr;
cs->scratch_offset_from_base_ctl_file=(size_t)
((char *)data_ptr-(char *)cs->sm_ctl_structs->map_addr);
/* At this stage the memory is mapped and ready to use by the local rank.
* However, the memory of other processes has not yet been mmaped into the
* memory of this process.
*/
}
    /* initialize no_userdata_ctl */
sm_bcol_module->no_userdata_ctl=(list_data_t *)
opal_list_remove_last(&(cs->ctl_structures));
if (!sm_bcol_module->no_userdata_ctl) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
    /* initialize userdata_ctl */
sm_bcol_module->userdata_ctl = (list_data_t *)
opal_list_remove_last(&(cs->ctl_structures));
if (!sm_bcol_module->userdata_ctl) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
ret = base_bcol_basesmuma_setup_ctl (sm_bcol_module, cs);
if (OMPI_SUCCESS != ret) {
return ret;
}
ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_no_user_data));
if( OMPI_SUCCESS != ret ) {
return ret;
}
ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_with_user_data));
if( OMPI_SUCCESS != ret ) {
return ret;
}
/* used for blocking recursive doubling barrier */
sm_bcol_module->index_blocking_barrier_memory_bank=0;
/* gather the offsets of the control structs relative to the base
* of the shared memory file, and fill in the table with the
     * addresses of all the control structures.
*/
ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
&(sm_bcol_module->colls_no_user_data),sm_bcol_module->no_userdata_ctl);
if( OMPI_SUCCESS != ret ) {
return ret;
}
ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
&(sm_bcol_module->colls_with_user_data),sm_bcol_module->userdata_ctl);
if( OMPI_SUCCESS != ret ) {
return ret;
}
return OMPI_SUCCESS;
}
OBJ_CLASS_INSTANCE(list_data_t,
opal_list_item_t, NULL, NULL);


@ -1,460 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#include "ompi/proc/proc.h"
#include "ompi/patterns/comm/coll_ops.h"
#include "opal/align.h"
#include "opal/dss/dss.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_hash_table.h"
#include "bcol_basesmuma.h"
#define SM_BACKING_FILE_NAME_MAX_LEN 256
static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length,
size_t addr_offset, size_t alignment,
char *file_name);
struct file_info_t {
uint32_t vpid;
uint32_t jobid;
uint64_t file_size;
uint64_t size_ctl_structure;
uint64_t data_seg_alignment;
char file_name[SM_BACKING_FILE_NAME_MAX_LEN];
};
/* need to allocate space for the peer */
static void bcol_basesmuma_smcm_proc_item_t_construct (bcol_basesmuma_smcm_proc_item_t * item)
{
memset ((char *) item + sizeof (item->item), 0, sizeof (*item) - sizeof (item->item));
}
/* need to free the space for the peer */
static void bcol_basesmuma_smcm_proc_item_t_destruct (bcol_basesmuma_smcm_proc_item_t * item)
{
if (item->sm_mmap) {
OBJ_RELEASE(item->sm_mmap);
}
if (item->sm_file.file_name) {
free (item->sm_file.file_name);
item->sm_file.file_name = NULL;
}
}
OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_proc_item_t,
opal_list_item_t,
bcol_basesmuma_smcm_proc_item_t_construct,
bcol_basesmuma_smcm_proc_item_t_destruct);
static void bcol_basesmuma_smcm_mmap_construct (bcol_basesmuma_smcm_mmap_t *smcm_mmap)
{
memset ((char *) smcm_mmap + sizeof (smcm_mmap->super), 0, sizeof (*smcm_mmap) - sizeof (smcm_mmap->super));
}
static void bcol_basesmuma_smcm_mmap_destruct (bcol_basesmuma_smcm_mmap_t *smcm_mmap)
{
if (smcm_mmap->map_seg) {
munmap ((void *)smcm_mmap->map_seg, smcm_mmap->map_size);
smcm_mmap->map_seg = NULL;
}
if (smcm_mmap->map_path) {
free (smcm_mmap->map_path);
smcm_mmap->map_path = NULL;
}
}
OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_mmap_t, opal_list_item_t,
bcol_basesmuma_smcm_mmap_construct,
bcol_basesmuma_smcm_mmap_destruct);
/* smcm_allgather_connection:
This function is called when a shared memory subgroup wants to establish shared memory "connections" among
a group of processes.
   This function DOES NOT create any shared memory backing files; it only mmaps already existing files. Shared
memory files are created by the shared memory registration function
-----------------------------------------------------------------------------------------------------------
Input params:
- sbgp module The subgrouping module contains the list of ranks to wire up.
- peer_list An opal list containing a list of bcol_basesmuma_smcm_proc_item_t types. This
contains a list of peers whose shared memory files I have already mapped.
Upon completion of the allgather exchange with all members of the group and depending on the
value of "map_all", my peers' shared memory files are mapped into my local virtual memory
space, with all pertinent information being stored in an bcol_basesmuma_smcm_proc_item_t which is
subsequently appended onto the "peer_list".
- comm The ompi_communicator_t communicator.
- input A data struct that caches the information about my shared memory file.
- map_all Bool that determines whether or not to go ahead and map the files from all of the peers
defined in the sbgp-ing module. If map_all == true, then go ahead and mmap all of the files
obtained in the exchange and append the information to the "peer_list". If map_all == false
then make a check and only mmap those peers' files whose vpid/jobid/filename combination do
not already exist in the "peer_list". Once mapping is completed, append this peer's information
to the "peer_list".
-----------------------------------------------------------------------------------------------------------
*
*/
int bcol_basesmuma_smcm_allgather_connection(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_sbgp_base_module_t *module,
opal_list_t *peer_list,
bcol_basesmuma_smcm_proc_item_t ***back_files,
ompi_communicator_t *comm,
bcol_basesmuma_smcm_file_t input,
char *base_fname,
bool map_all)
{
/* define local variables */
int rc, i, fd;
ptrdiff_t mem_offset;
ompi_proc_t *proc_temp, *my_id;
bcol_basesmuma_smcm_proc_item_t *temp;
bcol_basesmuma_smcm_proc_item_t *item_ptr;
bcol_basesmuma_smcm_proc_item_t **backing_files;
struct file_info_t local_file;
struct file_info_t *all_files=NULL;
/* sanity check */
if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) {
opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long: %s len :: %d",
input.file_name, (int) strlen(input.file_name));
return OMPI_ERR_BAD_PARAM;
}
backing_files = (bcol_basesmuma_smcm_proc_item_t **)
calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *));
if (!backing_files) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* FIXME *back_files might have been already allocated
* so free it in order to avoid a memory leak */
if (NULL != *back_files) {
free (*back_files);
}
*back_files = backing_files;
my_id = ompi_proc_local();
/* Phase One:
gather a list of processes that will participate in the allgather - I'm
preparing this list from the sbgp-ing module that was passed into the function */
/* fill in local file information */
local_file.vpid = ((orte_process_name_t*)&my_id->super.proc_name)->vpid;
local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid;
local_file.file_size=input.size;
local_file.size_ctl_structure=input.size_ctl_structure;
local_file.data_seg_alignment=input.data_seg_alignment;
strcpy (local_file.file_name, input.file_name);
/* will exchange this data type as a string of characters -
* this routine is first called before MPI_init() completes
* and before error handling is setup, so can't use the
* MPI data types to send this data */
all_files = (struct file_info_t *) calloc(module->group_size,
sizeof (struct file_info_t));
if (!all_files) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* exchange data */
rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR,
sm_bcol_module->super.sbgp_partner_module->my_index,
sm_bcol_module->super.sbgp_partner_module->group_size,
sm_bcol_module->super.sbgp_partner_module->group_list,
sm_bcol_module->super.sbgp_partner_module->group_comm);
if( OMPI_SUCCESS != rc ) {
opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml. Error code: %d", rc);
goto Error;
}
    /* Phase Four:
       loop through the receive buffer, unpack the data received from remote peers */
for (i = 0; i < module->group_size; i++) {
struct file_info_t *rem_file = all_files + i;
        /* check if this is my index or if the file is already mapped (set above). there
         * is no reason to look through the peer list again because no two members of
         * the group will have the same vpid/jobid pair. ignore this previously found
         * mapping if map_all was requested (NTH: not sure why exactly since we re-map
         * an already mapped file) */
if (sm_bcol_module->super.sbgp_partner_module->my_index == i) {
continue;
}
proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);
OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
/* if the vpid/jobid/filename combination already exists in the list,
then do not map this peer's file --- because you already have */
if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name),
&item_ptr->peer) &&
0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) {
++item_ptr->refcnt;
/* record file data */
backing_files[i] = item_ptr;
break;
}
}
if (!map_all && backing_files[i]) {
continue;
}
temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);
if (!temp) {
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto Error;
}
temp->peer.vpid = rem_file->vpid;
temp->peer.jobid = rem_file->jobid;
temp->sm_file.file_name = strdup (rem_file->file_name);
if (!temp->sm_file.file_name) {
rc = OMPI_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(temp);
goto Error;
}
temp->sm_file.size = (size_t) rem_file->file_size;
temp->sm_file.mpool_size = (size_t) rem_file->file_size;
temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure;
temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment;
temp->refcnt = 1;
/* Phase Five:
If map_all == true, then we map every peer's file
else we check to see if I have already mapped this
vpid/jobid/filename combination and if I have, then
I do not mmap this peer's file.
*
*/
fd = open(temp->sm_file.file_name, O_RDWR, 0600);
if (0 > fd) {
opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. errno = %d",
temp->sm_file.file_name, errno);
rc = OMPI_ERROR;
goto Error;
}
/* map the file */
temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size,
temp->sm_file.size_ctl_structure,
temp->sm_file.data_seg_alignment,
temp->sm_file.file_name);
close (fd);
if (NULL == temp->sm_mmap) {
opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file");
OBJ_RELEASE(temp);
rc = OMPI_ERROR;
goto Error;
}
/* compute memory offset */
mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
(ptrdiff_t) temp->sm_mmap->map_seg;
temp->sm_mmap->map_seg->seg_offset = mem_offset;
temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset;
/* more stuff to follow */
/* append this peer's info, including shared memory map addr, onto the
peer_list */
/* record file data */
backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp;
opal_list_append(peer_list, (opal_list_item_t*) temp);
}
rc = OMPI_SUCCESS;
Error:
/* error clean-up and return */
if (NULL != all_files) {
free(all_files);
}
return rc;
}
int bcol_basesmuma_smcm_release_connections (mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_sbgp_base_module_t *sbgp_module, opal_list_t *peer_list,
bcol_basesmuma_smcm_proc_item_t ***back_files)
{
bcol_basesmuma_smcm_proc_item_t **smcm_procs = *back_files;
for (int i = 0 ; i < sbgp_module->group_size ; ++i) {
if (smcm_procs[i] && 0 == --smcm_procs[i]->refcnt) {
opal_list_remove_item (peer_list, (opal_list_item_t *) smcm_procs[i]);
OBJ_RELEASE(smcm_procs[i]);
}
}
free (smcm_procs);
*back_files = NULL;
return OMPI_SUCCESS;
}
/*
* mmap the specified file as a shared file. No information exchange with other
* processes takes place within this routine.
* This function assumes that the memory has already been allocated, and only the
* mmap needs to be done.
*/
bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr,
size_t length,
size_t alignment,
char* file_name)
{
/* local variables */
int fd = -1;
bcol_basesmuma_smcm_mmap_t *map = NULL;
int rc;
/* if pointer is not allocated - return error. We have no clue how the user will allocate or
* free this memory.
*/
/* open the shared memory backing file */
fd = open(file_name, O_CREAT|O_RDWR,0600);
if (fd < 0) {
opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation open failed with errno: %d",
errno);
return NULL;
}
    /* ensure there is enough space for the backing store */
    rc = ftruncate (fd, length);
    if (0 != rc) {
        opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation ftruncate failed with errno: %d",
                     errno);
        close (fd);
        return NULL;
    }
    map = bcol_basesmuma_smcm_reg_mmap (in_ptr, fd, length, 0, alignment, file_name);
    if (NULL == map) {
        close (fd);
        return NULL;
    }
/* no longer need this file descriptor. close it */
close (fd);
/* takes us to the top of the control structure */
return map;
}
static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length,
size_t addr_offset, size_t alignment,
char *file_name)
{
/* local variables */
bcol_basesmuma_smcm_mmap_t *map;
bcol_basesmuma_smcm_file_header_t *seg;
unsigned char* myaddr = NULL;
int flags = MAP_SHARED;
/* set up the map object */
map = OBJ_NEW(bcol_basesmuma_smcm_mmap_t);
if (OPAL_UNLIKELY(NULL == map)) {
return NULL;
}
/* map the file and initialize the segment state */
if (NULL != in_ptr) {
flags |= MAP_FIXED;
}
seg = (bcol_basesmuma_smcm_file_header_t *)
mmap(in_ptr, length, PROT_READ|PROT_WRITE, flags, fd, 0);
if((void*)-1 == seg) {
OBJ_RELEASE(map);
return NULL;
}
map->map_path = strdup (file_name);
/* the first entry in the file is the control structure. the first entry
in the control structure is an mca_common_sm_file_header_t element */
map->map_seg = seg;
myaddr = (unsigned char *) seg + addr_offset;
    /* if we have a data segment (i.e. if 0 != data_seg_alignment) */
if (alignment) {
myaddr = OPAL_ALIGN_PTR(myaddr, alignment, unsigned char*);
/* is addr past the end of the file? */
if ((unsigned char *) seg+length < myaddr) {
opal_output (ompi_bcol_base_framework.framework_output, "mca_bcol_basesmuma_sm_alloc_mmap: memory region too small len %lu add %p",
(unsigned long) length, (void*)myaddr);
OBJ_RELEASE(map);
munmap ((void *)seg, length);
return NULL;
}
}
map->data_addr = (unsigned char*) myaddr;
map->map_addr = (unsigned char*) seg;
map->map_size = length;
return map;
}


@ -1,105 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef BCOL_BASESMUMA_SMCM_H
#define BCOL_BASESMUMA_SMCM_H
#include <sys/mman.h>
#include <stdio.h>
#include "ompi_config.h"
#include "ompi/proc/proc.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_list.h"
#include "opal/sys/atomic.h"
typedef struct bcol_basesmuma_smcm_file_header_t {
/* lock to control atomic access */
opal_atomic_lock_t seg_lock;
/* is the segment ready for use */
volatile int32_t seg_inited;
/* Offset to next available memory location available for allocation */
size_t seg_offset;
/* total size of the segment */
size_t seg_size;
} bcol_basesmuma_smcm_file_header_t;
typedef struct bcol_basesmuma_smcm_mmap_t {
/* double link list element */
opal_list_item_t super;
    /* pointer to header embedded in the shared memory file */
bcol_basesmuma_smcm_file_header_t *map_seg;
/* base address of the mmap'ed file */
unsigned char *map_addr;
/* base address of data segment */
unsigned char *data_addr;
/* How big it is (in bytes) */
size_t map_size;
/* Filename */
char *map_path;
} bcol_basesmuma_smcm_mmap_t;
OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_mmap_t);
/* Struct that characterizes a shared memory file */
struct bcol_basesmuma_smcm_file_t {
char *file_name;
size_t size;
size_t size_ctl_structure;
size_t data_seg_alignment;
size_t mpool_size;
};
typedef struct bcol_basesmuma_smcm_file_t bcol_basesmuma_smcm_file_t;
struct bcol_basesmuma_smcm_proc_item_t {
opal_list_item_t item; /* can put me on a free list */
int refcnt;
ompi_process_name_t peer;
bcol_basesmuma_smcm_file_t sm_file;
bcol_basesmuma_smcm_mmap_t *sm_mmap; /* Pointer to peer's sm file */
};
typedef struct bcol_basesmuma_smcm_proc_item_t bcol_basesmuma_smcm_proc_item_t;
OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_proc_item_t);
/* allocate shared memory file
* in_ptr - pointer to preallocated memory (if NULL, this will be mmaped)
* alignment - region memory alignment
* file name - fully qualified backing file name
*/
OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr,
size_t length,
size_t alignment,
char* file_name);
OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t* bcol_basesmuma_smcm_create_mmap(int fd,
size_t size, char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment);
#endif


@ -1,103 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "bcol_basesmuma_utils.h"
/*
 * Return the exponent of the largest power of K that is less than
 * or equal to the group size; the power itself is returned through pow_k.
*/
int pow_sm_k(int k, int number, int *pow_k)
{
int power = 0;
int n = 1;
if( 2 == k){
while(n <= number){
power++;
n <<= 1;
}
*pow_k = n >> 1;
} else {
while (n <= number) {
n *= k;
power++;
}
*pow_k = n/k;
}
return (power-1);
}
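/*
 * Illustrative sketch (not part of the original file): a couple of worked
 * values for pow_sm_k(). The helper below is hypothetical and never called;
 * it only documents the expected outputs.
 */
static inline void pow_sm_k_example(void)
{
    int pow_k, exponent;

    exponent = pow_sm_k(2, 5, &pow_k);   /* exponent == 2, pow_k == 4 */
    exponent = pow_sm_k(3, 10, &pow_k);  /* exponent == 2, pow_k == 9 */
    (void) exponent;
    (void) pow_k;
}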
int get_k_nomial_src_list(int group_size,
int radix, int my_index,
int *src_list) {
/* local variables */
int radix_power;
int offset;
int kount = 0;
int src_temp;
radix_power = 1;
offset = 1;
while(offset < group_size) {
if( offset % (radix * radix_power) ) {
src_temp = my_index - offset;
/* wrap around */
if ( src_temp < 0 ) {
src_temp += group_size;
}
/* don't probe ghost nodes */
if( src_temp < group_size ) {
src_list[kount] = src_temp;
kount++;
}
offset+=radix_power;
} else {
radix_power *= radix;
}
}
/* return the actual number of nodes to poll on */
return kount;
}
int get_k_nomial_dst_size(int group_size, int radix, int my_index)
{
int dst_count = 0;
int radix_mask;
int k;
radix_mask = 1;
while (radix_mask < group_size) {
if (0 != my_index % (radix * radix_mask)) {
/* I found my level in tree */
break;
}
radix_mask *= radix;
}
radix_mask /= radix;
while(radix_mask > 0) {
/* For each level of tree, do sends */
for (k = 1;
k < radix && my_index + radix_mask * k < group_size;
++k) {
dst_count += 1 ;
}
radix_mask /= radix;
}
return dst_count;
}
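/*
 * Illustrative sketch (not part of the original file): how the two helpers
 * above might be combined by a caller that polls for k-nomial fan-out
 * arrivals. The array size and all names here are hypothetical.
 */
static inline void k_nomial_usage_example(int group_size, int radix, int my_index)
{
    int src_list[64];   /* assumed large enough for radix * log_radix(group_size) entries */
    int n_src, n_dst;

    /* ranks from which data may arrive at this rank during the fan-out */
    n_src = get_k_nomial_src_list(group_size, radix, my_index, src_list);
    /* number of ranks this rank forwards data to during the fan-out */
    n_dst = get_k_nomial_dst_size(group_size, radix, my_index);
    (void) n_src;
    (void) n_dst;
}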


@ -1,64 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_BASESMUMA_UTILS_H
#define MCA_BCOL_BASESMUMA_UTILS_H
#include "ompi_config.h"
BEGIN_C_DECLS
#define BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,radix,relative_index, \
my_group_index, group_size, ready_flag) \
do { \
int k, child; \
while(radix_mask > 0){ \
for(k = 1; k < radix && relative_index+radix_mask*k<group_size; \
k++) {\
child = my_group_index+radix_mask*k; \
if(child >= group_size) { \
child -= group_size; \
} \
/*fprintf(stderr,"I am %d sending to child %d\n",my_group_index,child);*/ \
child_ctl_pointer = data_buffs[child].ctl_struct; \
child_ctl_pointer->src = my_group_index; \
/* this can be improved to make better asynchronous progress, but it's
* fine for now.
*/ \
while(child_ctl_pointer->sequence_number != sequence_number ); \
child_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; \
} \
radix_mask = radix_mask/radix; \
} \
} while( 0 )
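/*
 * Illustrative sketch (not part of the original header): the child-enumeration
 * pattern used by the macro above, written out as a plain helper so the loop
 * structure is easier to follow. The function name is hypothetical.
 */
static inline int basesmuma_k_nomial_list_children(int radix, int radix_mask,
                                                   int relative_index, int my_group_index,
                                                   int group_size, int *children)
{
    int count = 0;

    while (radix_mask > 0) {
        for (int k = 1; k < radix && relative_index + radix_mask * k < group_size; ++k) {
            int child = my_group_index + radix_mask * k;
            if (child >= group_size) {
                child -= group_size;   /* wrap around, as in the macro above */
            }
            children[count++] = child;
        }
        radix_mask /= radix;
    }
    return count;
}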
/*
 * Return the exponent of the largest power of K that is less than or equal to the
 * group size; the power itself is returned through pow_k_group_size.
*/
int pow_sm_k(int radix_k, int group_size, int *pow_k_group_size);
/*
* Get list of possible sources from which data may arrive based on a K-nomial tree fan-out.
*/
int get_k_nomial_src_list(int group_size, int radix,
int my_index, int *src_list);
int get_k_nomial_dst_size(int group_size, int radix, int my_index);
END_C_DECLS
#endif


@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained


@ -1,805 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_H
#define MCA_BCOL_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/mca/sbgp/sbgp.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/op/op.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/patterns/net/netpatterns_knomial_tree.h"
#include "opal/util/show_help.h"
#include <limits.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Forward declaration - please do not remove it */
struct ml_buffers_t;
struct mca_bcol_base_coll_fn_comm_attributes_t;
struct mca_bcol_base_coll_fn_invoke_attributes_t;
struct mca_bcol_base_coll_fn_desc_t;
#define NUM_MSG_RANGES 5
#define MSG_RANGE_INITIAL (1024)*12
#define MSG_RANGE_INC 10
#define BCOL_THRESHOLD_UNLIMITED (INT_MAX)
/* Maximum size of a bcol's header. This allows us to correctly calculate the message
* thresholds. If the header of any bcol exceeds this value then increase this one
* to match. */
#define BCOL_HEADER_MAX 96
#define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */
/*
* Functions supported
*/
enum bcol_coll {
/* blocking functions */
BCOL_ALLGATHER,
BCOL_ALLGATHERV,
BCOL_ALLREDUCE,
BCOL_ALLTOALL,
BCOL_ALLTOALLV,
BCOL_ALLTOALLW,
BCOL_BARRIER,
BCOL_BCAST,
BCOL_EXSCAN,
BCOL_GATHER,
BCOL_GATHERV,
BCOL_REDUCE,
BCOL_REDUCE_SCATTER,
BCOL_SCAN,
BCOL_SCATTER,
BCOL_SCATTERV,
BCOL_FANIN,
BCOL_FANOUT,
/* nonblocking functions */
BCOL_IALLGATHER,
BCOL_IALLGATHERV,
BCOL_IALLREDUCE,
BCOL_IALLTOALL,
BCOL_IALLTOALLV,
BCOL_IALLTOALLW,
BCOL_IBARRIER,
BCOL_IBCAST,
BCOL_IEXSCAN,
BCOL_IGATHER,
BCOL_IGATHERV,
BCOL_IREDUCE,
BCOL_IREDUCE_SCATTER,
BCOL_ISCAN,
BCOL_ISCATTER,
BCOL_ISCATTERV,
BCOL_IFANIN,
BCOL_IFANOUT,
BCOL_SYNC,
/* New function - needed for intermediate steps */
BCOL_REDUCE_TO_LEADER,
BCOL_NUM_OF_FUNCTIONS
};
typedef enum bcol_coll bcol_coll;
typedef enum bcol_elem_type {
BCOL_SINGLE_ELEM_TYPE,
BCOL_MULTI_ELEM_TYPE,
BCOL_NUM_OF_ELEM_TYPES
} bcol_elem_type;
typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name);
typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num);
/*
* Collective function status
*/
enum {
BCOL_FN_NOT_STARTED = (OMPI_ERR_MAX - 1),
BCOL_FN_STARTED = (OMPI_ERR_MAX - 2),
BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3)
};
/**
* Collective component initialization
*
* Initialize the given collective component. This function should
 * initialize any component-level data. It will be called exactly
* once during MPI_INIT.
*
* @note The component framework is not lazily opened, so attempts
 * should be made to minimize the amount of memory allocated during
* this function.
*
* @param[in] enable_progress_threads True if the component needs to
* support progress threads
* @param[in] enable_mpi_threads True if the component needs to
* support MPI_THREAD_MULTIPLE
*
* @retval OMPI_SUCCESS Component successfully initialized
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*mca_bcol_base_component_init_query_fn_t)
(bool enable_progress_threads, bool enable_mpi_threads);
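/*
 * Illustrative sketch (not part of the original header): the shape of an
 * init-query hook for a hypothetical bcol component that needs no global
 * setup.
 */
static inline int mca_bcol_example_init_query(bool enable_progress_threads,
                                              bool enable_mpi_threads)
{
    (void) enable_progress_threads;
    (void) enable_mpi_threads;
    return OMPI_SUCCESS;
}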
/**
* Query whether a component is available for the given sub-group
*
* Query whether the component is available for the given
* sub-group. If the component is available, an array of pointers should be
* allocated and returned (with refcount at 1). The module will not
* be used for collective operations until module_enable() is called
* on the module, but may be destroyed (via OBJ_RELEASE) either before
* or after module_enable() is called. If the module needs to release
* resources obtained during query(), it should do so in the module
* destructor.
*
 * A component may return NULL from this function to indicate that it does not
 * wish to run, or it may return an error during module_enable().
*
* @note The communicator is available for point-to-point
* communication, but other functionality is not available during this
* phase of initialization.
*
* @param[in] sbgp Pointer to sub-group module.
* @param[out] priority Priority setting for component on
* this communicator
 * @param[out] num_modules Number of modules that were generated
* for the sub-group module.
*
 * @returns An array of pointers to initialized module structures if the component can
 * provide modules with the requested functionality, or NULL if the
 * component should not be used on the given communicator.
*/
typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t)
(mca_sbgp_base_module_t *sbgp, int *num_modules);
typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
/*
 * Macro for use in modules that are of type bcol v2.0.0
*/
#define MCA_BCOL_BASE_VERSION_2_0_0 \
OMPI_MCA_BASE_VERSION_2_1_0("bcol", 2, 0, 0)
/* This is really an abstraction violation, but is the easiest way to get
* started. For memory management we need to know what bcol components
* have compatible memory management schemes. Such compatibility can
* be used to eliminate memory copies between levels in the collective
* operation hierarchy, by having the output buffer of one level be the
* input buffer to the next level
*/
enum {
BCOL_SHARED_MEMORY_UMA=0,
BCOL_SHARED_MEMORY_SOCKET,
BCOL_POINT_TO_POINT,
BCOL_IB_OFFLOAD,
BCOL_SIZE
};
OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE];
OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE];
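/*
 * Illustrative sketch (not part of the original header): consulting the
 * compatibility matrix declared above to decide whether two hierarchy levels
 * can pass buffers to each other without an intermediate copy. The function
 * name and its use are hypothetical.
 */
static inline bool mca_bcol_base_levels_share_buffers(int upper_bcol_type,
                                                      int lower_bcol_type)
{
    /* a non-zero entry means the output buffer of one level may serve
     * directly as the input buffer of the next level */
    return 0 != bcol_mpool_compatibility[upper_bcol_type][lower_bcol_type];
}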
/* what are the input parameters ? too many void * pointers here */
typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base,
size_t size, void **reg_desc);
/* deregistration function */
typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc);
/* Bcol network context definition */
struct bcol_base_network_context_t {
opal_object_t super;
/* Context id - defined by upper layer, ML */
int context_id;
    /* Any context information that the bcol wants to use */
void *context_data;
/* registration function */
bcol_register_mem_fn_t register_memory_fn;
/* deregistration function */
bcol_deregister_mem_fn_t deregister_memory_fn;
};
typedef struct bcol_base_network_context_t bcol_base_network_context_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t);
/*
*primitive function types
*/
/* bcast */
enum {
/* small data function */
BCOL_BCAST_SMALL_DATA,
/* small data - dynamic decision making supported */
BCOL_BCAST_SMALL_DATA_DYNAMIC,
/* number of functions */
BCOL_NUM_BCAST_FUNCTIONS
};
/**
* BCOL instance.
*/
/* no limit on fragment size - this supports using user buffers rather
* than library buffers
*/
#define FRAG_SIZE_NO_LIMIT -1
/* forward declaration */
struct coll_bcol_collective_description_t;
struct mca_bcol_base_component_2_0_0_t {
/** Base component description */
mca_base_component_t bcol_version;
/** Component initialization function */
mca_bcol_base_component_init_query_fn_t collm_init_query;
/** Query whether component is useable for given communicator */
mca_bcol_base_component_comm_query_fn_t collm_comm_query;
/** If bcol supports all possible data types */
mca_bcol_base_module_coll_support_fn_t coll_support;
/** If bcol supports all possible data types for given collective operation */
mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types;
    /** Use this flag to prevent multiple init_query calls
        in case we have the same bcol on more than a single level */
bool init_done;
/** If collective calls with bcols of this type need to be ordered */
bool need_ordering;
/** MCA parameter: Priority of this component */
int priority;
/** Bcast function pointers */
struct coll_bcol_collective_description_t *
bcast_functions[BCOL_NUM_BCAST_FUNCTIONS];
/** Number of network contexts - need this for resource management */
int n_net_contexts;
/** List of network contexts */
bcol_base_network_context_t **network_contexts;
/*
* Fragmentation support
*/
    /** Minimum fragment size */
int min_frag_size;
/** Maximum fragment size */
int max_frag_size;
/** Supports direct use of user-buffers */
bool can_use_user_buffers;
};
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t;
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t);
/* forward declaration */
struct mca_coll_ml_descriptor_t;
struct mca_bcol_base_payload_buffer_desc_t;
struct mca_bcol_base_route_info_t;
typedef struct {
int order_num; /* Seq num of collective fragment */
    int bcols_started; /* How many bcols that need ordering have been started */
    int n_fns_need_ordering; /* The number of functions called for bcols that need ordering */
} mca_bcol_base_order_info_t;
/* structure that encapsulates information propagated amongst multiple
* fragments whereby completing the entire ensemble of fragments is
* necessary in order to complete the entire collective
*/
struct bcol_fragment_descriptor_t {
/* start iterator */
int head;
/* end iterator */
int tail;
/* current iteration */
int start_iter;
/* number of full iterations this frag */
int num_iter;
/* end iter */
int end_iter;
};
typedef struct bcol_fragment_descriptor_t bcol_fragment_descriptor_t;
struct bcol_function_args_t {
/* full message sequence number */
int64_t sequence_num;
/* full message descriptor - single copy of fragment invariant
* parameters */
    /* Pasha: We don't need this one for the new flow - remove it */
struct mca_coll_ml_descriptor_t *full_message_descriptor;
struct mca_bcol_base_route_info_t *root_route;
/* function status */
int function_status;
/* root, for rooted operations */
int root;
/* input buffer */
const void *sbuf;
void *rbuf;
const void *userbuf;
struct mca_bcol_base_payload_buffer_desc_t *src_desc;
struct mca_bcol_base_payload_buffer_desc_t *dst_desc;
/* ml buffer size */
uint32_t buffer_size;
/* index of buffer in ml payload cache */
int buffer_index;
int count;
struct ompi_datatype_t *dtype;
struct ompi_op_t *op;
int sbuf_offset;
int rbuf_offset;
/* for bcol opaque data */
void *bcol_opaque_data;
/* An output argument that will be used by BCOL function to tell ML that the result of the BCOL is in rbuf */
bool result_in_rbuf;
bool root_flag; /* True if the rank is root of operation */
bool need_dt_support; /* will trigger alternate code path for some colls */
int status; /* Used for non-blocking collective completion */
uint32_t frag_size; /* fragment size for large messages */
    int hier_factor; /* factor used when bcast is invoked as a service function back down
                      * the tree (in allgather, for example); the pacl_len is not the actual
                      * length of the data needing to be bcast
*/
mca_bcol_base_order_info_t order_info;
bcol_fragment_descriptor_t frag_info;
};
struct mca_bcol_base_route_info_t {
int level;
int rank;
};
typedef struct mca_bcol_base_route_info_t mca_bcol_base_route_info_t;
struct mca_bcol_base_lmngr_block_t {
opal_list_item_t super;
struct mca_coll_ml_lmngr_t *lmngr;
void* base_addr;
};
typedef struct mca_bcol_base_lmngr_block_t mca_bcol_base_lmngr_block_t;
OBJ_CLASS_DECLARATION(mca_bcol_base_lmngr_block_t);
struct mca_bcol_base_memory_block_desc_t {
/* memory block for payload buffers */
struct mca_bcol_base_lmngr_block_t *block;
/* Address offset in bytes -- Indicates free memory in the block */
uint64_t block_addr_offset;
/* size of the memory block */
size_t size_block;
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* pointer to buffer descriptors initialized */
struct mca_bcol_base_payload_buffer_desc_t *buffer_descs;
/* index of the next free buffer in the block */
uint64_t next_free_buffer;
uint32_t *bank_release_counters;
    /* Counter that defines which bank should be synchronized next.
     * Since collectives could be completed out of order, we have to make
     * sure that memory synchronization collectives are started in order! */
int memsync_counter;
    /* This array of flags is used to signal that the bank is ready for recycling */
bool *ready_for_memsync;
    /* This flag monitors whether the bank is open for usage. Usually we expect that the user
     * will do the check only on buffer-zero allocation */
bool *bank_is_busy;
};
/* convenience typedef */
typedef struct mca_bcol_base_memory_block_desc_t mca_bcol_base_memory_block_desc_t;
typedef void (*mca_bcol_base_release_buff_fn_t)(struct mca_bcol_base_memory_block_desc_t *ml_memblock, uint32_t buff_id);
struct mca_bcol_base_payload_buffer_desc_t {
void *base_data_addr; /* buffer address */
void *data_addr; /* buffer address + header offset */
uint64_t generation_number; /* my generation */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
};
/* convenience typedef */
typedef struct mca_bcol_base_payload_buffer_desc_t mca_bcol_base_payload_buffer_desc_t;
typedef struct bcol_function_args_t bcol_function_args_t;
/* The collective operation is defined by a series of collective operations
* invoked through a function pointer. Each function may be different,
* so will store the arguments in a struct and pass a pointer to the struct,
* and use this as a way to hide the different function signatures.
*
* @param[in] input_args Structure with function arguments
 * @param[in] bcol_desc Component specific parameters
* @param[out] status return status of the function
* MCA_BCOL_COMPLETE - function completed
* MCA_BCOL_IN_PROGRESS - function incomplete
*
* @retval OMPI_SUCCESS successful completion
* @retval OMPI_ERROR function returned error
*/
/* forward declaration */
struct mca_bcol_base_module_t;
/* collective function prototype - all functions have the same interface
* so that we can call them via a function pointer */
struct mca_bcol_base_function_t;
typedef int (*mca_bcol_base_module_collective_fn_primitives_t)
(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args);
typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t)
(struct mca_bcol_base_module_t *bcol_module);
/**
 * function to query for collective function attributes
*
* @param attribute (IN) the attribute of interest
* @param algorithm_parameters (OUT) the value of attribute for this
* function. If this attribute is not supported,
* OMPI_ERR_NOT_FOUND is returned.
*/
typedef int (*mca_bcol_get_collective_attributes)(int attribute,
void *algorithm_parameters);
/* data structure for tracking the relevant data needed for ml level
* algorithm construction (e.g., function selection), initialization, and
* usage.
*/
struct coll_bcol_collective_description_t {
    /* collective initiation function - first function called */
    mca_bcol_base_module_collective_fn_primitives_t coll_fn;
    /* collective progress function - called to advance an outstanding collective */
    mca_bcol_base_module_collective_fn_primitives_t progress_fn;
    /* attribute query function */
mca_bcol_get_collective_attributes get_attributes;
/* attributes supported - bit map */
uint64_t attribute;
};
typedef struct coll_bcol_collective_description_t
coll_bcol_collective_description_t;
/* collective operation attributes */
enum {
/* supports dynamic decisions - e.g., do not need to have the collective
* operation fully defined before it can be started
*/
BCOL_ATTRIBUTE_DYNAMIC,
/* number of attributes */
BCOL_NUM_ATTRIBUTES
};
/* For rooted collectives,
 * does the algorithm know its data source?
*/
enum {
DATA_SRC_KNOWN=0,
DATA_SRC_UNKNOWN,
DATA_SRC_TYPES
};
enum {
BLOCKING,
NON_BLOCKING
};
/* gvm For selection logic */
struct mca_bcol_base_coll_fn_comm_attributes_t {
int bcoll_type;
int comm_size_min;
int comm_size_max;
int data_src;
int waiting_semantics;
};
typedef struct mca_bcol_base_coll_fn_comm_attributes_t
mca_bcol_base_coll_fn_comm_attributes_t;
struct mca_bcol_base_coll_fn_invoke_attributes_t {
int bcol_msg_min;
int bcol_msg_max;
uint64_t datatype_bitmap; /* Max is OMPI_DATATYPE_MAX_PREDEFINED defined to be 45 */
uint32_t op_types_bitmap; /* bit map of optypes supported */
};
typedef struct mca_bcol_base_coll_fn_invoke_attributes_t
mca_bcol_base_coll_fn_invoke_attributes_t;
struct mca_bcol_base_coll_fn_desc_t {
opal_list_item_t super;
struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr;
struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr;
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
};
typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t;
OBJ_CLASS_DECLARATION(mca_bcol_base_coll_fn_desc_t);
/* end selection logic */
typedef int (*mca_bcol_base_module_collective_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
/* per communicator memory initialization function */
typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers,
mca_bcol_base_component_t *module);
/* Initialize memory block - ml_memory_block initialization interface function
*
* Invoked at the ml level, used to pass bcol specific registration information
* for the "ml_memory_block"
*
* @param[in] ml_memory_block Pointer to the ml_memory_block. This struct
* contains bcol specific registration information and a call back function
* used for resource recycling.
*
* @param[in] reg_data bcol specific registration data.
*
* @returns On Success: OMPI_SUCCESS
* On Failure: OMPI_ERROR
*
*/
/*typedef int (*mca_bcol_base_init_memory_fn_t)
(struct mca_bcol_base_memory_block_desc_t *ml_block, void *reg_data);*/
typedef int (*mca_bcol_base_init_memory_fn_t)
(struct mca_bcol_base_memory_block_desc_t *payload_block,
uint32_t data_offset,
struct mca_bcol_base_module_t *bcol,
void *reg_data);
typedef int (*mca_common_allgather_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
typedef void (*mca_bcol_base_set_thresholds_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
enum {
MCA_BCOL_BASE_ZERO_COPY = 1,
MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG = 1 << 1,
MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER = 1 << 2
};
/* base module */
struct mca_bcol_base_module_t {
/* base coll component */
opal_object_t super;
    /* bcol component (Pasha: Do we really need to cache the component?) */
mca_bcol_base_component_t *bcol_component;
/* network context that is used by this bcol
only one context per bcol is allowed */
bcol_base_network_context_t *network_context;
    /* We are going to use the context index a lot;
       in order to decrease the number of dereferences of
       bcol->network_context->index,
       we are caching the value on the bcol */
int context_index;
/* Set of flags that describe features supported by bcol */
uint64_t supported_mode;
/* per communicator memory initialization function */
mca_bcol_module_mem_init init_module;
/* sub-grouping module partner */
mca_sbgp_base_module_t *sbgp_partner_module;
/* size of subgroup - cache this, so can have access when
     * sbgp_partner_module no longer exists */
int size_of_subgroup;
/* sequence number offset - want to make sure that we start
* id'ing collectives with id 0, so we can have simple
* resource management.
*/
int64_t squence_number_offset;
/* number of times to poll for operation completion before
* breaking out of a non-blocking collective operation
*/
int n_poll_loops;
/* size of header that will go in data buff, should not include
* any info regarding alignment, let the ml level handle this
*/
uint32_t header_size;
/* Each bcol is assigned a unique value
* see if we can get away with 16-bit id
*/
int16_t bcol_id;
/*FIXME:
* Since mca_bcol_base_module_t is the only parameter which will be passed
* into the bcol_basesmuma_bcast_init(), add the flag to indicate whether
* the hdl-based algorithms will get enabled.
*/
bool use_hdl;
/*
* Collective function pointers
*/
/* changing function signature - will replace bcol_functions */
mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS];
/* Tables hold pointers to functions */
mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS];
opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS];
struct mca_bcol_base_coll_fn_desc_t*
filtered_fns_table[DATA_SRC_TYPES][2][BCOL_NUM_OF_FUNCTIONS][NUM_MSG_RANGES+1][OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED];
/*
* Bcol interface function to pass bcol specific
* info and memory recycling call back
*/
mca_bcol_base_init_memory_fn_t bcol_memory_init;
/*
* netpatterns interface function, would like to invoke this on
* on the ml level
*/
mca_common_allgather_init_fn_t k_nomial_tree;
/* Each bcol caches a list which describes how many ranks
* are "below" each rank in this bcol
*/
int *list_n_connected;
/* offsets for scatter/gather */
int hier_scather_offset;
/* Small message threshold for each collective */
int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];
/* Set small_message_thresholds array */
mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds;
/* Pointer to the order counter on the upper layer,
used if the bcol needs to be ordered */
int *next_inorder;
};
typedef struct mca_bcol_base_module_t mca_bcol_base_module_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_module_t);
/* function description */
struct mca_bcol_base_function_t {
int fn_idx;
/* module */
struct mca_bcol_base_module_t *bcol_module;
/*
* The following two parameters are used for bcol modules
* that want to do some optimizations based on the fact that
* n functions from the same bcol module are called in a row.
* For example, in the iboffload case, on the first call one
* will want to initialize the MWR, and start to instantiate
* it, but only post it at the end of the last call.
* The index of this function in a sequence of consecutive
* functions from the same bcol
*/
int index_in_consecutive_same_bcol_calls;
/* number of times functions from this bcol are
* called in order
*/
int n_of_this_type_in_a_row;
/*
* number of times functions from this module are called in the
* collective operation.
*/
int n_of_this_type_in_collective;
int index_of_this_type_in_collective;
};
typedef struct mca_bcol_base_function_t mca_bcol_base_function_t;
struct mca_bcol_base_descriptor_t {
opal_free_list_item_t super;
/* Vasily: will be described in the future */
};
typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t;
static inline __opal_attribute_always_inline__ size_t
mca_bcol_base_get_buff_length(ompi_datatype_t *dtype, int count)
{
ptrdiff_t lb, extent;
ompi_datatype_get_extent(dtype, &lb, &extent);
return (size_t) (extent * count);
}
#define MCA_BCOL_CHECK_ORDER(module, bcol_function_args) \
do { \
if (*((module)->next_inorder) != \
(bcol_function_args)->order_info.order_num) { \
return BCOL_FN_NOT_STARTED; \
} \
} while (0);
#define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info) \
do { \
(order_info)->bcols_started++; \
if ((order_info)->n_fns_need_ordering == \
(order_info)->bcols_started) { \
++(*((module)->next_inorder)); \
} \
} while (0);
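/*
 * Illustrative sketch (not part of the original header): how a hypothetical
 * bcol primitive with the standard signature might combine the two ordering
 * macros above when its component sets need_ordering.
 */
static inline int mca_bcol_base_example_ordered_fn(bcol_function_args_t *input_args,
                                                   struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_base_module_t *module = const_args->bcol_module;

    /* do not start until it is this fragment's turn */
    MCA_BCOL_CHECK_ORDER(module, input_args);

    /* ... the actual collective work would be done here ... */

    /* record that this bcol function has started so the next one may proceed */
    MCA_BCOL_UPDATE_ORDER_COUNTER(module, &input_args->order_info);
    return BCOL_FN_COMPLETE;
}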
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_BCOL_H */



@ -1,66 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(bcol_iboffload_CPPFLAGS) $(btl_openib_CPPFLAGS)
sources = \
bcol_iboffload.h \
bcol_iboffload_device.h \
bcol_iboffload_module.c \
bcol_iboffload_mca.h \
bcol_iboffload_mca.c \
bcol_iboffload_endpoint.h \
bcol_iboffload_endpoint.c \
bcol_iboffload_frag.h \
bcol_iboffload_frag.c \
bcol_iboffload_collfrag.h \
bcol_iboffload_collfrag.c \
bcol_iboffload_task.h \
bcol_iboffload_task.c \
bcol_iboffload_component.c \
bcol_iboffload_barrier.c \
bcol_iboffload_bcast.h \
bcol_iboffload_bcast.c \
bcol_iboffload_allgather.c \
bcol_iboffload_collreq.h \
bcol_iboffload_collreq.c \
bcol_iboffload_qp_info.c \
bcol_iboffload_qp_info.h \
bcol_iboffload_fanin.c \
bcol_iboffload_fanout.c \
bcol_iboffload_allreduce.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_iboffload_DSO
component_install += mca_bcol_iboffload.la
else
component_noinst += libmca_bcol_iboffload.la
endif
# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_iboffload_la_SOURCES = $(sources)
mca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS)
mca_bcol_iboffload_la_LIBADD = $(btl_openib_LIBS) $(bcol_iboffload_LIBS) \
$(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm/libmca_common_ofacm.la \
$(OMPI_TOP_BUILDDIR)/ompi/mca/common/verbs/libmca_common_verbs.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_iboffload_la_SOURCES =$(sources)
libmca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS)


@ -1,765 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_H
#define MCA_BCOL_IBOFFLOAD_H
#include "ompi_config.h"
#include <stdio.h>
#include <assert.h>
#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include "ompi/mca/mca.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/datatype/ompi_datatype_internal.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"
#include "ompi/mca/common/ofacm/connect.h"
#include "bcol_iboffload_qp_info.h"
BEGIN_C_DECLS
#define IMM_RDMA 1
#define INLINE 1
#define NO_INLINE 0
#define MCA_IBOFFLOAD_CALC_SIZE_EXT 8
#define MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE 8
#define MCA_IBOFFLOAD_CACHE_LINE_SIZE 128
#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC_SEND
#else
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC
#endif
/* 0 - barrier rdma info
1 - ML rdma info */
#define MAX_REMOTE_RDMA_INFO 2
/* forward declarations */
struct mca_bcol_iboffload_module_t;
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_endpoint_t;
struct mca_bcol_iboffload_frag_t;
struct mca_bcol_iboffload_task_t;
struct mca_bcol_iboffload_qp_info_t;
struct mca_bcol_iboffload_collfrag_t;
struct mca_bcol_iboffload_algth_lst_t;
struct mca_bcol_iboffload_device_t;
typedef int (*mca_bcol_iboffload_coll_algth_fn_t) (
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
struct mca_bcol_iboffload_rdma_info_t {
uint64_t addr;
uint32_t rkey;
uint32_t lkey;
};
typedef struct mca_bcol_iboffload_rdma_info_t mca_bcol_iboffload_rdma_info_t;
struct mca_bcol_iboffload_rdma_buffer_desc_t {
void *data_addr; /* buffer address */
uint64_t generation_number; /* my generation */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
};
typedef struct mca_bcol_iboffload_rdma_buffer_desc_t mca_bcol_iboffload_rdma_buffer_desc_t;
struct mca_bcol_iboffload_rdma_block_desc_t {
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* data offset from ML */
uint32_t data_offset;
/* pointer to buffer descriptors initialized */
mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rdma_block_desc_t mca_bcol_iboffload_rdma_block_desc_t;
/* Information that we need to keep in order to access remote
memory. For each remote peer (endpoint) we will keep this
structure */
struct mca_bcol_iboffload_rem_rdma_block_t {
/* IB related information first */
mca_bcol_iboffload_rdma_info_t ib_info;
mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rem_rdma_block_t mca_bcol_iboffload_rem_rdma_block_t;
enum {
MCA_BCOL_IBOFFLOAD_BK_COUNTER_INDEX = 0,
MCA_BCOL_IBOFFLOAD_BK_SYNC_INDEX,
MCA_BCOL_IBOFFLOAD_BK_LAST
};
/* Information that we need to keep in order to access and
track local memory that is used as source and destination
for RDMA operations */
struct mca_bcol_iboffload_local_rdma_block_t {
/* sync counter keeps next to start bank id */
int sync_counter;
/* Counter for released ml buffers */
int *bank_buffer_counter[MCA_BCOL_IBOFFLOAD_BK_LAST];
/* IB related information first */
struct mca_bcol_iboffload_rdma_info_t ib_info;
/* back pointer to original ML memory descriptor */
struct mca_bcol_base_memory_block_desc_t *ml_mem_desc;
/* Pasha: do we really need this one ?*/
/* caching ml memory descriptor configurations locally */
mca_bcol_iboffload_rdma_block_desc_t bdesc;
};
typedef struct mca_bcol_iboffload_local_rdma_block_t mca_bcol_iboffload_local_rdma_block_t;
struct mca_bcol_iboffload_recv_wr_manager {
opal_mutex_t lock;
/** Array of ready-to-use receive work requests.
 * It is a 2-dimensional array since for each
 * QP size we want to keep a separate recv WR */
struct ibv_recv_wr **recv_work_requests;
};
typedef struct mca_bcol_iboffload_recv_wr_manager mca_bcol_iboffload_recv_wr_manager;
/**
 * Structure to hold the basic iboffload bcol component. First it holds the
 * base bcol component, and then holds a bunch of
 * iboffload-component-specific stuff (e.g., current MCA param
* values).
*/
struct mca_bcol_iboffload_component_t {
/** Base coll component */
mca_bcol_base_component_2_0_0_t super;
/** Enable disable verbose mode */
int verbose;
int num_qps;
/** Whether we want a warning if non default GID prefix is not configured
on multiport setup */
bool warn_default_gid_prefix;
/** Whether we want a warning if the user specifies a non-existent
device and/or port via bcol_iboffload_if_[in|ex]clude MCA params */
bool warn_nonexistent_if;
/** initial size of free lists */
int free_list_num;
/** maximum size of free lists */
int free_list_max;
/** number of elements to alloc when growing free lists */
int free_list_inc;
/** name of ib memory pool */
char* mpool_name;
/** max outstanding CQE on the CQ */
int cq_size;
/** Max size of inline data */
unsigned int max_inline_data;
/** IB partition definition */
uint32_t pkey_val;
/** Outstanding atomic reads */
unsigned int qp_ous_rd_atom;
/** IB MTU */
int mtu;
/** Recv not ready timer */
int min_rnr_timer;
/** IB timeout */
int timeout;
/** IB retry count */
int retry_count;
/** Recv not ready retry count */
int rnr_retry;
/** IB maximum pending RDMA */
int max_rdma_dst_ops;
/** IB Service level (QOS) */
int service_level;
/** Preferred communication buffer alignment in Bytes (must be power of two) */
int buffer_alignment;
/** Max tasks number for MQ */
int max_mqe_tasks;
/** Max MQ size */
int max_mq_size;
/** HCA/Port include exclude list */
char *if_include;
char **if_include_list;
char *if_exclude;
char **if_exclude_list;
/** Dummy argv-style list; a copy of names from the
if_[in|ex]clude list that we use for error checking (to ensure
that they all exist) */
char **if_list;
/** Array of ibv devices */
struct ibv_device **ib_devs;
/** devices count */
int num_devs;
/** MCA param bcol_iboffload_receive_queues */
char *receive_queues;
/** Common info about all kinds of QPs on each iboffload module */
struct mca_bcol_iboffload_qp_info_t qp_infos[MCA_BCOL_IBOFFLOAD_QP_LAST];
/** Array of iboffload devices */
opal_pointer_array_t devices;
/** Free lists of collfrag descriptors */
ompi_free_list_t collfrags_free;
/** Free lists of outstanding collective operations */
ompi_free_list_t collreqs_free;
/** Free lists for free task operations */
ompi_free_list_t tasks_free;
/** Free lists for free calc task operations */
ompi_free_list_t calc_tasks_free;
/** Free list of empty frags, that do not keep any
registration information */
ompi_free_list_t ml_frags_free;
/** Recv work request manager */
mca_bcol_iboffload_recv_wr_manager recv_wrs;
/** We allocate some resources on the component
* with creating of the first iboffload module
* and set this flag to true */
bool init_done;
/** Maximal number of fragments of the same collective request that can be sent in parallel */
unsigned int max_pipeline_depth;
/** array mapping Open MPI reduction operators to MVerbs reduction operators */
enum ibv_m_wr_calc_op map_ompi_to_ib_calcs[OMPI_OP_NUM_OF_TYPES];
/** array mapping Open MPI data types to MVerbs data types */
enum ibv_m_wr_data_type map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
/** The order of the exchange tree */
int exchange_tree_order;
/** Knomial tree order */
int knomial_tree_order;
/** K-nomial radix */
int k_nomial_radix;
/** Maximum number of pulls for completion check */
int max_progress_pull;
/** Barrier function selector */
int barrier_mode;
/** MCA for selecting Bruck's alltoall algorithms */
int use_brucks_smsg_alltoall_rdma;
int use_brucks_smsg_alltoall_sr;
/** radix of small-data alltoall Bruck-like algorithm */
int k_alltoall_bruck_radix;
/** alltoall small data buffer alignment */
int tmp_buf_alignment;
};
/**
* Convenience typedef
*/
typedef struct mca_bcol_iboffload_component_t mca_bcol_iboffload_component_t;
/* List of all algorithms that we use */
enum {
FANIN_ALG,
FANOUT_ALG,
RECURSIVE_DOUBLING_BARRIER_ALG,
RECURSIVE_KNOMIAL_BARRIER_ALG,
RECURSIVE_DOUBLING_ALLREDUCE_ALG,
RECURSIVE_DOUBLING_REDUCE_ALG,
RECURSIVE_DOUBLING_TREE_BCAST,
ALL_ENDPOINTS, /* connected to all peers */
ALLGATHER_KNOMIAL_ALG,
ALLGATHER_NEIGHBOR_ALG,
REMOTE_EXCHANGE_ALG,
LAST_ALG
};
struct mca_bcol_iboffload_port_t {
int id; /** Port number on device: 1 or 2 */
int stat; /** Port status - Active,Init,etc.. */
enum ibv_mtu mtu; /** MTU on this port */
uint64_t subnet_id; /** Subnet id for the port */
uint16_t lid;
uint16_t lmc;
};
typedef struct mca_bcol_iboffload_port_t mca_bcol_iboffload_port_t;
enum {
COLL_MQ = 0,
SERVICE_MQ,
BCOL_IBOFFLOAD_MQ_NUM
};
struct mca_bcol_iboffload_module_t {
/* base structure */
mca_bcol_base_module_t super;
/* size */
int group_size;
int log_group_size;
/* size of each memory segment */
size_t segment_size;
/* collective tag */
long long collective_tag;
/* pointer to device */
struct mca_bcol_iboffload_device_t *device;
/* caching port number */
uint32_t port;
/* Connecting iboffload with ibnet module information */
/* pointer to sbgp ibnet */
mca_sbgp_ibnet_module_t *ibnet;
/* connection group index for the ibnet */
int cgroup_index;
/* array of endpoints */
struct mca_bcol_iboffload_endpoint_t **endpoints;
/* Size of the endpoints array */
int num_endpoints;
/* caching port subnet id and lid
* the same information we have on device */
uint64_t subnet_id;
uint16_t lid;
/* Pointer to management queue */
struct mqe_context *mq[BCOL_IBOFFLOAD_MQ_NUM];
int mq_credit[BCOL_IBOFFLOAD_MQ_NUM];
/* pending list of collfrags */
opal_list_t collfrag_pending;
/* recursive-doubling tree node */
netpatterns_pair_exchange_node_t recursive_doubling_tree;
/* N exchange tree */
netpatterns_pair_exchange_node_t n_exchange_tree;
/* Knomial exchange tree */
netpatterns_k_exchange_node_t knomial_exchange_tree;
/* Knomial allgather tree */
netpatterns_k_exchange_node_t knomial_allgather_tree;
/* The array will keep pre-calculated task consumption per
* algorithm
*/
uint32_t alg_task_consump[LAST_ALG];
/* Pointer to a func that implements a barrier algorithm */
mca_bcol_iboffload_coll_algth_fn_t barrier_algth;
/* Pointer to a func that implements a fanin algorithm */
mca_bcol_iboffload_coll_algth_fn_t fanin_algth;
/* Pointer to a func that implements a fanout algorithm */
mca_bcol_iboffload_coll_algth_fn_t fanout_algth;
/* Pointer to a func that implements an allreduce algorithm */
mca_bcol_iboffload_coll_algth_fn_t allreduce_algth;
/* Pointer to a func that implements a non-blocking memory synchronization algorithm */
mca_bcol_iboffload_coll_algth_fn_t memsync_algth;
/* rdma block memory information */
mca_bcol_iboffload_local_rdma_block_t rdma_block;
/* The largest exponent for which 1 << power_of_2
   is not larger than the group size */
int power_of_2;
/* The largest power of two number which is not larger than the group size */
int power_of_2_ranks;
/* Connection status array */
bool connection_status[LAST_ALG];
/* map from communicator ranks to ibsubnet */
int *comm_to_ibnet_map;
/* order preserving value */
int64_t prev_sequence_num;
/* Temp iovec to send the data fragments -- alltoall Brucks */
struct iovec *alltoall_iovec;
struct iovec *alltoall_recv_iovec;
/* tree radix for the knomial bruck small data alltoall */
int k_alltoall_bruck_radix;
/* Temp buffer alignment for knomial bruck small data alltoall */
int tmp_buf_alignment;
/* Free task list with sge's array */
ompi_free_list_t iovec_tasks_free;
};
typedef struct mca_bcol_iboffload_module_t mca_bcol_iboffload_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_module_t);
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC
extern mca_bcol_iboffload_component_t mca_bcol_iboffload_component;
static inline int mca_bcol_iboffload_err(const char* fmt, ...)
{
va_list list;
int ret;
va_start(list, fmt);
ret = vfprintf(stderr, fmt, list);
va_end(list);
return ret;
}
#define MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(ompi_op, c_type, l_operand, r_operand, result) \
do { \
switch (ompi_op) { \
case OMPI_OP_MAX: \
*((c_type *)&result) = ((*(c_type *)&(l_operand) > *(c_type *)&(r_operand)) ? \
*(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \
break; \
case OMPI_OP_MIN: \
*((c_type *)&result) = ((*(c_type *)&(l_operand) < *(c_type *)&(r_operand)) ? \
*(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \
break; \
case OMPI_OP_SUM: \
*((c_type *)&result) = (*((c_type *)&(l_operand)) + *((c_type *)&(r_operand))); \
break; \
default: \
break; \
} \
} while (0)
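/*
 * Illustrative sketch, not part of the original header: how the DO_CALC
 * macro above reduces one pair of operands on the host.  The helper below
 * is hypothetical; it applies OMPI_OP_SUM to two doubles and would return
 * 8.0 for inputs 3.0 and 5.0.
 */
static inline double
example_allreduce_host_calc_sum(double l_operand, double r_operand)
{
    double result = 0.0;

    /* expands to: *(double *)&result = *(double *)&l_operand + *(double *)&r_operand */
    MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(OMPI_OP_SUM, double,
                                         l_operand, r_operand, result);
    return result;
}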
#define MCA_BCOL_IBOFFLOAD_PKEY_MASK 0x7fff
#define MCA_BCOL_IBOFFLOAD_DEFAULT_GID_PREFIX 0xfe80000000000000ll
#define IBOFFLOAD_ERROR(args) \
do { \
mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_iboffload_err args; \
mca_bcol_iboffload_err("\n"); \
} while(0)
#if OPAL_ENABLE_DEBUG
#define IBOFFLOAD_VERBOSE(level, args) \
do { \
if (mca_bcol_iboffload_component.verbose >= level) { \
mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_iboffload_err args; \
mca_bcol_iboffload_err("\n"); \
} \
} while(0)
#else
#define IBOFFLOAD_VERBOSE(level, args)
#endif
#define MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(coll_req, coll_work_req) \
do { \
opal_list_append(&(coll_req)->work_requests, \
(opal_list_item_t*) (coll_work_req)); \
(coll_work_req)->coll_full_req = (coll_req); \
} while(0)
/* Vasily: will be removed soon */
#define APPEND_TO_TASKLIST(task_ptr_to_set, event, last_event_type) \
do { \
*task_ptr_to_set = &(event)->element; \
last_event_type = &(event)->element; \
task_ptr_to_set = &((event)->element.next); \
} while(0)
#define MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(task_ptr_to_set, task) \
do { \
*task_ptr_to_set = (task); \
task_ptr_to_set = &((task)->next_task); \
} while(0)
#define MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(task_ptr_to_set, event) \
do { \
*task_ptr_to_set = &(event)->element; \
task_ptr_to_set = &((event)->element.next); \
} while(0)
#define BCOL_IS_COMPLETED(req) (((req)->n_frag_mpi_complete == (req)->n_fragments) && \
((req)->n_fragments > 0))
#define BCOL_AND_NET_ARE_COMPLETED(req) (BCOL_IS_COMPLETED(req) && \
((req)->n_frag_net_complete == (req)->n_fragments))
/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(module, mq_index, num_of_credits) \
(((module)->mq_credit[mq_index] -= (num_of_credits)) < 0 ? false : true)
/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(module, mq_index, num_of_credits) \
((module)->mq_credit[mq_index] += (num_of_credits))
#define BCOL_IBOFFLOAD_IS_FIRST_CALL(args) (0 == (args)->index_in_consecutive_same_bcol_calls)
#define BCOL_IBOFFLOAD_IS_LAST_CALL(args) (((args)->n_of_this_type_in_collective - 1) == \
(args)->index_of_this_type_in_collective)
#define BCOL_IBOFFLOAD_READY_TO_POST(args) (((args)->n_of_this_type_in_a_row - 1) == \
(args)->index_in_consecutive_same_bcol_calls)
/*
* bcol module functions
*/
int mca_bcol_iboffload_rec_doubling_start_connections(struct mca_bcol_iboffload_module_t *iboffload);
/* RDMA addr exchange with rem proc */
int mca_bcol_iboffload_exchange_rem_addr(struct mca_bcol_iboffload_endpoint_t *ep);
/* Progress function */
int mca_bcol_iboffload_component_progress(void);
/* Register memory */
int mca_bcol_iboffload_register_mr(void *reg_data, void * base, size_t size,
mca_mpool_base_registration_t *reg);
/* Deregister memory */
int mca_bcol_iboffload_deregister_mr(void *reg_data, mca_mpool_base_registration_t *reg);
/*
 * The function is used to create a CQ in this module.
*/
int mca_bcol_iboffload_adjust_cq(struct mca_bcol_iboffload_device_t *device,
struct ibv_cq **ib_cq);
/*
* Query to see if the component is available for use,
* and can satisfy the thread and progress requirements
*/
int mca_bcol_iboffload_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
/* Interface to setup the allgather tree */
int mca_bcol_iboffload_setup_knomial_tree(mca_bcol_base_module_t *super);
/*
* Query to see if the module is available for use on
 * the given communicator, and if so, what its priority is.
*/
mca_bcol_base_module_t **
mca_bcol_iboffload_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);
int
mca_bcol_iboffload_free_tasks_frags_resources(
struct mca_bcol_iboffload_collfrag_t *collfrag,
ompi_free_list_t *frags_free);
/**
 * Small-message broadcast
*/
int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t
*const_args);
int mca_bcol_iboffload_barrier_intra_recursive_doubling_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_barrier_intra_recursive_knomial_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_barrier_intra_recursive_doubling(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_nb_memory_service_barrier_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_new_style_fanin_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_new_style_fanout_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_coll_support_all_types(bcol_coll coll_name);
int mca_bcol_iboffload_coll_supported(int op, int dtype, bcol_elem_type elem_type);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_fls(int num)
{
int i = 1;
int j = 0;
if (0 == num) {
return 0;
}
while (i < num) {
i <<= 1;
j++;
}
if (i > num) {
j--;
}
return j;
}
#define BCOL_IBOFFLOAD_IS_EVEN(num) (!((num) & 1))
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_ffs(int num)
{
int j = 0;
if (0 == num) {
return 0;
}
while (BCOL_IBOFFLOAD_IS_EVEN(num)) {
num >>= 1;
j++;
}
return j;
}
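/*
 * Illustrative examples, not part of the original header: the two helpers
 * above compute floor(log2(num)) and the index of the lowest set bit
 * respectively, both mapping 0 to 0.  The checks below are hypothetical.
 */
static inline void
example_bit_helpers(void)
{
    assert(3 == mca_bcol_iboffload_fls(8));    /* 2^3 == 8            */
    assert(2 == mca_bcol_iboffload_fls(5));    /* 2^2 <= 5 < 2^3      */
    assert(0 == mca_bcol_iboffload_fls(0));    /* special case        */

    assert(2 == mca_bcol_iboffload_ffs(12));   /* 12 == 0b1100        */
    assert(0 == mca_bcol_iboffload_ffs(5));    /* lowest bit already set */
}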
#if OPAL_ENABLE_DEBUG
/* Post task list MQ */
#define IS_IMM(a) (a & MQE_WR_FLAG_IMM_EXE)
#define IS_SIG(a) (a & MQE_WR_FLAG_SIGNAL)
#define IS_BLK(a) (a & MQE_WR_FLAG_BLOCK)
int task_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);
int wait_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);
#endif
/* MQ posting function */
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_mqe_tasks(
mca_bcol_iboffload_module_t *iboffload,
struct mqe_task *head_mqe)
{
int rc;
struct mqe_task *bad_mqe = NULL;
#if OPAL_ENABLE_DEBUG /* debug code */
struct mqe_task *curr_mqe_task = NULL;
int send_count = 0, recv_count = 0, wait_count = 0;
curr_mqe_task = head_mqe;
IBOFFLOAD_VERBOSE(10, ("Processing MQE Head with addr %p <START>\n",
(uintptr_t) (void*) curr_mqe_task));
while (NULL != curr_mqe_task) {
switch(curr_mqe_task->opcode) {
case MQE_WR_SEND:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: send on QP 0x%x\n"
"rank %d, sg_entry: addr %p LEN %d lkey %u, flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
curr_mqe_task->post.qp->qp_num,
task_to_rank(iboffload, curr_mqe_task),
curr_mqe_task->post.send_wr->sg_list->addr,
curr_mqe_task->post.send_wr->sg_list->length,
curr_mqe_task->post.send_wr->sg_list->lkey,
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
++send_count;
break;
case MQE_WR_RECV:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: recv on QP 0x%x rank %d flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
curr_mqe_task->post.qp->qp_num, task_to_rank(iboffload, curr_mqe_task),
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
++recv_count;
break;
case MQE_WR_CQE_WAIT:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id %x: wait on CQ %p for rank %d num of waits %d flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
(void*) curr_mqe_task->wait.cq, wait_to_rank(iboffload, curr_mqe_task),
curr_mqe_task->wait.count,
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
wait_count += curr_mqe_task->wait.count;
break;
default:
IBOFFLOAD_ERROR(("Fatal error, unknow packet type %d\n",
curr_mqe_task->opcode));
return OMPI_ERROR;
}
/* pointer to next task */
curr_mqe_task = curr_mqe_task->next;
}
IBOFFLOAD_VERBOSE(10, ("wait[%d] send[%d] recv[%d]\n",
wait_count, send_count, recv_count));
#endif
IBOFFLOAD_VERBOSE(10, ("Posting MQ %p <DONE>\n", (uintptr_t) head_mqe->wr_id));
rc = mqe_post_task(iboffload->mq[0], head_mqe, &bad_mqe);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_ERROR(("ibv_post_mqe failed, errno says: %s,"
" the return code is [%d]\n",
strerror(errno), rc));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int lognum(int n) {
int count = 1, lognum = 0;
while (count < n) {
count = count << 1;
lognum++;
}
return lognum;
}
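/*
 * Illustrative example, not part of the original file: lognum() above is
 * ceil(log2(n)) for n >= 1, i.e. the number of doubling rounds needed to
 * cover n ranks.  The checks below are hypothetical.
 */
static inline void
example_lognum(void)
{
    assert(0 == lognum(1));   /* a single rank needs no rounds  */
    assert(3 == lognum(5));   /* 5 ranks need 3 doubling rounds */
    assert(3 == lognum(8));   /* exact power of two             */
}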
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_H */

Diff not shown because of its large size.

Diff not shown because of its large size.

@ -1,934 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
static int mca_bcol_iboffload_barrier_init(
bcol_function_args_t *input_args,
mca_bcol_iboffload_module_t *iboffload,
collective_message_completion_callback_function cb_fn,
struct mca_bcol_iboffload_collreq_t **coll_request);
/**
* Start barrier
*/
int mca_bcol_iboffload_barrier_intra_recursive_doubling(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
/* local variables */
mca_bcol_iboffload_task_t *send_task = NULL,
*wait_task = NULL;
struct mqe_task **mqe_ptr_to_set = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;
struct mqe_task *last_wait = NULL, /* we need to ask for completion on the last wait */
*last_send = NULL; /* if there is no wait, we ask for completion on the last send */
int rc, exchange, extra_rank, pair_rank;
mca_bcol_iboffload_frag_t *send_fragment = NULL,
*preposted_recv_frag = NULL;
netpatterns_pair_exchange_node_t *my_exchange_node =
&iboffload->recursive_doubling_tree;
IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_intra_recursive_doubling.\n"));
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
/* Set mq credits */
coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG];
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
coll_fragment->alg = RECURSIVE_DOUBLING_BARRIER_ALG;
/*
* NOTE: need to generate template, if this will be a multiple fragment
* message. This way we can progress the collective w/o knowing its
* type - actually, this is not the case for barrier, but just a note
* to remind us that we need to generalize this.
*/
mqe_ptr_to_set = &coll_fragment->to_post;
/*
* Fill in the communication pattern
*/
/*
* If non power of 2, may need to wait for message from "extra" proc.
*/
if (0 < my_exchange_node->n_extra_sources) {
if (EXCHANGE_NODE == my_exchange_node->node_type) {
/* I will participate in the exchange (of the algorithm) -
* wait for signal from extra process */
extra_rank = my_exchange_node->rank_extra_source;
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, extra_rank, coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload,
extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
} else {
/* I will not participate in the exchange - so just "register" as here */
extra_rank = my_exchange_node->rank_extra_source;
/* send - no need to send any data, in-order delivery */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
extra_rank, coll_request->qp_index, 0,
0, SBUF,MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank,
coll_request->qp_index, send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
}
/* loop over exchange send/recv pairs */
for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) {
/* rank of exchange partner */
pair_rank = my_exchange_node->rank_exchanges[exchange];
/* post send */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
pair_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
assert(NULL != send_fragment);
send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank,
coll_request->qp_index,
send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
/* post wait */
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, pair_rank, coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1,
preposted_recv_frag,
coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
}
/* if non power of 2, may need to send message to "extra" proc */
if (0 < my_exchange_node->n_extra_sources) {
if (EXTRA_NODE == my_exchange_node->node_type) {
/* I will not participate in the exchange -
* wait for signal from exchange process */
extra_rank = my_exchange_node->rank_extra_source;
/* post wait */
preposted_recv_frag =
mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank,
coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1,
preposted_recv_frag,
coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
} else {
/* I will participate in the exchange -
* send signal to extra process */
extra_rank = my_exchange_node->rank_extra_source;
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
extra_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
send_task = mca_bcol_iboffload_get_send_task(
iboffload, extra_rank,
coll_request->qp_index,
send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
}
/* Fill in the rest of the coll_fragment */
IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n"));
/* end of list */
*mqe_ptr_to_set = NULL;
/* finish initializing full message descriptor */
coll_request->n_fragments = 1;
coll_request->n_frags_sent = 1;
coll_request->n_frag_mpi_complete = 0;
coll_request->n_frag_net_complete = 0;
coll_request->user_handle_freed = false;
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) {
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
} else {
/* Special flow for the ML service barrier; only this function is supposed to
   post service requests */
struct mqe_task *bad_mqe = NULL;
assert (MCA_BCOL_IBOFFLOAD_QP_SYNC == coll_request->qp_index );
/* Post to special service MQ - 1 */
rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_ERROR(("ibv_post_mqe failed on device (%s), errno says: %s,"
" the return code is [%d]\n",
ibv_get_device_name(iboffload->device->dev.ib_dev),
strerror(errno), rc));
return OMPI_ERROR;
}
}
IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
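/*
 * Illustrative sketch, not part of the original file: the communication
 * pattern the function above encodes as an offloaded task list, rewritten
 * with hypothetical blocking send_signal()/wait_signal() callbacks so the
 * recursive-doubling structure is easier to see.  Only
 * netpatterns_pair_exchange_node_t and its fields come from the real code.
 */
static inline void
example_recursive_doubling_barrier(netpatterns_pair_exchange_node_t *exchange_node,
                                   void (*send_signal)(int peer),
                                   void (*wait_signal)(int peer))
{
    int exchange;

    /* ranks beyond the largest power of two check in with their proxy first */
    if (0 < exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == exchange_node->node_type) {
            wait_signal(exchange_node->rank_extra_source);
        } else {
            send_signal(exchange_node->rank_extra_source);
        }
    }

    /* log2(P) pairwise exchanges; after round i every rank has heard,
     * directly or indirectly, from a group of size 2^(i+1) */
    for (exchange = 0; exchange < exchange_node->n_exchanges; ++exchange) {
        int pair_rank = exchange_node->rank_exchanges[exchange];

        send_signal(pair_rank);
        wait_signal(pair_rank);
    }

    /* finally the proxies release the extra ranks */
    if (0 < exchange_node->n_extra_sources) {
        if (EXTRA_NODE == exchange_node->node_type) {
            wait_signal(exchange_node->rank_extra_source);
        } else {
            send_signal(exchange_node->rank_extra_source);
        }
    }
}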
int mca_bcol_iboffload_barrier_intra_recursive_doubling_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc;
rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
iboffload->barrier_algth =
mca_bcol_iboffload_barrier_intra_recursive_doubling;
return
mca_bcol_iboffload_barrier_intra_recursive_doubling(iboffload, coll_request);
}
int mca_bcol_iboffload_nb_memory_service_barrier_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc;
rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
iboffload->memsync_algth =
mca_bcol_iboffload_barrier_intra_recursive_doubling;
return
mca_bcol_iboffload_barrier_intra_recursive_doubling
(iboffload, coll_request);
}
int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
int rc;
mca_bcol_iboffload_collreq_t *coll_request;
mca_bcol_iboffload_module_t *iboffload =
(mca_bcol_iboffload_module_t *) const_args->bcol_module;
/*
* recursive doubling
*/
IBOFFLOAD_VERBOSE(10, ("Memory syncranization barrier was started\n"));
/* init barrier collective request */
rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("Get error from mca_bcol_iboffload_barrier_init"));
return rc;
}
/* set the qp index to special qp that is used only for synchronization */
coll_request->qp_index = MCA_BCOL_IBOFFLOAD_QP_SYNC;
/* overwrite mq index to run over service setup */
coll_request->first_collfrag.mq_index = SERVICE_MQ;
/* start the barrier */
rc = iboffload->memsync_algth(iboffload, coll_request);
if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
return rc;
}
/* complete the barrier - progress releases full request descriptors */
IBOFFLOAD_VERBOSE(10, ("Memory syncranization barrier was started\n"));
/* done */
return BCOL_FN_STARTED;
}
/* Recursive K-ing */
static int recursive_knomial_start_connections(struct mca_bcol_iboffload_module_t *iboffload)
{
netpatterns_k_exchange_node_t *my_exchange_node =
&iboffload->knomial_exchange_tree;
int k, i, n_exchanges = my_exchange_node->n_exchanges,
**exchanges = my_exchange_node->rank_exchanges,
n_extra_src = my_exchange_node->n_extra_sources,
tree_order = my_exchange_node->tree_order - 1,
rank_extra_src;
mca_bcol_iboffload_endpoint_t *ep;
iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 0;
IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, "
"num of endpoints = %d, iboffload module - %p"
" extra n %d, n_exchanges %d",
iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload,
n_extra_src, n_exchanges));
if (0 < n_extra_src) {
for (k = 0; k < n_extra_src; k++) {
iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* One send task one wait */
rank_extra_src = my_exchange_node->rank_extra_sources_array[k];
ep = iboffload->endpoints[rank_extra_src];
if (iboffload->ibnet->super.my_index < ep->index) {
while(0 == (ep)->remote_zero_rdma_addr.addr) {
opal_progress();
}
} else {
IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
}
}
for (i = 0; i < n_exchanges; ++i) {
for (k = 0; k < tree_order; k++) {
iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* One send task one wait */
ep = iboffload->endpoints[exchanges[i][k]];
IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
if (iboffload->ibnet->super.my_index < ep->index) {
while(0 == (ep)->remote_zero_rdma_addr.addr) {
opal_progress();
}
} else {
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
}
}
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_barrier_intra_recursive_knomial(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
/* local variables */
mca_bcol_iboffload_task_t *send_task = NULL,
*wait_task = NULL;
struct mqe_task **mqe_ptr_to_set = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;
struct mqe_task *last_wait = NULL, /* we need to ask for completion on the last wait */
*last_send = NULL; /* if there is no wait, we ask for completion on the last send */
int rc, exchange, extra_rank, pair_rank, k;
mca_bcol_iboffload_frag_t *send_fragment = NULL,
*preposted_recv_frag = NULL;
netpatterns_k_exchange_node_t *my_exchange_node =
&iboffload->knomial_exchange_tree;
IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_intra_recursive_knomial. Node type %d\n", my_exchange_node->node_type));
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
/* Set mq credits */
coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG];
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
coll_fragment->alg = RECURSIVE_KNOMIAL_BARRIER_ALG;
/*
* NOTE: need to generate template, if this will be a multiple fragment
* message. This way we can progress the collective w/o knowing its
* type - actually, this is not the case for barrier, but just a note
* to remind us that we need to generalize this.
*/
mqe_ptr_to_set = &coll_fragment->to_post;
/*
* Fill in the communication pattern
*/
/*
* If non power of 2, may need to wait for message from "extra" proc.
*/
if (0 < my_exchange_node->n_extra_sources) {
if (EXCHANGE_NODE == my_exchange_node->node_type) {
/* I will participate in the exchange (of the algorithm) -
* wait for signal from extra process */
for (k = 0; k < my_exchange_node->n_extra_sources; k++) {
extra_rank = my_exchange_node->rank_extra_sources_array[k];
IBOFFLOAD_VERBOSE(10,("Exchange [ %d ] extra get %d", k, extra_rank));
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, extra_rank, coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload,
extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
}
} else {
/* I will not participate in the exchange - so just "register" as here */
extra_rank = my_exchange_node->rank_extra_sources_array[0];
IBOFFLOAD_VERBOSE(10,("Send to proxy %d", extra_rank));
/* send - no need to send any data, in-order delivery */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
extra_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank,
coll_request->qp_index, send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
}
/* loop over exchange send/recv pairs */
for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) {
for (k = 0; k < my_exchange_node->tree_order - 1; k++) {
/* rank of exchange partner */
pair_rank = my_exchange_node->rank_exchanges[exchange][k];
IBOFFLOAD_VERBOSE(10,("Exchange [ %d ,%d ] send to %d", exchange, k, pair_rank));
/* post send */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
pair_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank,
coll_request->qp_index,
send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
for (k = 0; k < my_exchange_node->tree_order - 1; k++) {
pair_rank = my_exchange_node->rank_exchanges[exchange][k];
IBOFFLOAD_VERBOSE(10,("Exchange [ %d ,%d ] recv %d", exchange, k, pair_rank));
/* post wait */
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, pair_rank, coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1,
preposted_recv_frag, coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Exchaging: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
}
}
/* if non power of 2, may need to send message to "extra" proc */
if (0 < my_exchange_node->n_extra_sources) {
if (EXTRA_NODE == my_exchange_node->node_type) {
/* I will not participate in the exchange -
* wait for signal from exchange process */
extra_rank = my_exchange_node->rank_extra_sources_array[0];
IBOFFLOAD_VERBOSE(10,("Wait from proxy %d", extra_rank));
/* post wait */
preposted_recv_frag =
mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank,
coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1,
preposted_recv_frag,
coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == wait_task)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
} else {
/* I will participate in the exchange -
* send signal to extra process */
for (k = 0; k < my_exchange_node->n_extra_sources; k++) {
extra_rank = my_exchange_node->rank_extra_sources_array[k];
IBOFFLOAD_VERBOSE(10,("Exchange [ %d ] extra release %d", k, extra_rank));
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
extra_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
send_task = mca_bcol_iboffload_get_send_task(
iboffload, extra_rank,
coll_request->qp_index,
send_fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == send_task)) {
IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
"Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
}
}
/* Fill in the rest of the coll_fragment */
IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n"));
/* end of list */
*mqe_ptr_to_set = NULL;
/* finish initializing full message descriptor */
coll_request->n_fragments = 1;
coll_request->n_frags_sent = 1;
coll_request->n_frag_mpi_complete = 0;
coll_request->n_frag_net_complete = 0;
coll_request->user_handle_freed = false;
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) {
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
} else {
/* Special flow for the ML service barrier; only this function is supposed to
   post service requests */
struct mqe_task *bad_mqe = NULL;
assert (MCA_BCOL_IBOFFLOAD_QP_SYNC == coll_request->qp_index );
/* Post to special service MQ - 1 */
rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_ERROR(("ibv_post_mqe failed on device (%s), errno says: %s,"
" the return code is [%d]\n",
ibv_get_device_name(iboffload->device->dev.ib_dev),
strerror(errno), rc));
return OMPI_ERROR;
}
}
IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
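/*
 * Illustrative sketch, not part of the original file: the exchange rounds
 * of the k-nomial barrier above.  The only structural difference from the
 * recursive-doubling sketch earlier is that each round talks to
 * (tree_order - 1) partners, so roughly log_k(P) rounds are needed.
 * send_signal()/wait_signal() are hypothetical callbacks.
 */
static inline void
example_knomial_exchange_rounds(netpatterns_k_exchange_node_t *exchange_node,
                                void (*send_signal)(int peer),
                                void (*wait_signal)(int peer))
{
    int exchange, k;

    for (exchange = 0; exchange < exchange_node->n_exchanges; ++exchange) {
        /* post all sends of this round, then wait for all partners */
        for (k = 0; k < exchange_node->tree_order - 1; k++) {
            send_signal(exchange_node->rank_exchanges[exchange][k]);
        }
        for (k = 0; k < exchange_node->tree_order - 1; k++) {
            wait_signal(exchange_node->rank_exchanges[exchange][k]);
        }
    }
}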
int mca_bcol_iboffload_barrier_intra_recursive_knomial_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc;
rc = recursive_knomial_start_connections(iboffload);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
iboffload->barrier_algth =
mca_bcol_iboffload_barrier_intra_recursive_knomial;
return
mca_bcol_iboffload_barrier_intra_recursive_knomial(iboffload, coll_request);
}
int mca_bcol_iboffload_rec_doubling_start_connections(mca_bcol_iboffload_module_t *iboffload)
{
netpatterns_pair_exchange_node_t *my_exchange_node =
&iboffload->recursive_doubling_tree;
int i, n_exchanges = my_exchange_node->n_exchanges,
*exchanges = my_exchange_node->rank_exchanges,
n_extra_src = my_exchange_node->n_extra_sources,
rank_extra_src = my_exchange_node->rank_extra_source;
mca_bcol_iboffload_endpoint_t *ep;
IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, "
"num of endpoints = %d, iboffload module - %p\n",
iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload));
if (0 < n_extra_src) {
iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task one wait */
ep = iboffload->endpoints[rank_extra_src];
if (iboffload->ibnet->super.my_index < ep->index) {
while(0 == (ep)->remote_zero_rdma_addr.addr) {
opal_progress();
}
} else {
IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
}
for (i = 0; i < n_exchanges; ++i) {
iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task one wait */
ep = iboffload->endpoints[exchanges[i]];
if (iboffload->ibnet->super.my_index < ep->index) {
while(0 == (ep)->remote_zero_rdma_addr.addr) {
opal_progress();
}
} else {
IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
}
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_barrier_init(
bcol_function_args_t *input_args,
mca_bcol_iboffload_module_t *iboffload,
collective_message_completion_callback_function cb_fn,
struct mca_bcol_iboffload_collreq_t **coll_request)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_collfrag_t *coll_fragment;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init"));
OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
if (OPAL_UNLIKELY(NULL == item)) {
IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n"));
return OMPI_ERR_OUT_OF_RESOURCE;
}
(*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
(*coll_request)->progress_fn = iboffload->barrier_algth;
/*
* For usual barrier it is null. For memory
* service barrier we need some work to do
*/
(*coll_request)->completion_cb_fn = cb_fn;
(*coll_request)->order_info = &input_args->order_info;
(*coll_request)->module = iboffload;
(*coll_request)->ml_buffer_index = input_args->buffer_index;
(*coll_request)->buffer_info[SBUF].offset = 0;
(*coll_request)->buffer_info[RBUF].offset = 0;
(*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;
input_args->bcol_opaque_data = (void *) (*coll_request);
/*
* setup collective work request
*/
/* get collective frag */
coll_fragment = &(*coll_request)->first_collfrag;
mca_bcol_iboffload_collfrag_init(coll_fragment);
coll_fragment->mq_index = COLL_MQ;
/* set pointers for (coll frag) <-> (coll full request) */
MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);
return OMPI_SUCCESS;
}
/************************************************************************
************************ New style Barrier *****************************
***********************************************************************/
static int mca_bcol_iboffload_new_style_barrier_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_iboffload_collreq_t *coll_request =
(mca_bcol_iboffload_collreq_t *)
input_args->bcol_opaque_data;
if (BCOL_IS_COMPLETED(coll_request)) {
coll_request->user_handle_freed = true;
if (COLLREQ_IS_DONE(coll_request)) {
IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
RELEASE_COLLREQ(coll_request);
}
IBOFFLOAD_VERBOSE(10, ("Barrier already done.\n"));
return BCOL_FN_COMPLETE;
}
return BCOL_FN_STARTED;
}
static int mca_bcol_iboffload_new_style_barrier_intra(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
int rc;
mca_bcol_iboffload_collreq_t *coll_request;
mca_bcol_iboffload_module_t *iboffload =
(mca_bcol_iboffload_module_t *) const_args->bcol_module;
/* check for ordering */
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);
/*
* recursive doubling
*/
IBOFFLOAD_VERBOSE(10, ("Barrier starts.\n"));
/* init barrier collective request */
rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("Get error from mca_bcol_iboffload_barrier_init"));
return rc;
}
/* start the barrier */
rc = iboffload->barrier_algth(iboffload, coll_request);
if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
return BCOL_FN_NOT_STARTED;
}
/* done */
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
IBOFFLOAD_VERBOSE(10, ("Register iboffload Barrier.\n"));
comm_attribs.bcoll_type = BCOL_BARRIER;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_new_style_barrier_intra,
mca_bcol_iboffload_new_style_barrier_progress);
return OMPI_SUCCESS;
}
int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
IBOFFLOAD_VERBOSE(10, ("Register sync function\n"));
comm_attribs.bcoll_type = BCOL_SYNC;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_nb_memory_service_barrier_intra,
mca_bcol_iboffload_new_style_barrier_progress);
return OMPI_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,606 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_BCAST_H
#define MCA_BCOL_IBOFFLOAD_BCAST_H
#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
#include "opal/include/opal/types.h"
BEGIN_C_DECLS
int mca_bcol_iboffload_small_msg_bcast_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_rtr_setup(
struct mqe_task **last_wait,
uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
/* Wait for RTR message over credit QP */
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, dest_rank,
MCA_BCOL_IBOFFLOAD_QP_CREDIT);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_wait_task(
iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_CREDIT,
iboffload->endpoints[dest_rank]->qps[MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF].qp->lcl_qp);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_small_buff_setup(
struct mqe_task **last_send,
size_t len, uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
mca_bcol_iboffload_collreq_t *coll_request =
coll_fragment->coll_full_req;
IBOFFLOAD_VERBOSE(10,("Get ml frag that I will send dest rank %d, len %d, lkey %d",
dest_rank, len, iboffload->rdma_block.ib_info.lkey));
fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
coll_request->qp_index, len, 0,
SBUF, /* this could be problematic */
MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
IBOFFLOAD_VERBOSE(10,("Get an rdma task for dest %d for packet size %d",
dest_rank,len));
task = mca_bcol_iboffload_get_rdma_task(
dest_rank, 0,
fragment, iboffload, coll_fragment);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
*last_send = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_large_buff_setup(
struct mqe_task **last_send,
int buf_index, int offset,
size_t len, uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
mca_bcol_iboffload_collreq_t *coll_request =
coll_fragment->coll_full_req;
fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
len,
offset, buf_index, MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_send_task(
iboffload, dest_rank,
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
fragment, coll_fragment, NO_INLINE);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
*last_send = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_rtr_setup(
struct mqe_task **last_send,
uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
/* Recv is ready , Send RTR message */
fragment = mca_bcol_iboffload_get_send_frag(coll_fragment->coll_full_req,
dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, 0,
0, RBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_send_task(iboffload, dest_rank,
MCA_BCOL_IBOFFLOAD_QP_CREDIT,
fragment, coll_fragment, INLINE);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
IBOFFLOAD_VERBOSE(10, ("dest_rank - %d. qp index - %d.\n",
dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT));
*last_send = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_small_preposted_buff_setup(
struct mqe_task **last_wait,
size_t len, uint32_t dest_rank,
int qp_index,
int nwaits,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
IBOFFLOAD_VERBOSE(10,("Get preposted recv from rank %d", dest_rank));
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, dest_rank,
qp_index);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, nwaits,
fragment, qp_index, NULL);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
*last_wait = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_small_buff_setup(
struct mqe_task **last_wait,
size_t len, uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
mca_bcol_iboffload_collreq_t *coll_request =
coll_fragment->coll_full_req;
IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank));
fragment = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, dest_rank,
coll_request->qp_index);
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
fragment, coll_request->qp_index, NULL);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
*last_wait = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_large_buff_setup(
struct mqe_task **last_wait,
int buf_index, int offset,
size_t len, uint32_t dest_rank,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *coll_fragment)
{
int num_preposted;
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_frag_t *fragment;
mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req;
/* Post message to recv queue for large messages */
fragment = mca_bcol_iboffload_get_ml_frag(
iboffload, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len,
coll_request->buffer_info[buf_index].iboffload_reg->mr->lkey,
(uint64_t)((unsigned char *)coll_request->buffer_info[buf_index].buf + offset));
if (OPAL_UNLIKELY(NULL == fragment)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
num_preposted = mca_bcol_iboffload_prepost_ml_recv_frag(
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
dest_rank, fragment, iboffload);
if (0 >= num_preposted) {
IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
"return code - %d; dest_rank - %d",
num_preposted, dest_rank));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
fragment, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, NULL);
if (OPAL_UNLIKELY(NULL == task)) {
IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
*last_wait = &task->element;
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int bcol_iboffload_binomial_root_to_src(int group_root, int my_rank,
int pow2_size, int group_size, int *distance)
{
int root, relative_rank, src,
pow2_distance = 0, i;
if (group_root < pow2_size) {
root = group_root;
} else {
/* the source of the data is an extra node;
the real root is represented by some rank from
the pow2 group */
root = group_root - pow2_size;
/* shortcut for the case when my rank is root for the group */
if (my_rank == root) {
*distance = -1;
return group_root;
}
}
relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size :
my_rank - root;
for (i = 1; i < pow2_size; i<<=1, pow2_distance++) {
if (relative_rank & i) {
src = my_rank ^ i;
if (src >= pow2_size)
src -= pow2_size;
*distance = pow2_distance;
IBOFFLOAD_VERBOSE(10, ("AAAAA d %d rel %d it %d root %d my %d", *distance, relative_rank, i, root, my_rank));
return src;
}
}
/* error case */
*distance = -1;
return -1;
}
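/*
* Worked example (illustrative note added for clarity, not part of the
* original code): with pow2_size = 8, group_root = 0 and my_rank = 5
* (binary 101), relative_rank is 5. The loop above stops at the lowest
* set bit, i = 1 (pow2_distance = 0), so src = 5 ^ 1 = 4 and *distance
* is set to 0: rank 5 receives its data from rank 4 in the first
* exchange step of the recursive-doubling tree.
*/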
static inline void bcol_iboffload_setup_binomial_connection(mca_bcol_iboffload_module_t *iboffload)
{
netpatterns_pair_exchange_node_t *my_exchange_node =
&iboffload->recursive_doubling_tree;
int i, n_exchanges = my_exchange_node->n_exchanges,
*exchanges = my_exchange_node->rank_exchanges,
n_extra_src = my_exchange_node->n_extra_sources,
my_rank = iboffload->ibnet->super.my_index,
rank_extra_src = my_exchange_node->rank_extra_source;
mca_bcol_iboffload_endpoint_t *ep;
IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));
if (0 < n_extra_src) {
ep = iboffload->endpoints[rank_extra_src];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
#if OPAL_ENABLE_DEBUG
{
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
}
}
#endif
/* Connect to all extra nodes */
if (EXTRA_NODE == my_exchange_node->node_type) {
for (i = iboffload->power_of_2_ranks;
i < iboffload->num_endpoints; ++i) {
if (i != my_rank) {
ep = iboffload->endpoints[i];
IBOFFLOAD_VERBOSE(10, ("subgroup rank %d: Connect to rank %d.\n", my_rank, i));
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
#if OPAL_ENABLE_DEBUG
{
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
}
}
#endif
}
}
}
}
for (i = 0; i < n_exchanges; ++i) {
ep = iboffload->endpoints[exchanges[i]];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
#if OPAL_ENABLE_DEBUG
{
int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
assert(NULL != ep->qps[qp_index].qp->lcl_qp);
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
}
}
#endif
}
/* set the connection status to connected */
iboffload->connection_status[RECURSIVE_DOUBLING_TREE_BCAST] = true;
}
static inline __opal_attribute_always_inline__
int bcol_iboffload_bcast_binomial_gather(mca_bcol_iboffload_module_t *iboffload_module,
struct mqe_task **last_send, struct mqe_task **last_wait,
mca_bcol_iboffload_collfrag_t *coll_fragment,
int count, int base_block_size, int radix_mask_pow)
{
int rc;
int i;
int my_group_index = iboffload_module->ibnet->super.my_index;
int delta, rdelta;
IBOFFLOAD_VERBOSE(10, ("bcol_iboffload_bcast_binomial_gather %d %d",
radix_mask_pow, my_group_index));
/* we assume that iteration #iteration was already completed by the probe */
for (i = 0; i < iboffload_module->power_of_2; i++) {
int pow2 = 1 << i;
int peer_index = my_group_index ^ pow2;
int slen, rlen,
send_offset,
recv_offset;
if (i > radix_mask_pow) {
slen = rlen = pow2 * base_block_size;
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
rdelta = count - recv_offset;
if (rdelta > 0) {
IBOFFLOAD_VERBOSE(10, ("Recv1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
pow2, 1 << iboffload_module->power_of_2,
recv_offset, rlen, peer_index));
rc = mca_bcol_iboffload_send_rtr_setup(last_send,
peer_index, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
return OMPI_ERROR;
}
}
delta = count - send_offset;
if (delta > 0) {
if (delta < slen) {
/* send only the tail */
slen = delta;
}
IBOFFLOAD_VERBOSE(10, ("Send1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
pow2, 1 << iboffload_module->power_of_2,
send_offset, slen, peer_index));
rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
return OMPI_ERROR;
}
}
if (rdelta > 0) {
if (rdelta < rlen) {
/* recv the tail */
rlen = rdelta;
}
rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
SBUF, recv_offset, rlen, peer_index,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
return OMPI_ERROR;
}
}
} else if (i == radix_mask_pow) {
/* only receive data */
rlen = pow2 * base_block_size;
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
delta = count - recv_offset;
if (0 >= delta) {
/* we have nothing to receive, skip the iteration */
continue;
}
if (delta < rlen) {
/* recv the tail */
rlen = delta;
}
/* receive data from the peer */
IBOFFLOAD_VERBOSE(10, ("Recv2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
pow2,
1 << iboffload_module->power_of_2,
recv_offset,
rlen, peer_index));
rc = mca_bcol_iboffload_send_rtr_setup(last_send,
peer_index, iboffload_module,
coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
SBUF, recv_offset, rlen, peer_index,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
return OMPI_ERROR;
}
} else if (i < radix_mask_pow) {
/* Only send data */
slen = pow2 * base_block_size;
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
delta = count - send_offset;
if (0 >= delta) {
/* we have nothing to send, skip the iteration */
continue;
}
if (delta < slen) {
slen = delta;
}
IBOFFLOAD_VERBOSE(10, ("Send2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
pow2,
1 << iboffload_module->power_of_2,
send_offset,
slen,
peer_index));
rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
return OMPI_ERROR;
}
rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
iboffload_module, coll_fragment);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
return OMPI_ERROR;
}
}
}
return OMPI_SUCCESS;
}
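/*
* Illustrative note (added for clarity, not part of the original code):
* the offsets above always align the exchanged region down to a multiple
* of 2^i base blocks. For example, with base_block_size = 1024 and
* my_group_index = 5, step i = 2 uses
* send_offset = 1024 * (5 & ~3) = 4096 and slen = 4 * 1024, i.e. the
* four-block chunk this rank has accumulated during the previous steps of
* the recursive-doubling gather, truncated by the delta checks whenever
* it would run past count.
*/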
END_C_DECLS
#endif

Просмотреть файл

@ -1,51 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
static void
collfrag_constructor(struct mca_bcol_iboffload_collfrag_t *collfrag)
{
collfrag->n_sends = 0;
collfrag->n_sends_completed = 0;
memset(collfrag->pre_posted_recvs, 0,
sizeof(struct mca_bcol_iboffload_task_t *) * MAX_MQE_TASKS);
collfrag->signal_task_wr_id = (uint64_t) 0;
collfrag->complete = false;
collfrag->seq_n = -1;
collfrag->coll_full_req = NULL;
collfrag->unpack_size = 0;
collfrag->tasks_posted = 0;
collfrag->to_post = NULL;
collfrag->task_next = NULL;
collfrag->tasks_to_release = NULL;
collfrag->in_pending_list = false;
}
static void
collfrag_destruct(struct mca_bcol_iboffload_collfrag_t *collfrag)
{
}
OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collfrag_t,
ompi_free_list_item_t,
collfrag_constructor,
collfrag_destruct);

Просмотреть файл

@ -1,144 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_COLLFRAG_H
#define MCA_BCOL_IBOFFLOAD_COLLFRAG_H
#include "ompi_config.h"
#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include "bcol_iboffload.h"
#include "opal/class/ompi_free_list.h"
BEGIN_C_DECLS
#define MAX_MQE_TASKS 128 /* Pasha - do we want to make it dynamic ?*/
struct mca_bcol_iboffload_task_t;
struct mca_bcol_iboffload_collreq_t;
/* collective fragment descriptor */
struct mca_bcol_iboffload_collfrag_t {
ompi_free_list_item_t super;
/* number of asynchronous sends scheduled */
uint32_t n_sends;
/* number of sends completed */
uint32_t n_sends_completed;
/* Algorithm ID that was used for this fragment */
int32_t alg;
/* pre-posted receive sources */
struct mca_bcol_iboffload_task_t *pre_posted_recvs[MAX_MQE_TASKS];
/* cache here pointer to signaled task */
uint64_t signal_task_wr_id;
/* mwr completion from the mcq */
volatile bool complete;
/* sequence number - we use it for
correct ordering of resources release */
uint32_t seq_n;
/* pointer to the full collective request descriptor */
struct mca_bcol_iboffload_collreq_t *coll_full_req;
size_t unpack_size;
bool in_pending_list;
/* Num of posted tasks */
int tasks_posted;
/* Pointer to head of not posted elements list */
struct mqe_task *to_post;
/* Pointer to tail next */
struct mqe_task **tail_next;
/* List of the all tasks of this coll frag */
struct mca_bcol_iboffload_task_t *tasks_to_release;
/* Pointer to the next elem in All tasks list */
struct mca_bcol_iboffload_task_t **task_next;
/* Num of needed mq credits */
int mq_credits;
/* MQ index that is used for this frag */
int mq_index;
/*
* Last wait sequence number; zero means
* there is no wait in the coll request
*/
int32_t last_wait_num;
/* fragment descriptor for non contiguous data */
bcol_fragment_descriptor_t *bcol_frag_info;
/* frag-len of ml buffer */
int frag_len;
};
typedef struct mca_bcol_iboffload_collfrag_t mca_bcol_iboffload_collfrag_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collfrag_t);
static inline __opal_attribute_always_inline__
void mca_bcol_iboffload_collfrag_init(
mca_bcol_iboffload_collfrag_t *cf)
{
/* init the request */
cf->n_sends = 0;
cf->complete = false;
cf->n_sends_completed = 0;
cf->alg = -1;
cf->in_pending_list = false;
cf->tail_next = NULL;
cf->tasks_posted = 0;
cf->to_post = NULL;
cf->mq_credits = 0;
cf->mq_index = 0;
cf->tasks_to_release = NULL;
cf->task_next = &cf->tasks_to_release;
cf->last_wait_num = 0;
}
static inline __opal_attribute_always_inline__
struct mca_bcol_iboffload_collfrag_t *
mca_bcol_iboffload_get_collfrag(void)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_collfrag_t *cf;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
/* blocking allocation for collectives fragment */
OMPI_FREE_LIST_GET_MT(&cm->collfrags_free, item);
if (OPAL_UNLIKELY(NULL == item)) {
IBOFFLOAD_ERROR(("Failed to allocated collfrag.\n"));
return NULL;
}
cf = (mca_bcol_iboffload_collfrag_t*) item;
mca_bcol_iboffload_collfrag_init(cf);
return cf;
}
END_C_DECLS
#endif

Просмотреть файл

@ -1,50 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "bcol_iboffload_collreq.h"
static void
collreq_construct(struct mca_bcol_iboffload_collreq_t *collreq)
{
int i;
collreq->n_fragments = 0;
collreq->n_frag_mpi_complete = 0;
collreq->n_frag_net_complete = 0;
collreq->user_handle_freed = false;
for (i = 0; i < BCOL_IBOFFLOAD_BUFFERS; i++) {
collreq->buffer_info[i].buf = NULL;
collreq->buffer_info[i].offset = 0;
collreq->buffer_info[i].iboffload_reg = NULL;
}
OBJ_CONSTRUCT(&collreq->work_requests, opal_list_t);
OBJ_CONSTRUCT(&collreq->first_collfrag, mca_bcol_iboffload_collfrag_t);
OBJ_CONSTRUCT(&collreq->send_convertor, opal_convertor_t);
OBJ_CONSTRUCT(&collreq->recv_convertor, opal_convertor_t);
}
static void
collreq_destruct(struct mca_bcol_iboffload_collreq_t *collreq)
{
OBJ_DESTRUCT(&collreq->work_requests);
OBJ_DESTRUCT(&collreq->first_collfrag);
OBJ_DESTRUCT(&collreq->send_convertor);
OBJ_DESTRUCT(&collreq->recv_convertor);
}
OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collreq_t,
ompi_request_t,
collreq_construct,
collreq_destruct);

Просмотреть файл

@ -1,273 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_COLLREQ_H
#define MCA_BCOL_IBOFFLOAD_COLLREQ_H
#include "ompi_config.h"
#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include "opal/class/ompi_free_list.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_collfrag.h"
#define SBUF 0
#define RBUF 1
#define BCOL_IBOFFLOAD_BUFFERS 2
BEGIN_C_DECLS
struct mca_bcol_iboffload_reg_t;
/*
* collective progress function
*/
typedef int (*collective_message_progress_function)(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *full_message_descriptor);
/*
* callback function to be called after the collective work request
* completes. This is invoked in user-space, and is typically where
* data may be copied out of library buffers, or when any other user-
* level protocol may be completed
*
* input:
* callback data: typically, this may be the work request just finished
*/
typedef int (*collective_message_completion_callback_function)(
void *callback_data);
struct mca_bcol_iboffload_buff_info {
void *buf;
size_t offset;
uint32_t lkey;
struct mca_bcol_iboffload_reg_t *iboffload_reg;
};
typedef struct mca_bcol_iboffload_buff_info mca_bcol_iboffload_buff_info;
/*
* Collective message descriptor
* the mca_bcol_iboffload_message_desc_t was replaced with mca_bcol_iboffload_collreq_t
* *************************************************************************************************
*
* Brief description of iboffload collective request dependencies:
*
* mca_bcol_iboffload_collreq_t <----<< Full coll request
* |
* --(0)-- mca_bcol_iboffload_collfrag_t <----<< Fragment of coll request ( for example
* | | 10MB Bcast maybe split to 2MB fragments )
* | |
* | --(0)-- mca_bcol_iboffload_task_t---mqe_task
* | | |
* | | ---mca_bcol_iboffload_frag_t---ibv_sge
* | --(1)-- mca_bcol_iboffload_task_t---mqe_task
* | | |
* | | ---mca_bcol_iboffload_frag_t---ibv_sge
* | ..(M)..
* |
* --(1)-- mca_bcol_iboffload_collfrag_t
* |
* ..(N)..
*
* *************************************************************************************************
*/
struct mca_bcol_iboffload_collreq_t {
ompi_request_t super;
/* op type */
struct ompi_op_t *op;
/* Sometimes the operation that should be performed
by the IB is different than the mpi_op and is then set
by the pack_data_for_calc function */
enum ibv_m_wr_calc_op actual_ib_op;
/* Sometimes the data type that should be used by the IB
to perform the calc is different from the mpi dtype,
and is then set by the pack_data_for_calc function */
enum ibv_m_wr_data_type actual_ib_dtype;
/* data type */
struct ompi_datatype_t *dtype;
/* convertor for send operation */
opal_convertor_t send_conv;
/* convertor for recv operation */
opal_convertor_t recv_conv;
/*
* count (in data type units)
*/
uint64_t count;
/*
* root of collective operation
*/
int root;
/* number of message fragments */
int n_fragments;
/* number of fragments sent - all resources for a fragment are allocated
* or none at all are
*/
int n_frags_sent;
/* number of fragments completed from the MPI perspective */
int n_frag_mpi_complete;
/* number of fragments completed from a network perspective */
int n_frag_net_complete;
/* collective is free and may be released - the message is complete from the
** MPI perspective, the network perspective, and the user is done
** with the message handle */
volatile bool user_handle_freed;
/* list of collective fragments - only 1 for now */
opal_list_t work_requests;
/* message progress function */
collective_message_progress_function progress_fn;
/* work request completion callback function */
collective_message_completion_callback_function completion_cb_fn;
/* index of the qp whose buffers are long enough for this collective */
int qp_index;
bool if_bcol_last;
/* The flag is used for the last bcol to indicate if the calculation should be done by the cpu */
bool do_calc_in_cpu;
/* in the Allreduce case, if (true == do_calc_in_cpu) =>
the final result will be calculated on the local CPU */
uint64_t l_operand;
uint64_t r_operand;
/* caching ML-rdma buffer descriptor */
mca_bcol_iboffload_rdma_buffer_desc_t *ml_rdma_desc;
/* ML buffer index code */
int ml_buffer_index;
/* In the current implementation the collrequest is connected to a single
iboffload module */
struct mca_bcol_iboffload_module_t *module;
mca_bcol_iboffload_collfrag_t first_collfrag;
/* Send/recv buffs info - user buffers registration if needed etc. */
mca_bcol_iboffload_buff_info buffer_info[BCOL_IBOFFLOAD_BUFFERS];
/* My binomial tree children in this collective */
int *bi_nominal_tree_children;
/* Convertors for send/recv if needed */
opal_convertor_t send_convertor;
opal_convertor_t recv_convertor;
/* Order info from upper layer */
mca_bcol_base_order_info_t *order_info;
};
typedef struct mca_bcol_iboffload_collreq_t mca_bcol_iboffload_collreq_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collreq_t);
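/*
* A minimal traversal sketch (added for illustration only; this helper is
* hypothetical and was never part of the original file). It shows how the
* hierarchy pictured in the dependency comment above hangs together: each
* collreq keeps its fragments on the work_requests list, and each fragment
* chains the tasks it owns through tasks_to_release.
*/
static inline void
mca_bcol_iboffload_collreq_walk_sketch(mca_bcol_iboffload_collreq_t *coll_request)
{
opal_list_item_t *item;
for (item = opal_list_get_first(&coll_request->work_requests);
item != opal_list_get_end(&coll_request->work_requests);
item = opal_list_get_next(item)) {
mca_bcol_iboffload_collfrag_t *frag =
(mca_bcol_iboffload_collfrag_t *) item;
/* head of the per-fragment task chain built by
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST */
struct mca_bcol_iboffload_task_t *task = frag->tasks_to_release;
(void) task; /* a real walker would release or inspect each task here */
}
}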
#define COLLREQ_IS_DONE(cr) (cr->user_handle_freed && \
(cr->n_frag_mpi_complete == cr->n_fragments) && \
(cr->n_frag_net_complete == cr->n_fragments))
#define RELEASE_COLLREQ(cr) \
do { \
(cr)->user_handle_freed = false; \
OMPI_FREE_LIST_RETURN_MT(&mca_bcol_iboffload_component.collreqs_free, \
(ompi_free_list_item_t *) (cr)); \
} while (0)
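/*
* Illustrative note (not part of the original code): the progress path of
* each collective checks these two macros together, e.g.
*
*   if (COLLREQ_IS_DONE(coll_request)) {
*       RELEASE_COLLREQ(coll_request);
*   }
*
* i.e. a request is recycled only once the user handle has been freed and
* all fragments are complete from both the MPI and the network perspective.
*/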
static inline __opal_attribute_always_inline__
int mca_bcol_iboffload_free_resources_and_move_to_pending(
mca_bcol_iboffload_collfrag_t *coll_fragment,
mca_bcol_iboffload_module_t *iboffload)
{
int rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_fragment,
iboffload->device->frags_free);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
}
IBOFFLOAD_VERBOSE(10, ("iboffload - %p, coll_fragment - %p, "
"coll frag in_pending_list ? - %d, pending_list size - %d.\n",
iboffload, coll_fragment, coll_fragment->in_pending_list,
opal_list_get_size(&iboffload->collfrag_pending)));
BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, coll_fragment->mq_index, coll_fragment->mq_credits);
/* Remove coll frag from coll request opal_list */
opal_list_remove_item(&coll_fragment->coll_full_req->work_requests,
(opal_list_item_t *) coll_fragment);
if (false == coll_fragment->in_pending_list) {
/* Put the collfrag on pending list */
coll_fragment->in_pending_list = true;
opal_list_append(&iboffload->collfrag_pending,
(opal_list_item_t *) coll_fragment);
} else {
/* The item is already on the pending list =>
insert it at the head so that the order of the
frags on the list is not broken */
opal_list_prepend(&iboffload->collfrag_pending,
(opal_list_item_t *) coll_fragment);
}
return OMPI_SUCCESS;
}
/* Forward declaration */
struct mca_bcol_iboffload_reg_t;
static inline __opal_attribute_always_inline__
int mca_bcol_iboffload_prepare_buffer(
void *buffer,
size_t size,
struct mca_bcol_iboffload_reg_t **registration_handler,
mca_bcol_iboffload_module_t *iboffload)
{
int rc;
mca_mpool_base_registration_t *reg = NULL;
assert(size > 0);
rc = iboffload->device->mpool->mpool_register(
iboffload->device->mpool,
buffer, size,
(uint32_t) 0 /* flags */,
&reg);
*registration_handler =
(struct mca_bcol_iboffload_reg_t *) reg;
return rc;
}
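/*
* Usage sketch (illustrative only; "payload" and "payload_size" are
* hypothetical caller-provided values, and error handling is elided):
*
*   struct mca_bcol_iboffload_reg_t *reg = NULL;
*   if (OMPI_SUCCESS == mca_bcol_iboffload_prepare_buffer(payload,
*                           payload_size, &reg, iboffload)) {
*       coll_request->buffer_info[SBUF].iboffload_reg = reg;
*   }
*
* The registration handle returned here is what later fills the
* buffer_info[].iboffload_reg slots of the collective request.
*/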
int mca_bcol_iboffload_coll_req_implement(
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collreq_t *coll_request);
END_C_DECLS
#endif

The diff for this file is not shown because of its large size.

Просмотреть файл

@ -1,73 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_DEVICE_H
#define MCA_BCOL_IBOFFLOAD_DEVICE_H
#include "ompi_config.h"
#include <infiniband/mqe.h>
#include <infiniband/mverbs.h>
#include <infiniband/verbs.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#define BCOL_IBOFFLOAD_DUMMY_MEM_SIZE 1
BEGIN_C_DECLS
/* Device OBJ */
struct mca_bcol_iboffload_device_t {
opal_list_item_t super;
bool activated;
struct ompi_common_ofacm_base_dev_desc_t dev;
struct ibv_pd *ib_pd;
struct ibv_device_attr ib_dev_attr;
int num_act_ports;
struct mca_bcol_iboffload_port_t *ports;
struct ibv_cq *ib_cq;
/* CQ for MQs of all iboffload modules on this device */
struct ibv_cq *ib_mq_cq;
/* The free list of registered buffers -
* since the registration depends on the PD, this is
* the most reasonable place to keep the frags */
ompi_free_list_t *frags_free;
mca_mpool_base_module_t *mpool;
/* network context */
bcol_base_network_context_t *net_context;
/* We keep dummy frags for all QPs on each device;
some of the QPs may not need them, but we distribute a dummy
for each anyway. All dummies point to the same byte of memory. */
mca_bcol_iboffload_frag_t dummy_frags[MCA_BCOL_IBOFFLOAD_QP_LAST];
/* Registered memory for the dummy frags */
char dummy_mem[BCOL_IBOFFLOAD_DUMMY_MEM_SIZE];
/* Registration info of the dummy memory */
mca_bcol_iboffload_reg_t dummy_reg;
};
typedef struct mca_bcol_iboffload_device_t mca_bcol_iboffload_device_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_device_t);
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_DEVICE_H */

Просмотреть файл

@ -1,373 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <infiniband/mverbs.h>
#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/common/ofacm/connect.h"
#include "opal/threads/mutex.h"
#include "opal/class/opal_object.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_endpoint.h"
static void mca_bcol_iboffload_endpoint_construct(mca_bcol_iboffload_endpoint_t *ep)
{
ep->iboffload_module = NULL;
ep->ibnet_proc = NULL;
ep->qps = (mca_bcol_iboffload_endpoint_qp_t *)
calloc(mca_bcol_iboffload_component.num_qps,
sizeof(mca_bcol_iboffload_endpoint_qp_t));
ep->index = 0;
OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t);
memset(ep->recv_cq, 0, IBOFFLOAD_CQ_LAST * sizeof(ep->recv_cq[0]));
memset(&ep->qp_config, 0, sizeof(ompi_common_ofacm_base_qp_config_t));
ep->cpc_context = NULL;
memset(&ep->remote_zero_rdma_addr, 0, sizeof(mca_bcol_iboffload_rdma_info_t));
memset(&ep->remote_rdma_block, 0, sizeof(mca_bcol_iboffload_rem_rdma_block_t));
ep->need_toset_remote_rdma_info = false;
}
static void mca_bcol_iboffload_endpoint_destruct(mca_bcol_iboffload_endpoint_t *ep)
{
int qp_index, num_qps, i;
ompi_free_list_item_t *item;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
num_qps = cm->num_qps;
IBOFFLOAD_VERBOSE(10, ("Destruct: ep - %p, ep->index - %d", ep, ep->index));
if (NULL != ep->qps) {
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
do {
item = (ompi_free_list_item_t *)
opal_list_remove_first(&ep->qps[qp_index].preposted_frags);
if(OPAL_LIKELY(NULL != item)) {
OMPI_FREE_LIST_RETURN_MT(&ep->device->frags_free[qp_index], item);
}
} while (NULL != item);
OBJ_DESTRUCT(&ep->qps[qp_index].preposted_frags);
}
free(ep->qps);
}
OBJ_DESTRUCT(&ep->endpoint_lock);
OBJ_DESTRUCT(&ep->pending_frags);
/* If the CPC has an endpoint_finalize function, call it */
if (NULL != ep->endpoint_cpc->cbm_endpoint_finalize) {
ep->endpoint_cpc->cbm_endpoint_finalize(ep->cpc_context);
}
for (i = 0; i < IBOFFLOAD_CQ_LAST; i++) {
if (NULL != ep->recv_cq[i]) {
if (ibv_destroy_cq(ep->recv_cq[i])) {
IBOFFLOAD_ERROR(("Endpoint %x "
", failed to destroy CQ, errno says %s",
ep, strerror(errno)));
}
}
}
}
OBJ_CLASS_INSTANCE(mca_bcol_iboffload_endpoint_t,
opal_list_item_t,
mca_bcol_iboffload_endpoint_construct,
mca_bcol_iboffload_endpoint_destruct);
/* Pasha: Add some error message here */
/*
* Called when the CPC has established a connection on an endpoint
*/
static void mca_bcol_iboffload_endpoint_invoke_error(void *context)
{
mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;
IBOFFLOAD_ERROR(("Getting error on endpoint - %p!", endpoint));
}
/* Pasha: Need to add more logic here */
static void mca_bcol_iboffload_endpoint_cpc_complete(void *context)
{
mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p for comm rank %d: CPC complete.\n",
endpoint, endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));
if (OMPI_SUCCESS !=
mca_bcol_iboffload_exchange_rem_addr(endpoint)) {
IBOFFLOAD_ERROR(("endpoint - %p, "
"remote addr exchange error.\n", endpoint));
}
/* The connection is correctly setup. Now we can decrease the
event trigger. */
opal_progress_event_users_decrement();
}
/* Vasily: Need to add more logic here */
int mca_bcol_iboffload_endpoint_post_recvs(void *context)
{
int qp_index, rc, num_qps;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint =
(mca_bcol_iboffload_endpoint_t *) context;
IBOFFLOAD_VERBOSE(10, ("endpoint - %p, post of %d recvs !",
endpoint, cm->qp_infos[0].rd_num));
/* TODO Pasha - fix later */
num_qps = cm->num_qps;
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
cm->qp_infos[qp_index].rd_num);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
/* Pasha: Need to add more failure logic */
IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
"on qp index %d, return code - %d",
qp_index, rc));
return OMPI_ERROR;
}
}
return OMPI_SUCCESS;
}
/* The function goes over each ibnet proc and creates an endpoint for each one */
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
mca_bcol_iboffload_module_t *module) {
uint32_t i;
mca_bcol_iboffload_endpoint_t *ep;
if (NULL == cgroup || NULL == module) {
IBOFFLOAD_ERROR(("Bad parameters for create endpoints function."));
return OMPI_ERROR;
}
module->num_endpoints = cgroup->num_procs;
module->endpoints = (mca_bcol_iboffload_endpoint_t **)
calloc(module->num_endpoints,
sizeof(mca_bcol_iboffload_endpoint_t *));
if (NULL == module->endpoints) {
IBOFFLOAD_ERROR(("Error memory allocation for endpoints array"
", errno says %s", strerror(errno)));
return OMPI_ERROR;
}
IBOFFLOAD_VERBOSE(10, ("iboffload - %p, num of endpoints - %d.\n",
module, module->num_endpoints));
/* Ishai: No need to open so many endpoints. We are not talking with all procs */
for (i = 0; i < cgroup->num_procs; i++) {
ep = OBJ_NEW(mca_bcol_iboffload_endpoint_t);
/* check qp memory allocation */
if (NULL == ep->qps) {
IBOFFLOAD_ERROR(("Failed to allocate memory for qps"));
return OMPI_ERROR;
}
/* init new endpoint */
ep->index = i;
ep->iboffload_module = module;
/* saving the device for the destruction - the iboffload module may not exist then */
ep->device = ep->iboffload_module->device;
ep->ibnet_proc = (mca_sbgp_ibnet_proc_t *)
opal_pointer_array_get_item(cgroup->ibnet_procs, i);
if (NULL == ep->ibnet_proc) {
IBOFFLOAD_ERROR(("Failed to get proc pointer, for index %d", i));
return OMPI_ERROR;
}
if (OMPI_SUCCESS !=
mca_bcol_iboffload_endpoint_init(ep)) {
IBOFFLOAD_ERROR(("Failed to init endpoint - %p", ep));
return OMPI_ERROR;
}
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, ep index - %d, iboffload - %p, "
"cpc contex - %p.\n", ep, ep->index,
ep->iboffload_module, ep->cpc_context));
/* Add the new endpoint to array of endpoints */
module->endpoints[i] = ep;
}
/* Pasha: Need to add better clean-up here */
return OMPI_SUCCESS;
}
static int config_qps(mca_bcol_iboffload_endpoint_t *ep)
{
int qp_index;
int ret = OMPI_SUCCESS;
ompi_common_ofacm_base_qp_config_t *qp_config = &ep->qp_config;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
qp_config->num_srqs = 0;
qp_config->srq_num = NULL;
qp_config->num_qps = cm->num_qps;
qp_config->init_attr = (struct ibv_qp_init_attr *)
calloc(qp_config->num_qps, sizeof(struct ibv_qp_init_attr));
if (NULL == qp_config->init_attr) {
IBOFFLOAD_ERROR(("Failed allocate memory for qp init attributes"));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto config_qps_exit;
}
qp_config->attr = (struct ibv_qp_attr *)
calloc(qp_config->num_qps, sizeof(struct ibv_qp_attr));
if (OPAL_UNLIKELY(NULL == qp_config->attr)) {
IBOFFLOAD_ERROR(("Failed allocate memory for qp attributes"));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto config_qps_exit;
}
/* we must specify that the qps are special */
qp_config->init_attr_mask = (uint32_t *)
calloc(qp_config->num_qps, sizeof(uint32_t));
if (OPAL_UNLIKELY(NULL == qp_config->init_attr_mask)) {
IBOFFLOAD_ERROR(("Failed allocate memory for qp mask."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto config_qps_exit;
}
/* qp_config->rtr_attr_mask = qp_config->rts_attr_mask = NULL; */
qp_config->rtr_attr_mask = (uint32_t *)
calloc(qp_config->num_qps, sizeof(uint32_t));
if (OPAL_UNLIKELY(NULL == qp_config->rtr_attr_mask)) {
IBOFFLOAD_ERROR(("Failled allocate memory for qp rtr attributes mask."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto config_qps_exit;
}
qp_config->rts_attr_mask = (uint32_t *)
calloc(qp_config->num_qps, sizeof(uint32_t));
if (OPAL_UNLIKELY(NULL == qp_config->rts_attr_mask)) {
IBOFFLOAD_ERROR(("Failled allocate memory for qp rts attributes mask."));
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto config_qps_exit;
}
for (qp_index = 0; qp_index < qp_config->num_qps; ++qp_index) {
mca_bcol_iboffload_config_qps_fn_t config_qp =
cm->qp_infos[qp_index].config_qp;
if (NULL != config_qp) {
config_qp(qp_index, ep, qp_config);
}
}
config_qps_exit:
return ret;
}
/* The function is called for endpoints
* with MCA_COMMON_OFACM_USER_CUSTOM state only;
* we need an OPAL_THREAD_LOCK before calling this function */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep)
{
int qp_index, cq_index, num_qps;
ompi_common_ofacm_base_module_t *cpc;
mca_bcol_iboffload_device_t *device = ep->iboffload_module->device;
mca_sbgp_ibnet_connection_group_info_t *cgroup =
&ep->iboffload_module->ibnet->cgroups[ep->iboffload_module->cgroup_index];
for (cq_index = 0; cq_index < IBOFFLOAD_CQ_LAST; cq_index++) {
if (OMPI_SUCCESS !=
mca_bcol_iboffload_adjust_cq(device, &ep->recv_cq[cq_index])) {
IBOFFLOAD_ERROR(("Error creating CQ for %s errno says %s",
ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
/* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
return OMPI_ERROR;
}
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != config_qps(ep))) {
IBOFFLOAD_ERROR(("Error configure QPs for endpoint %x errno says %s",
ep, strerror(errno)));
return OMPI_ERROR;
}
/* Adding here one more redirection in critical path. Need to think
* what is the best way to prevent it */
IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, rem port - %d", ep,
ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].id));
cpc = ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].local_cpc;
ep->endpoint_cpc = cpc; /* caching pointer to cpc */
if (NULL != cpc->cbm_endpoint_init) {
ep->cpc_context = cpc->cbm_endpoint_init(
ep->ibnet_proc->ompi_proc,
&ep->qp_config,
device->ib_pd,
ep->iboffload_module->subnet_id,
ep->iboffload_module->ibnet->group_id,
ep->iboffload_module->lid,
/* Remote lid of target module */
ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].lid,
ep->index, /* user context index */
(void *) ep, /* user context */
cpc,
mca_bcol_iboffload_endpoint_cpc_complete,
mca_bcol_iboffload_endpoint_invoke_error,
mca_bcol_iboffload_endpoint_post_recvs);
if (OPAL_UNLIKELY(NULL == ep->cpc_context)) {
IBOFFLOAD_ERROR(("Endpoint - %p, failed to init context", ep));
/* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
return OMPI_ERROR;
}
/* Updating remote port info */
num_qps = mca_bcol_iboffload_component.num_qps;
ep->remote_info = &ep->cpc_context->remote_info;
for (qp_index = 0; qp_index < num_qps; ++qp_index) {
ep->qps[qp_index].qp = &ep->cpc_context->qps[qp_index];
}
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,328 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#define MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"
#define BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) (ep)->ibnet_proc->use_port[(cgroup)->index]
#define BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep) (BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) - 1)
BEGIN_C_DECLS
struct mca_bcol_iboffload_endpoint_qp_t {
struct ompi_common_ofacm_base_qp_t *qp;
size_t ib_inline_max;
int32_t sd_wqe; /* Number of available send wqe entries */
int32_t rd_wqe; /* Number of available recv wqe entries */
opal_list_t preposted_frags; /* List of preposted frags */
/* opal_mutex_t lock; */ /* Do I need lock here ? */
};
typedef struct mca_bcol_iboffload_endpoint_qp_t mca_bcol_iboffload_endpoint_qp_t;
enum {
IBOFFLOAD_CQ_SMALL_MESSAGES = 0,
IBOFFLOAD_CQ_SYNC,
IBOFFLOAD_CQ_LARGE_MESSAGES,
IBOFFLOAD_CQ_LAST
};
/* Endpoint object */
struct mca_bcol_iboffload_endpoint_t {
opal_list_item_t super;
/** BTL module that created this connection */
mca_bcol_iboffload_module_t *iboffload_module;
/** proc structure corresponding to endpoint */
mca_sbgp_ibnet_proc_t *ibnet_proc;
/** lock for concurrent access to endpoint state */
opal_mutex_t endpoint_lock;
/** Pending frag list */
opal_list_t pending_frags;
/** QPs information */
mca_bcol_iboffload_endpoint_qp_t *qps;
/** endpoint index on array */
int32_t index;
/** CQ for receive queues on this endpoint */
struct ibv_cq *recv_cq[IBOFFLOAD_CQ_LAST];
/** QP configuration information */
ompi_common_ofacm_base_qp_config_t qp_config;
/** cpc context */
ompi_common_ofacm_base_local_connection_context_t *cpc_context;
/** caching pointer to remote info */
ompi_common_ofacm_base_remote_connection_context_t *remote_info;
/** caching pointer to cpc */
ompi_common_ofacm_base_module_t *endpoint_cpc;
/** The struct is used for zero RDMA with immediate
in some collectives, in barrier for example. */
mca_bcol_iboffload_rdma_info_t remote_zero_rdma_addr;
mca_bcol_iboffload_rem_rdma_block_t remote_rdma_block;
/** The pointer to device - In the destruction function
the iboffload module may not exist any more - caching the device */
struct mca_bcol_iboffload_device_t *device;
bool need_toset_remote_rdma_info;
mca_bcol_iboffload_rdma_info_t remote_rdma_info[MAX_REMOTE_RDMA_INFO];
};
typedef struct mca_bcol_iboffload_endpoint_t mca_bcol_iboffload_endpoint_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_endpoint_t);
/* Function declaration */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep);
static inline __opal_attribute_always_inline__
int check_endpoint_state(mca_bcol_iboffload_endpoint_t *ep,
mca_bcol_base_descriptor_t *des,
opal_list_t *pending_list)
{
int rc = OMPI_ERR_RESOURCE_BUSY;
OPAL_THREAD_LOCK(&ep->cpc_context->context_lock);
/* Adding here one more redirection in critical path. Need to think
* what is the best way to prevent it */
switch(ep->cpc_context->state) {
case MCA_COMMON_OFACM_CLOSED:
rc = ep->endpoint_cpc->cbm_start_connect(ep->cpc_context);
if (OMPI_SUCCESS == rc) {
rc = OMPI_ERR_RESOURCE_BUSY;
}
/*
* As long as we expect a message from the peer (in order
* to set up the connection) let the event engine poll the
* OOB events. Note: we increment it once per active peer
* connection.
*/
opal_progress_event_users_increment();
/* fall through */
default:
/* opal_list_append(pending_list, (opal_list_item_t *)des); */ /* Vasily: will be uncommented later */
break;
case MCA_COMMON_OFACM_FAILED:
rc = OMPI_ERR_UNREACH;
break;
case MCA_COMMON_OFACM_CONNECTED:
rc = OMPI_SUCCESS;
break;
}
OPAL_THREAD_UNLOCK(&ep->cpc_context->context_lock);
return rc;
}
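/*
* Typical caller pattern (already used by the connection setup code in this
* component; repeated here only as an illustration): spin on the state
* check and drive progress until the CPC reports the endpoint as connected.
*
*   while (OMPI_SUCCESS != check_endpoint_state(ep, NULL, NULL)) {
*       opal_progress();
*   }
*/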
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
mca_bcol_iboffload_module_t *module);
int mca_bcol_iboffload_endpoint_post_recvs(void *context);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_recv(
mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, int num_to_prepost)
{
mca_bcol_iboffload_prepost_qps_fn_t prepost_recv =
mca_bcol_iboffload_component.qp_infos[qp_index].prepost_recv;
if (NULL != prepost_recv) {
return prepost_recv(endpoint, qp_index, num_to_prepost);
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_ml_scatter_recv_frag(
int qp_index, uint32_t dest_rank,
int nitems, struct iovec *buff_iovec,
uint32_t lkey,
struct ibv_sge *sg_entries,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload)
{
int ret, start_wr_index;
struct ibv_recv_wr *recv_wr, *recv_bad;
int i;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
(void *) endpoint, qp_index));
/* make sure that we do not overrun number of rd_wqe */
if (0 >= endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
endpoint->qps[qp_index].rd_wqe));
return 0;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* Calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
"start index of WRs - %d", (void *) endpoint,
qp_index, start_wr_index));
for (i = 0; i < nitems; i++) {
sg_entries[i].length = buff_iovec[i].iov_len;
sg_entries[i].addr = (uint64_t)buff_iovec[i].iov_base;
sg_entries[i].lkey = lkey;
IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , length %d , address %p",
i, sg_entries[i].length, sg_entries[i].addr));
IBOFFLOAD_VERBOSE(10, ("Recv SGE List item %d , iovec length %d",
i, buff_iovec[i].iov_len));
}
recv_wr->num_sge = nitems;
recv_wr->sg_list = sg_entries;
/* Set the tail */
recv_wr->next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
"qp_index - %d.\n",
ibv_get_device_name(device->dev.ib_dev),
strerror(errno), ret, qp_index));
return -1;
}
/* decreasing the number of free recv wqe */
--endpoint->qps[qp_index].rd_wqe;
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Return success: "
"endpoint %p, qp_index %d, dest_rank %d",
endpoint, qp_index, dest_rank));
return 1;
}
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_ml_recv_frag(
int qp_index, uint32_t dest_rank,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload)
{
int ret, start_wr_index;
struct ibv_recv_wr *recv_wr, *recv_bad;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
(void *) endpoint, qp_index));
/* make sure that we do not overrun number of rd_wqe */
if (0 >= endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
endpoint->qps[qp_index].rd_wqe));
return 0;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* Calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
"start index of WRs - %d", (void *) endpoint,
qp_index, start_wr_index));
recv_wr->sg_list = &frag->sg_entry;
/* Set the tail */
recv_wr->next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
"qp_index - %d.\n",
ibv_get_device_name(device->dev.ib_dev),
strerror(errno), ret, qp_index));
return -1;
}
/* decreasing the number of free recv wqe */
--endpoint->qps[qp_index].rd_wqe;
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Return success: "
"endpoint %p, qp_index %d, dest_rank %d",
endpoint, qp_index, dest_rank));
return 1;
}
static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_preposted_recv_frag(
mca_bcol_iboffload_module_t *iboffload,
int source, int qp_index)
{
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];
frag = mca_bcol_iboffload_component.qp_infos[qp_index].get_preposted_recv(endpoint, qp_index);
/* do we want to run prepost */
if (OPAL_LIKELY(NULL != frag)) {
IBOFFLOAD_VERBOSE(10, ("source - %d, qp_index - %d; "
"allocating preposted addr %p.\n",
source, qp_index, (void *) frag->sg_entry.addr));
frag->next = NULL;
}
return frag;
}
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_ENDPOINT_H */

Просмотреть файл

@ -1,350 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
static int mca_bcol_iboffload_fanin_leader_progress(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc = OMPI_SUCCESS, leader_rank = 0, rank,
sbgp_size = iboffload->ibnet->super.group_size;
struct mqe_task *last_wait = NULL;
mca_bcol_iboffload_task_t *wait_task = NULL;
mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;
struct mqe_task **mqe_ptr_to_set;
mca_bcol_iboffload_collfrag_t *coll_fragment;
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mqe_ptr_to_set = &coll_fragment->to_post;
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
/* post wait */
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, rank, coll_request->qp_index);
if(NULL == preposted_recv_frag) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1,
preposted_recv_frag, coll_request->qp_index, NULL);
if(NULL == wait_task) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
}
/* end of list */
*mqe_ptr_to_set = NULL;
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if(OMPI_SUCCESS != rc) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
static int mca_bcol_iboffload_fanin_proxy_progress(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc = OMPI_SUCCESS, leader_rank = 0;
struct mqe_task *last_send = NULL;
mca_bcol_iboffload_task_t *send_task = NULL;
mca_bcol_iboffload_frag_t *send_fragment = NULL;
struct mqe_task **mqe_ptr_to_set;
mca_bcol_iboffload_collfrag_t *coll_fragment;
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mqe_ptr_to_set = &coll_fragment->to_post;
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
/* post send */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
leader_rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
if(NULL == send_fragment) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n"));
goto out_of_resources;
}
send_task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER,
send_fragment, coll_fragment, INLINE);
if(NULL == send_task) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
/* end of list */
*mqe_ptr_to_set = NULL;
assert(NULL != last_send);
last_send->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_send->wr_id;
last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if(OMPI_SUCCESS != rc) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
static int mca_bcol_iboffload_fanin_init(
bcol_function_args_t *input_args,
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t **coll_request)
{
ompi_free_list_item_t *item = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init"));
OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
if(OPAL_UNLIKELY(NULL == item)) {
IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n"));
return OMPI_ERR_OUT_OF_RESOURCE;
}
(*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
(*coll_request)->progress_fn = iboffload->fanin_algth;
(*coll_request)->completion_cb_fn = NULL;
(*coll_request)->order_info = &input_args->order_info;
(*coll_request)->module = iboffload;
(*coll_request)->ml_buffer_index = input_args->buffer_index;
(*coll_request)->buffer_info[SBUF].offset = 0;
(*coll_request)->buffer_info[RBUF].offset = 0;
(*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;
input_args->bcol_opaque_data = (void *) (*coll_request);
/* finish initializing full message descriptor */
(*coll_request)->n_fragments = 1;
(*coll_request)->n_frags_sent = 1;
(*coll_request)->n_frag_mpi_complete = 0;
(*coll_request)->n_frag_net_complete = 0;
(*coll_request)->user_handle_freed = false;
/*
* setup collective work request
*/
/* get collective frag */
coll_fragment = &(*coll_request)->first_collfrag;
mca_bcol_iboffload_collfrag_init(coll_fragment);
coll_fragment->alg = FANIN_ALG;
coll_fragment->mq_index = COLL_MQ;
/* Set mq credits */
coll_fragment->mq_credits = iboffload->alg_task_consump[FANIN_ALG];
/* set pointers for (coll frag) <-> (coll full request) */
MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);
return OMPI_SUCCESS;
}
/************************************************************************
************************ New style Fan-In ******************************
***********************************************************************/
static int mca_bcol_iboffload_new_style_fanin_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_iboffload_collreq_t *coll_request =
(mca_bcol_iboffload_collreq_t *)
input_args->bcol_opaque_data;
if (BCOL_IS_COMPLETED(coll_request)) {
coll_request->user_handle_freed = true;
if (COLLREQ_IS_DONE(coll_request)) {
IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
RELEASE_COLLREQ(coll_request);
}
IBOFFLOAD_VERBOSE(10, ("Fan-In already done.\n"));
return BCOL_FN_COMPLETE;
}
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_new_style_fanin_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int i = 0, leader_rank = 0, /* we always assume the lowest index is the leader */
my_rank = iboffload->ibnet->super.my_index,
sbgp_size = iboffload->ibnet->super.group_size;
mca_bcol_iboffload_endpoint_t *ep = NULL;
mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc;
assert(NULL != my_ibnet_proc);
if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) {
iboffload->fanin_algth = mca_bcol_iboffload_fanin_leader_progress;
iboffload->alg_task_consump[FANIN_ALG] += sbgp_size;
for (i = leader_rank + 1; i < sbgp_size; ++i) {
ep = iboffload->endpoints[i];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
} else {
iboffload->fanin_algth = mca_bcol_iboffload_fanin_proxy_progress;
iboffload->alg_task_consump[FANIN_ALG] += 1;
ep = iboffload->endpoints[leader_rank];
while(OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
return iboffload->fanin_algth(iboffload, coll_request);
}
static int mca_bcol_iboffload_new_style_fanin_intra(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
int rc = OMPI_SUCCESS;
struct mca_bcol_iboffload_collreq_t *coll_request = NULL;
mca_bcol_iboffload_module_t *iboffload =
(mca_bcol_iboffload_module_t *) const_args->bcol_module;
assert(NULL != iboffload);
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);
/* Init the Fan-In collective request */
rc = mca_bcol_iboffload_fanin_init(input_args, iboffload, &coll_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n"));
return BCOL_FN_NOT_STARTED;
}
rc = iboffload->fanin_algth(iboffload, coll_request);
if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
return BCOL_FN_NOT_STARTED;
}
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n"));
comm_attribs.bcoll_type = BCOL_FANIN;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_new_style_fanin_intra,
mca_bcol_iboffload_new_style_fanin_progress);
return OMPI_SUCCESS;
}

View file

@ -1,349 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
static int mca_bcol_iboffload_fanout_leader_progress(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc = OMPI_SUCCESS, leader_rank = 0, rank,
sbgp_size = iboffload->ibnet->super.group_size;
struct mqe_task *last_send = NULL;
mca_bcol_iboffload_task_t *send_task = NULL;
mca_bcol_iboffload_frag_t *send_fragment = NULL;
struct mqe_task **mqe_ptr_to_set;
mca_bcol_iboffload_collfrag_t *coll_fragment;
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mqe_ptr_to_set = &coll_fragment->to_post;
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
/* post send */
send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
rank, coll_request->qp_index, 0,
0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
if(NULL == send_fragment) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n"));
goto out_of_resources;
}
send_task = mca_bcol_iboffload_get_send_task(iboffload, rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER,
send_fragment, coll_fragment, INLINE);
if(NULL == send_task) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
}
/* end of list */
*mqe_ptr_to_set = NULL;
assert(NULL != last_send);
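/* Only the last send is signaled: its original wr_id is saved in
* signal_task_wr_id and then replaced with the coll_fragment pointer, so
* that the completion for this chain of MQE tasks can be mapped back to
* the collective fragment. */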
last_send->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_send->wr_id;
last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if(OMPI_SUCCESS != rc) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
static int mca_bcol_iboffload_fanout_proxy_progress(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc = OMPI_SUCCESS, leader_rank = 0;
struct mqe_task *last_wait = NULL;
mca_bcol_iboffload_task_t *wait_task = NULL;
mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;
struct mqe_task **mqe_ptr_to_set;
mca_bcol_iboffload_collfrag_t *coll_fragment;
coll_fragment = (mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mqe_ptr_to_set = &coll_fragment->to_post;
if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
goto out_of_resources;
}
/* post wait */
preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
iboffload, leader_rank, coll_request->qp_index);
if(NULL == preposted_recv_frag) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n"));
goto out_of_resources;
}
wait_task = mca_bcol_iboffload_get_wait_task(iboffload, leader_rank, 1,
preposted_recv_frag, coll_request->qp_index, NULL);
if(NULL == wait_task) {
IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n"));
goto out_of_resources;
}
APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
/* end of list */
*mqe_ptr_to_set = NULL;
last_wait->flags |= MQE_WR_FLAG_SIGNAL;
coll_fragment->signal_task_wr_id = last_wait->wr_id;
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
/* post the mwr */
rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
if(OMPI_SUCCESS != rc) {
IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
/* Note: need to clean up */
return rc;
}
MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
return OMPI_SUCCESS;
out_of_resources:
/* Release all resources */
IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
static int mca_bcol_iboffload_fanout_init(
bcol_function_args_t *input_args,
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t **coll_request)
{
ompi_free_list_item_t *item = NULL;
mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_init"));
OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
if(NULL == item) {
IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n"));
return OMPI_ERR_OUT_OF_RESOURCE;
}
(*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
(*coll_request)->progress_fn = iboffload->fanout_algth;
(*coll_request)->completion_cb_fn = NULL;
(*coll_request)->order_info = &input_args->order_info;
(*coll_request)->module = iboffload;
(*coll_request)->ml_buffer_index = input_args->buffer_index;
(*coll_request)->buffer_info[SBUF].offset = 0;
(*coll_request)->buffer_info[RBUF].offset = 0;
(*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;
/* finish initializing full message descriptor */
(*coll_request)->n_fragments = 1;
(*coll_request)->n_frags_sent = 1;
(*coll_request)->n_frag_mpi_complete = 0;
(*coll_request)->n_frag_net_complete = 0;
(*coll_request)->user_handle_freed = false;
input_args->bcol_opaque_data = (void *) (*coll_request);
/*
* setup collective work request
*/
/* get collective frag */
coll_fragment = &(*coll_request)->first_collfrag;
mca_bcol_iboffload_collfrag_init(coll_fragment);
coll_fragment->alg = FANOUT_ALG;
coll_fragment->mq_index = COLL_MQ;
/* Set mq credits */
coll_fragment->mq_credits = iboffload->alg_task_consump[FANOUT_ALG];
/* set pointers for (coll frag) <-> (coll full request) */
MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);
return OMPI_SUCCESS;
}
/************************************************************************
************************ New style Fan-Out *****************************
***********************************************************************/
static int mca_bcol_iboffload_new_style_fanout_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_iboffload_collreq_t *coll_request =
(mca_bcol_iboffload_collreq_t *)
input_args->bcol_opaque_data;
if (BCOL_IS_COMPLETED(coll_request)) {
coll_request->user_handle_freed = true;
if (COLLREQ_IS_DONE(coll_request)) {
IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
RELEASE_COLLREQ(coll_request);
}
IBOFFLOAD_VERBOSE(10, ("Fan-Out already done.\n"));
return BCOL_FN_COMPLETE;
}
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_new_style_fanout_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int i = 0, leader_rank = 0, /* We always assume the lowest index is the leader */
my_rank = iboffload->ibnet->super.my_index,
sbgp_size = iboffload->ibnet->super.group_size;
mca_bcol_iboffload_endpoint_t *ep = NULL;
mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc;
assert(NULL != my_ibnet_proc);
if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) {
iboffload->fanout_algth = mca_bcol_iboffload_fanout_leader_progress;
iboffload->alg_task_consump[FANOUT_ALG] += sbgp_size;
for (i = leader_rank + 1; i < sbgp_size; ++i) {
ep = iboffload->endpoints[i];
while (OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
} else {
iboffload->fanout_algth = mca_bcol_iboffload_fanout_proxy_progress;
iboffload->alg_task_consump[FANOUT_ALG] += 1;
ep = iboffload->endpoints[leader_rank];
while(OMPI_SUCCESS !=
check_endpoint_state(ep, NULL, NULL)) {
opal_progress();
}
}
return iboffload->fanout_algth(iboffload, coll_request);
}
static int mca_bcol_iboffload_new_style_fanout_intra(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
int rc = OMPI_SUCCESS;
struct mca_bcol_iboffload_collreq_t *coll_request = NULL;
mca_bcol_iboffload_module_t *iboffload =
(mca_bcol_iboffload_module_t *) const_args->bcol_module;
assert(NULL != iboffload);
MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);
/* Init Fan-Out collective request */
rc = mca_bcol_iboffload_fanout_init(input_args, iboffload, &coll_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n"));
return BCOL_FN_NOT_STARTED;
}
rc = iboffload->fanout_algth(iboffload, coll_request);
if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
return BCOL_FN_NOT_STARTED;
}
return BCOL_FN_STARTED;
}
int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n"));
comm_attribs.bcoll_type = BCOL_FANOUT;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super,
&comm_attribs, &inv_attribs,
mca_bcol_iboffload_new_style_fanout_intra,
mca_bcol_iboffload_new_style_fanout_progress);
return OMPI_SUCCESS;
}

View file

@ -1,272 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/include/opal/types.h"
#include "opal/datatype/opal_convertor.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"
static void frag_constructor(mca_bcol_iboffload_frag_t *frag)
{
mca_bcol_iboffload_reg_t* reg =
(mca_bcol_iboffload_reg_t*) frag->super.registration;
memset(&frag->sg_entry, 0, sizeof(struct ibv_sge));
frag->sg_entry.addr = (uint64_t) (uintptr_t) frag->super.ptr;
frag->registration = reg;
if (NULL != reg) {
frag->sg_entry.lkey = reg->mr->lkey;
}
frag->next = NULL;
frag->type = MCA_BCOL_IBOFFLOAD_NONE_OWNER;
frag->ref_counter = 0;
frag->qp_index = -1;
}
OBJ_CLASS_INSTANCE(
mca_bcol_iboffload_frag_t,
ompi_free_list_item_t,
frag_constructor,
NULL);
static mca_bcol_iboffload_frag_t*
mca_bcol_iboffload_get_ml_frag_calc(mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collreq_t *coll_request,
size_t len, size_t src_offset)
{
int rc;
mca_bcol_iboffload_frag_t *fragment;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
uint64_t sbuff = (uint64_t) (uintptr_t) coll_request->buffer_info[SBUF].buf +
src_offset;
/* The buffer was allocated on ML level,
no need to allocate local buffer */
rc = pack_data_for_calc(iboffload->device->dev.ib_dev_context,
cm->map_ompi_to_ib_calcs[coll_request->op->op_type],
cm->map_ompi_to_ib_dt[coll_request->dtype->id],
false /* host order */,
(void *) sbuff, 0,
&coll_request->actual_ib_op,
&coll_request->actual_ib_dtype,
(void *) sbuff);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_VERBOSE(10, ("pack_data_for_calc failed, op: %s, type: %s\n",
coll_request->op->o_name, coll_request->dtype->name));
return NULL;
}
fragment = mca_bcol_iboffload_get_ml_frag(
iboffload, coll_request->qp_index, len,
coll_request->buffer_info[SBUF].lkey,
sbuff);
return fragment;
}
static mca_bcol_iboffload_frag_t *
mca_bcol_iboffload_get_packed_frag(mca_bcol_iboffload_module_t *iboffload,
uint32_t destination, int qp_index, size_t len,
struct opal_convertor_t *convertor)
{
/* local variables */
int rc;
uint32_t out_size;
size_t max_size = 0;
struct iovec payload_iovec;
ompi_free_list_item_t *item;
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_device_t *device = iboffload->device;
/* Get frag from free list */
OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
if (OPAL_UNLIKELY(NULL == item)) {
return NULL;
}
frag = (mca_bcol_iboffload_frag_t *) item;
/* Pack data into the buffer */
out_size = 1;
payload_iovec.iov_len = len;
payload_iovec.iov_base = (void *) (uintptr_t) frag->sg_entry.addr;
rc = opal_convertor_pack(convertor, &(payload_iovec),
&out_size, &max_size);
if (OPAL_UNLIKELY(rc < 0)) {
/* Error: put the fragment back */
OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], item);
return NULL;
}
return frag;
}
static mca_bcol_iboffload_frag_t *
mca_bcol_iboffload_get_calc_frag(mca_bcol_iboffload_module_t *iboffload, int qp_index,
struct mca_bcol_iboffload_collreq_t *coll_request)
{
int rc;
ompi_free_list_item_t *item;
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_device_t *device = iboffload->device;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
IBOFFLOAD_VERBOSE(10, ("Start to pack frag.\n"));
/* Get frag from free list */
OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
if (OPAL_UNLIKELY(NULL == item)) {
return NULL;
}
frag = (mca_bcol_iboffload_frag_t *) item;
/* Pack data into the buffer */
rc = pack_data_for_calc(device->dev.ib_dev_context,
cm->map_ompi_to_ib_calcs[coll_request->op->op_type],
cm->map_ompi_to_ib_dt[coll_request->dtype->id], false,
coll_request->buffer_info[SBUF].buf, 0,
&coll_request->actual_ib_op,
&coll_request->actual_ib_dtype,
(void *) (uintptr_t) frag->sg_entry.addr);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_ERROR(("pack_data_for_calc failed, op: %s, type: %s\n",
coll_request->op->o_name, coll_request->dtype->name));
return NULL;
}
return frag;
}
mca_bcol_iboffload_frag_t*
mca_bcol_iboffload_get_send_frag(mca_bcol_iboffload_collreq_t *coll_request,
uint32_t destination, int qp_index, size_t len,
size_t src_offset, int buf_index, int send_frag_type)
{
/* local variables */
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_module_t *iboffload = coll_request->module;
mca_bcol_iboffload_endpoint_t *endpoint =
iboffload->endpoints[destination];
IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_get_send_frag qp_index %d",
qp_index));
if ((endpoint->qps[qp_index].sd_wqe) <= 0) {
IBOFFLOAD_VERBOSE(10, ("No send wqe %d",
endpoint->qps[qp_index].sd_wqe));
return NULL;
}
--endpoint->qps[qp_index].sd_wqe;
IBOFFLOAD_VERBOSE(10, ("Endpoint %p: qp_index %d, destination %d, sd_wqe %d",
endpoint, qp_index, destination, endpoint->qps[qp_index].sd_wqe));
switch (send_frag_type) {
case MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY:
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY"));
assert(NULL != &iboffload->device->dummy_frags[qp_index]);
return &iboffload->device->dummy_frags[qp_index];
case MCA_BCOL_IBOFFLOAD_SEND_FRAG:
{
ompi_free_list_item_t *item;
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG"));
/* Get frag from free list */
OMPI_FREE_LIST_GET_MT(&iboffload->device->frags_free[qp_index], item);
frag = (mca_bcol_iboffload_frag_t *) item;
}
break;
case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT:
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT"));
frag = mca_bcol_iboffload_get_packed_frag(iboffload, destination,
qp_index, len, &coll_request->send_convertor);
break;
case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC:
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC"));
frag = mca_bcol_iboffload_get_calc_frag(iboffload, qp_index, coll_request);
break;
case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML:
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML"));
frag = mca_bcol_iboffload_get_ml_frag(
iboffload, qp_index, len, coll_request->buffer_info[buf_index].lkey,
(uint64_t)(uintptr_t) coll_request->buffer_info[buf_index].buf + src_offset);
break;
case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC:
frag = mca_bcol_iboffload_get_ml_frag_calc(iboffload, coll_request, len, src_offset);
IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC"));
break;
default:
IBOFFLOAD_VERBOSE(10, ("Getting default"));
frag = NULL;
IBOFFLOAD_ERROR(("Unknown send frag type %d for QP index %d",
send_frag_type, qp_index));
}
if (OPAL_UNLIKELY(NULL == frag)) {
IBOFFLOAD_VERBOSE(10, ("Getting NULL"));
return NULL;
}
frag->sg_entry.length = len;
frag->next = NULL;
return frag;
}
void
mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx)
{
int qp_index = *(int *) ctx;
mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item;
frag->qp_index = qp_index;
frag->type = MCA_BCOL_IBOFFLOAD_BCOL_OWNER;
}
void
mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx)
{
mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item;
frag->qp_index = -1;
frag->type = MCA_BCOL_IBOFFLOAD_ML_OWNER;
}

View file

@ -1,154 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_FRAG_H
#define MCA_BCOL_IBOFFLOAD_FRAG_H
#include "ompi_config.h"
#include <infiniband/verbs.h>
#include "opal/datatype/opal_convertor.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/class/ompi_free_list.h"
#include "bcol_iboffload.h"
BEGIN_C_DECLS
/* forward declarations */
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_reg_t {
mca_mpool_base_registration_t base;
struct ibv_mr *mr;
};
typedef struct mca_bcol_iboffload_reg_t mca_bcol_iboffload_reg_t;
typedef enum {
MCA_BCOL_IBOFFLOAD_NONE_OWNER = -1,
MCA_BCOL_IBOFFLOAD_DUMMY_OWNER,
MCA_BCOL_IBOFFLOAD_BCOL_OWNER,
MCA_BCOL_IBOFFLOAD_ML_OWNER
} frag_type;
typedef enum {
MCA_BCOL_IBOFFLOAD_SEND_FRAG,
MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML,
MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC,
MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT,
MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC,
MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY
} send_frag_type;
struct mca_bcol_iboffload_frag_t {
ompi_free_list_item_t super;
struct mca_bcol_iboffload_frag_t *next;
struct mca_bcol_iboffload_reg_t *registration;
struct ibv_sge sg_entry;
frag_type type;
int ref_counter;
int qp_index;
};
typedef struct mca_bcol_iboffload_frag_t mca_bcol_iboffload_frag_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_frag_t);
/* The same fragment may be shared by multiple tasks.
* In order to manage the allocation and release flow correctly,
* each fragment carries a reference counter; the following wrapper
* macros hide that counter. */
#define IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(fragment, task) \
do { \
++((fragment)->ref_counter); \
(task)->frag = (fragment); \
} while(0)
#define IBOFFLOAD_SET_FRAGS_ON_TASK(fragment, task) \
do { \
struct mca_bcol_iboffload_frag_t *temp_frag = fragment; \
while (NULL != temp_frag) { \
++(temp_frag->ref_counter); \
temp_frag = temp_frag->next; \
} \
(task)->frag = fragment; \
} while(0)
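/*
* A minimal usage sketch (the 'frag', 'frag_list_head' and 'task' variables
* below are assumed to have been taken from their free lists elsewhere):
*
*   IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
*
* attaches a single fragment to a task, while
*
*   IBOFFLOAD_SET_FRAGS_ON_TASK(frag_list_head, task);
*
* walks a chain of fragments linked through frag->next. In both cases every
* referenced fragment has its ref_counter incremented, so it can only be
* released once the last task that references it has completed.
*/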
/* function declarations */
mca_bcol_iboffload_frag_t *
mca_bcol_iboffload_get_send_frag(struct mca_bcol_iboffload_collreq_t *coll_request,
uint32_t destination, int qp_index, size_t len,
size_t src_offset, int buff_index, int send_frag_type);
void
mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx);
void
mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx);
static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_empty_frag(
mca_bcol_iboffload_module_t *iboffload,
int qp_index)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_frag_t *frag;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
/* Get frag from free list */
OMPI_FREE_LIST_GET_MT(&cm->ml_frags_free, item);
if (OPAL_UNLIKELY(NULL == item)) {
return NULL;
}
frag = (mca_bcol_iboffload_frag_t *) item;
frag->qp_index = qp_index;
frag->next = NULL;
return frag;
}
static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_frag(
mca_bcol_iboffload_module_t *iboffload,
int qp_index, size_t len, uint32_t lkey, uint64_t addr)
{
/* local variables */
mca_bcol_iboffload_frag_t *frag;
IBOFFLOAD_VERBOSE(10, ("Call for get ML frag - addr 0x%x", addr));
frag = mca_bcol_iboffload_get_ml_empty_frag(iboffload, qp_index);
frag->sg_entry.addr = addr;
frag->sg_entry.lkey = lkey;
frag->sg_entry.length = len;
IBOFFLOAD_VERBOSE(10, ("Setting ml frag lkey %u, "
"addr %p, qp_index %d, send value - %lf",
frag->sg_entry.lkey, frag->sg_entry.addr,
qp_index, *(double *) frag->sg_entry.addr));
return frag;
}
END_C_DECLS
#endif

View file

@ -1,451 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_mca.h"
#include "ompi/constants.h"
#include "ompi/mca/common/ofacm/base.h"
#include "ompi/communicator/communicator.h"
#include "opal/util/show_help.h"
/*
* Local flags
*/
enum {
REGINT_NEG_ONE_OK = 0x01,
REGINT_GE_ZERO = 0x02,
REGINT_GE_ONE = 0x04,
REGINT_NONZERO = 0x08,
REGINT_MAX = 0x88
};
enum {
REGSTR_EMPTY_OK = 0x01,
REGSTR_MAX = 0x88
};
mca_base_var_enum_value_t mtu_values[] = {
{IBV_MTU_256, "256B"},
{IBV_MTU_512, "512B"},
{IBV_MTU_1024, "1k"},
{IBV_MTU_2048, "2k"},
{IBV_MTU_4096, "4k"},
{0, NULL}
};
/*
* utility routine for string parameter registration
*/
static int reg_string(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
const char* default_value, char **storage,
int flags)
{
int index;
/* the MCA variable system will not attempt to modify this value */
*storage = (char *) default_value;
index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
if (0 != (flags & REGSTR_EMPTY_OK) && 0 == strlen(*storage)) {
opal_output(0, "Bad parameter value for parameter \"%s\"",
param_name);
return OMPI_ERR_BAD_PARAM;
}
return OMPI_SUCCESS;
}
/*
* utility routine for integer parameter registration
*/
static int reg_int(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
int default_value, int *storage, int flags)
{
int index;
*storage = default_value;
index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
return OMPI_SUCCESS;
}
if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
(0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
(0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
opal_output(0, "Bad parameter value for parameter \"%s\"",
param_name);
return OMPI_ERR_BAD_PARAM;
}
return OMPI_SUCCESS;
}
/*
* utility routine for integer parameter registration
*/
static int reg_bool(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
bool default_value, bool *storage)
{
int index;
*storage = default_value;
index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OMPI_SUCCESS;
}
int mca_bcol_iboffload_verify_params(void)
{
if (mca_bcol_iboffload_component.min_rnr_timer > 31) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_min_rnr_timer > 31",
"bcol_iboffload_ib_min_rnr_timer reset to 31");
mca_bcol_iboffload_component.min_rnr_timer = 31;
} else if (mca_bcol_iboffload_component.min_rnr_timer < 0){
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_min_rnr_timer < 0",
"bcol_iboffload_ib_min_rnr_timer reset to 0");
mca_bcol_iboffload_component.min_rnr_timer = 0;
}
if (mca_bcol_iboffload_component.timeout > 31) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_timeout > 31",
"bcol_iboffload_ib_timeout reset to 31");
mca_bcol_iboffload_component.timeout = 31;
} else if (mca_bcol_iboffload_component.timeout < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_timeout < 0",
"bcol_iboffload_ib_timeout reset to 0");
mca_bcol_iboffload_component.timeout = 0;
}
if (mca_bcol_iboffload_component.retry_count > 7) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_retry_count > 7",
"bcol_iboffload_ib_retry_count reset to 7");
mca_bcol_iboffload_component.retry_count = 7;
} else if (mca_bcol_iboffload_component.retry_count < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_retry_count < 0",
"bcol_iboffload_ib_retry_count reset to 0");
mca_bcol_iboffload_component.retry_count = 0;
}
if (mca_bcol_iboffload_component.max_rdma_dst_ops > 7) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_rnr_retry > 7",
"bcol_iboffload_ib_rnr_retry reset to 7");
mca_bcol_iboffload_component.max_rdma_dst_ops = 7;
} else if (mca_bcol_iboffload_component.max_rdma_dst_ops < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_rnr_retry < 0",
"bcol_iboffload_ib_rnr_retry reset to 0");
mca_bcol_iboffload_component.max_rdma_dst_ops = 0;
}
if (mca_bcol_iboffload_component.service_level > 15) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_service_level > 15",
"bcol_iboffload_ib_service_level reset to 15");
mca_bcol_iboffload_component.service_level = 15;
} else if (mca_bcol_iboffload_component.service_level < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "bcol_iboffload_ib_service_level < 0",
"bcol_iboffload_ib_service_level reset to 0");
mca_bcol_iboffload_component.service_level = 0;
}
if(mca_bcol_iboffload_component.buffer_alignment <= 1 ||
(mca_bcol_iboffload_component.buffer_alignment & (mca_bcol_iboffload_component.buffer_alignment - 1))) {
opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
true, mca_bcol_iboffload_component.buffer_alignment, ompi_process_info.nodename, 64);
mca_bcol_iboffload_component.buffer_alignment = 64;
}
return OMPI_SUCCESS;
}
int mca_bcol_iboffload_register_params(void)
{
mca_base_var_enum_t *new_enum;
char *msg;
int ret = OMPI_SUCCESS, tmp;
#define CHECK(expr) do { \
tmp = (expr); \
if (OMPI_SUCCESS != tmp) ret = tmp; \
} while (0)
/* register openib component parameters */
CHECK(reg_int("k_nomial_radix", NULL,
"The radix of the K-nomial tree for scatther-gather type algorithms"
"(starts from 2)", 2, &mca_bcol_iboffload_component.k_nomial_radix,
REGINT_GE_ONE));
CHECK(reg_int("priority", NULL,
"IB offload component priority"
"(from 0(low) to 90 (high))", 90,
&mca_bcol_iboffload_component.super.priority, 0));
CHECK(reg_int("verbose", NULL,
"Output some verbose IB offload BTL information "
"(0 = no output, nonzero = output)", 0,
&mca_bcol_iboffload_component.verbose, 0));
CHECK(reg_bool("warn_default_gid_prefix", NULL,
"Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured (0 = do not warn; any other value = warn)",
true, &mca_bcol_iboffload_component.warn_default_gid_prefix));
CHECK(reg_bool("warn_nonexistent_if", NULL,
"Warn if non-existent devices and/or ports are specified in the bcol_iboffla_if_[in|ex]clude MCA parameters (0 = do not warn; any other value = warn)",
true, &mca_bcol_iboffload_component.warn_nonexistent_if));
CHECK(reg_int("max_pipeline_depth", NULL,
"The maximal number of fragments of the same collective request that can be transferred in parallel", 3,
(int *) &mca_bcol_iboffload_component.max_pipeline_depth, 0));
CHECK(reg_int("max_mqe_tasks", NULL,
"Maximum number of MQEs for each iboffload module",
1024, &mca_bcol_iboffload_component.max_mqe_tasks, 0));
CHECK(reg_int("max_mq_size", NULL,
"Maximum size of each MQ for each iboffload module",
1024, &mca_bcol_iboffload_component.max_mq_size, 0));
CHECK(reg_int("free_list_num", NULL,
"Intial size of free lists (must be >= 1)",
256, &mca_bcol_iboffload_component.free_list_num,
REGINT_GE_ONE));
CHECK(reg_int("free_list_max", NULL,
"Maximum size of free lists "
"(-1 = infinite, otherwise must be >= 0)",
-1, &mca_bcol_iboffload_component.free_list_max,
REGINT_NEG_ONE_OK | REGINT_GE_ONE));
CHECK(reg_int("free_list_inc", NULL,
"Increment size of free lists (must be >= 1)",
32, &mca_bcol_iboffload_component.free_list_inc,
REGINT_GE_ONE));
/* rdma mpool no longer exists - must use the grdma mpool component, should resolve errors in
* mtt testing
*/
/*
CHECK(reg_string("mpool", NULL,
"Name of the memory pool to be used (it is unlikely that you will ever want to change this",
"rdma", &mca_bcol_iboffload_component.mpool_name,
0));
*/
CHECK(reg_string("mpool", NULL,
"Name of the memory pool to be used (it is unlikely that you will ever want to change this",
"grdma", &mca_bcol_iboffload_component.mpool_name,
0));
CHECK(reg_int("cq_size", "cq_size",
"Size of the OpenFabrics completion "
"queue (will automatically be set to a minimum of "
"(2 * number_of_peers * bcol_iboffload_rd_num))",
1024, &mca_bcol_iboffload_component.cq_size, REGINT_GE_ONE));
CHECK(reg_int("exchange_tree_order", NULL,
"The order of the exchange tree. "
"Must be power of two.",
2, &mca_bcol_iboffload_component.exchange_tree_order, REGINT_GE_ONE));
CHECK(reg_int("knomial_tree_order", NULL,
"The order of the knomial exchange tree. ",
3, &mca_bcol_iboffload_component.knomial_tree_order, REGINT_GE_ONE));
CHECK(reg_int("max_inline_data", "max_inline_data",
"Maximum size of inline data segment "
"(-1 = run-time probe to discover max value, "
"otherwise must be >= 0). "
"If not explicitly set, use max_inline_data from "
"the INI file containing device-specific parameters",
128, (int *) &mca_bcol_iboffload_component.max_inline_data,
REGINT_NEG_ONE_OK | REGINT_GE_ZERO));
#if 0
CHECK(reg_string("pkey", "ib_pkey_val",
"OpenFabrics partition key (pkey) value. "
"Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB paritition key value (0x7fff)",
"0", &pkey, 0));
/* Pasha
mca_bcol_iboffload_component.pkey_val =
ompi_btl_openib_ini_intify(pkey) & MCA_BTL_IB_PKEY_MASK;
free(pkey);
*/
#endif
CHECK(reg_string("receive_queues", NULL,
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
"P,512,256,192,128", &mca_bcol_iboffload_component.receive_queues,
0));
CHECK(reg_int("qp_ous_rd_atom", NULL,
"InfiniBand outstanding atomic reads (must be >= 0)", 4,
(int *) &mca_bcol_iboffload_component.qp_ous_rd_atom, REGINT_GE_ZERO));
asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes",
IBV_MTU_256,
IBV_MTU_512,
IBV_MTU_1024,
IBV_MTU_2048,
IBV_MTU_4096);
if (NULL == msg) {
/* Don't try to recover from this */
return OMPI_ERR_OUT_OF_RESOURCE;
}
CHECK(mca_base_var_enum_create("infiniband mtu", mtu_values, &new_enum));
mca_bcol_iboffload_component.mtu = IBV_MTU_1024;
tmp = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
"mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_bcol_iboffload_component.mtu);
OBJ_RELEASE(new_enum);
free(msg);
if (0 > tmp) ret = tmp;
tmp = mca_base_var_register_synonym(tmp, "ompi", "bcol", "iboffload", "ib_mtu",
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
if (0 > tmp) ret = tmp;
CHECK(reg_int("ib_min_rnr_timer", NULL, "InfiniBand minimum "
"\"receiver not ready\" timer, in seconds "
"(must be >= 0 and <= 31)",
1 , &mca_bcol_iboffload_component.min_rnr_timer, 0));
CHECK(reg_int("ib_timeout", NULL, "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * "
"(2^bcol_iboffload_ib_timeout) (must be >= 0 and <= 31)",
20, &mca_bcol_iboffload_component.timeout, 0));
CHECK(reg_int("ib_retry_count", NULL, "InfiniBand transmit retry count "
"(must be >= 0 and <= 7)",
7, &mca_bcol_iboffload_component.retry_count, 0));
CHECK(reg_int("ib_rnr_retry", NULL, "InfiniBand \"receiver not ready\" "
"retry count; applies *only* to SRQ/XRC queues. PP queues "
"use RNR retry values of 0 because Open MPI performs "
"software flow control to guarantee that RNRs never occur "
"(must be >= 0 and <= 7; 7 = \"infinite\")",
7, &mca_bcol_iboffload_component.rnr_retry, 0));
CHECK(reg_int("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA "
"destination operations "
"(must be >= 0)",
4, &mca_bcol_iboffload_component.max_rdma_dst_ops, REGINT_GE_ZERO));
CHECK(reg_int("ib_service_level", NULL, "InfiniBand service level "
"(must be >= 0 and <= 15)",
0, &mca_bcol_iboffload_component.service_level, 0));
CHECK(reg_int("buffer_alignment", NULL,
"Prefered communication buffer alignment, in bytes "
"(must be > 0 and power of two)",
64, &mca_bcol_iboffload_component.buffer_alignment, REGINT_GE_ZERO));
/* register parameters controlling message fragmentation */
CHECK(reg_int("min_frag_size", NULL,
"Minimum fragment size",
getpagesize(), &mca_bcol_iboffload_component.super.min_frag_size,
REGINT_GE_ONE));
CHECK(reg_int("max_frag_size", NULL,
"Maximum fragment size",
FRAG_SIZE_NO_LIMIT, &mca_bcol_iboffload_component.super.max_frag_size,
REGINT_NONZERO));
CHECK(reg_bool("can_use_user_buffers", NULL,
"User memory can be used by the collective algorithms",
true, &mca_bcol_iboffload_component.super.can_use_user_buffers));
CHECK(reg_int("barrier_mode", NULL,
"Barrier mode: 0 - Recursive doubling; 1 - Recursive K-ing",
0, &mca_bcol_iboffload_component.barrier_mode, REGINT_GE_ZERO));
CHECK(reg_int("max_progress_pull", NULL,
"Max number of progress pull checks",
8, &mca_bcol_iboffload_component.max_progress_pull, REGINT_GE_ZERO));
CHECK(reg_int("use_brucks_smsg_alltoall_rdma", NULL,
"Use brucks algorithm for smsg alltoall and RDMA semantics 1 = No Temp buffer recycling"
"1 = Alg with no Temp Buffer Recycling (faster), 2 = Alg with temp Buffer Recycling (slower)",
0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_rdma, 0));
CHECK(reg_int("use_brucks_smsg_alltoall_sr", NULL,
"Use brucks algorithm for smsg alltoall and Send/Recv semantics "
"1 = Alg with RTR (faster), 2 = Alg with RNR (slower)",
0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_sr, 0));
CHECK(reg_int("alltoall_bruck_radix", NULL,
"Radix for Bruck algorithm for smsg alltoall",
3, &mca_bcol_iboffload_component.k_alltoall_bruck_radix, 0));
CHECK(reg_int("k_alltoall_bruck_radix", NULL,
"Temp Buffer alignment for Bruck algorithm for smsg alltoall",
64, &mca_bcol_iboffload_component.tmp_buf_alignment, 0));
/*
CHECK(reg_string("if_include", NULL,
"Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with bcol_iboffload_if_exclude.",
NULL, &mca_bcol_iboffload_component.if_include,
0));
CHECK(reg_string("if_exclude", NULL,
"Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with bcol_iboffload_if_include.",
NULL, &mca_bcol_iboffload_component.if_exclude,
0));
*/
CHECK(mca_bcol_iboffload_verify_params());
/* Register any MCA params for the connect pseudo-components */
if (OMPI_SUCCESS == ret) {
ret = ompi_common_ofacm_base_register(&mca_bcol_iboffload_component.super.bcol_version);
}
return ret;
}

View file

@ -1,20 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#ifndef MCA_BCOL_IBOFFLOAD_MCA_H
#define MCA_BCOL_IBOFFLOAD_MCA_H
#include "ompi_config.h"
int mca_bcol_iboffload_register_params(void);
int mca_bcol_iboffload_verify_params(void);
#endif

The diff for this file is not shown because of its large size.

View file

@ -1,452 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_qp_info.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"
static int mca_bcol_iboffload_dummy_frag_qp_prepost(
mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, int num_to_prepost)
{
struct ibv_recv_wr *recv_wr, *recv_bad;
int ret, num_preposted = 0, start_wr_index;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
(void *) endpoint, num_to_prepost));
if (OPAL_UNLIKELY(0 == num_to_prepost)) {
IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate"));
return OMPI_SUCCESS;
}
/* make sure that we do not overrun number of rd_wqe */
if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
num_to_prepost, endpoint->qps[qp_index].rd_wqe));
num_to_prepost = endpoint->qps[qp_index].rd_wqe;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, "
"start index of WRs - %d, rd_wqe - %d",
(void *) endpoint, qp_index, num_to_prepost,
start_wr_index, endpoint->qps[qp_index].rd_wqe));
while (num_preposted < num_to_prepost) {
/* prepost the special barrier frag to recv queue */
struct ibv_sge *dummy_sg_entry =
&endpoint->iboffload_module->device->dummy_frags[qp_index].sg_entry;
recv_wr[num_preposted].sg_list = dummy_sg_entry;
++num_preposted;
}
if (OPAL_LIKELY(num_preposted > 0)) {
/* Set the tail */
recv_wr[num_preposted - 1].next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed, error: %s [%d], "
"qp_index - %d.\n", strerror(errno), ret, qp_index));
return OMPI_ERROR;
}
/* recover last recv_wr if needed */
if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
}
/* decreasing the number of free recv WQEs */
endpoint->qps[qp_index].rd_wqe -= num_preposted;
}
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d, qp_index - %d",
(void *) endpoint, num_to_prepost, num_preposted, qp_index));
return OMPI_SUCCESS;
}
/*
* Receive prepost for regular fragments:
* returns OMPI_SUCCESS after preposting as many receives as the available
* credits and free fragments allow (possibly none), or OMPI_ERROR on a
* fatal ibv_post_recv failure.
*/
static int mca_bcol_iboffload_frag_reg_qp_prepost(
mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, int num_to_prepost)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_frag_t *frag;
struct ibv_recv_wr *recv_wr, *recv_bad;
int i, ret, num_preposted = 0, start_wr_index;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;
opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags);
mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
(void *) endpoint, num_to_prepost));
if (OPAL_UNLIKELY(0 == num_to_prepost)) {
IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate"));
return OMPI_SUCCESS;
}
/* make sure that we do not overrun number of rd_wqe */
if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
num_to_prepost, endpoint->qps[qp_index].rd_wqe));
num_to_prepost = endpoint->qps[qp_index].rd_wqe;
}
OPAL_THREAD_LOCK(&recv_wrs->lock);
/* calculate start index in array
* of pre-allocated work requests */
start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, "
"start index of WRs - %d, rd_wqe - %d",
(void *) endpoint, qp_index, num_to_prepost,
start_wr_index, endpoint->qps[qp_index].rd_wqe));
while (num_preposted < num_to_prepost) {
/* put the item on list of preposted */
OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
if (OPAL_UNLIKELY(NULL == item)) {
break;
}
frag = (mca_bcol_iboffload_frag_t *) item;
opal_list_append(preposted, (opal_list_item_t *) item);
recv_wr[num_preposted].sg_list = &frag->sg_entry;
/* TODO (Pasha): fix later */ /* Vasily: is this the right place to take the size value? */
frag->sg_entry.length = cm->qp_infos[qp_index].size;
++num_preposted;
}
if (OPAL_LIKELY(num_preposted > 0)) {
/* Set the tail */
recv_wr[num_preposted - 1].next = NULL;
/* post the list of recvs */
ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
if (OPAL_UNLIKELY(0 != ret)) {
IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
"qp_index - %d.\n",
ibv_get_device_name(device->dev.ib_dev),
strerror(errno), ret, qp_index));
/* Return allocated frags */
for (i = 0; i < num_preposted; i++) {
OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index],
(ompi_free_list_item_t *)
opal_list_remove_last(preposted));
}
return OMPI_ERROR;
}
/* recover last recv_wr if needed */
if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
}
/* decreasing the number of free recv WQEs */
endpoint->qps[qp_index].rd_wqe -= num_preposted;
}
OPAL_THREAD_UNLOCK(&recv_wrs->lock);
IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d",
(void *) endpoint, num_to_prepost, num_preposted));
return OMPI_SUCCESS;
}
static void mca_bcol_iboffload_fillin_qp_attr(int qp_index,
mca_bcol_iboffload_endpoint_t *ep,
ompi_common_ofacm_base_qp_config_t *qp_config)
{
uint32_t max_sge, *init_attr_mask =
&qp_config->init_attr_mask[qp_index];
struct ibv_qp_attr *attr = &qp_config->attr[qp_index];
struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
/* Set special init attributes mask */
*init_attr_mask = IBV_M_QP_EXT_CLASS_1 |
IBV_M_QP_EXT_CLASS_2 |
IBV_M_QP_EXT_IGNORE_RQ_OVERFLOW;
/* Set init attributes */
init_attr->qp_type = IBV_QPT_RC;
/* Vasily: ??????
init_attr->cap.max_inline_data =
max_inline_size(qp, iboffload_module->device);
*/
/* Pasha: we can not leave max_inline empty !
Todo: copy max_inline_size() from ofacm to
common area.
*/
init_attr->cap.max_inline_data = (int32_t) cm->max_inline_data;
/* We allocate SG list for some algorithms (Bruck's alltoall) */
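/* i.e. max_sge = ceil(group_size / 2) */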
max_sge = ep->iboffload_module->group_size / 2 +
ep->iboffload_module->group_size % 2;
/* max send sge should be less than device maximums */
if (max_sge > (uint32_t)
ep->iboffload_module->device->ib_dev_attr.max_sge) {
max_sge = (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge;
}
init_attr->cap.max_send_sge = max_sge;
init_attr->cap.max_recv_sge = max_sge;
/* Vasily: the value will be changed later */
/* TODO Pasha: this is real crap */
init_attr->cap.max_recv_wr = (uint32_t) cm->cq_size;
init_attr->cap.max_send_wr = (uint32_t) cm->cq_size;
/* Set attributes */
/* attr->pkey_index = 0; */ /* Vasily: ????? */
attr->port_num = ep->iboffload_module->port;
/* Vasily: the value will be changed later */
attr->path_mtu = (uint32_t)cm->mtu;
attr->max_dest_rd_atomic = cm->max_rdma_dst_ops;
attr->min_rnr_timer = (uint32_t)cm->min_rnr_timer;
attr->ah_attr.is_global = 0;
attr->ah_attr.sl = (uint32_t)cm->service_level;
/* Vasily: from struct mca_bcol_iboffload_port_t ????? */
/*
attr->ah_attr.src_path_bits = iboffload_module->src_path_bits;
*/
attr->ah_attr.port_num = ep->iboffload_module->port;
/* JMS to be filled in later dynamically */
attr->ah_attr.static_rate = 0;
/* RTS params */
attr->timeout = (uint32_t)cm->timeout;
attr->retry_cnt = (uint32_t)cm->retry_count;
attr->rnr_retry = (uint32_t)cm->rnr_retry;
attr->max_rd_atomic = (uint32_t)cm->max_rdma_dst_ops;
/* Init for local mca_bcol_iboffload_endpoint_qp_t qps structure
* that caches the qp information on endpoint */
OBJ_CONSTRUCT(&ep->qps[qp_index].preposted_frags, opal_list_t);
/* Pasha: Need to add function that will */
ep->qps[qp_index].ib_inline_max = cm->max_inline_data;
/* TODO Pasha - this is crap too... we do not have info for service qps. Fix it later */
ep->qps[qp_index].sd_wqe = cm->qp_infos[qp_index].rd_num;
ep->qps[qp_index].rd_wqe = cm->qp_infos[qp_index].rd_num;
IBOFFLOAD_VERBOSE(10, ("ep - %p, qp index - %d, num of rd_wqe - %d.",
ep, qp_index, ep->qps[qp_index].rd_wqe));
}
static int mca_bcol_iboffload_alloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device)
{
int length;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
ompi_free_list_t *frags_free = &device->frags_free[qp_index];
OBJ_CONSTRUCT(frags_free, ompi_free_list_t);
length = cm->qp_infos[qp_index].size;
IBOFFLOAD_VERBOSE(10, ("free list len %d\n", length));
if (OMPI_SUCCESS != ompi_free_list_init_ex_new(frags_free,
sizeof(mca_bcol_iboffload_frag_t), MCA_IBOFFLOAD_CACHE_LINE_SIZE,
OBJ_CLASS(mca_bcol_iboffload_frag_t),
length, cm->buffer_alignment,
cm->free_list_num,
cm->free_list_max,
cm->free_list_inc,
device->mpool,
mca_bcol_iboffload_frag_init,
(void *) &cm->qp_infos[qp_index].qp_index)) {
IBOFFLOAD_ERROR(("Failed to allocate frags_free"));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_dealloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device)
{
OBJ_DESTRUCT(&device->frags_free[qp_index]);
return OMPI_SUCCESS;
}
static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_get_dummy_frag(
mca_bcol_iboffload_endpoint_t *ep, int qp_index)
{
return &ep->iboffload_module->device->dummy_frags[qp_index];
}
static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_endpoint_get_preposted_frag(
mca_bcol_iboffload_endpoint_t *ep, int qp_index)
{
return (mca_bcol_iboffload_frag_t *)
opal_list_remove_first(&ep->qps[qp_index].preposted_frags);
}
static void mca_bcol_iboffload_regular_qp_attr(int qp_index,
mca_bcol_iboffload_endpoint_t *ep,
ompi_common_ofacm_base_qp_config_t *qp_config)
{
struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];
mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);
init_attr->send_cq = ep->iboffload_module->device->ib_cq;
init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SMALL_MESSAGES];
}
static void mca_bcol_iboffload_large_buff_qp_attr(int qp_index,
mca_bcol_iboffload_endpoint_t *ep,
ompi_common_ofacm_base_qp_config_t *qp_config)
{
struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];
mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);
init_attr->send_cq = ep->iboffload_module->device->ib_cq;
init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_LARGE_MESSAGES];
}
static void mca_bcol_iboffload_sync_qp_attr(int qp_index,
mca_bcol_iboffload_endpoint_t *ep,
ompi_common_ofacm_base_qp_config_t *qp_config)
{
struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];
mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);
init_attr->send_cq = ep->iboffload_module->device->ib_cq;
init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SYNC];
}
static int mca_bcol_iboffload_setup_barrier_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr;
qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;
qp_info->alloc_resource = NULL;
qp_info->dealloc_resource = NULL;
qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_setup_regular_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr;
qp_info->prepost_recv = mca_bcol_iboffload_frag_reg_qp_prepost;
qp_info->alloc_resource = mca_bcol_iboffload_alloc_reg_qp_resource;
qp_info->dealloc_resource = mca_bcol_iboffload_dealloc_reg_qp_resource;
qp_info->get_preposted_recv = mca_bcol_iboffload_endpoint_get_preposted_frag;
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_setup_large_buff_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr;
qp_info->prepost_recv = NULL; /* We use "manual" ML frag preposting for this QP */
qp_info->alloc_resource = NULL;
qp_info->dealloc_resource = NULL;
qp_info->get_preposted_recv = NULL;
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_setup_credit_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr;
qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;
qp_info->alloc_resource = NULL;
qp_info->dealloc_resource = NULL;
qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;
return OMPI_SUCCESS;
}
static int mca_bcol_iboffload_setup_sync_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
qp_info->config_qp = mca_bcol_iboffload_sync_qp_attr;
qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;
qp_info->alloc_resource = NULL;
qp_info->dealloc_resource = NULL;
qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;
return OMPI_SUCCESS;
}
mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST] = {
mca_bcol_iboffload_setup_barrier_qp, /* MCA_BCOL_IBOFFLOAD_QP_BARRIER */
mca_bcol_iboffload_setup_regular_qp, /* MCA_BCOL_IBOFFLOAD_QP_REGULAR */
mca_bcol_iboffload_setup_sync_qp, /* MCA_BCOL_IBOFFLOAD_QP_SYNC */
mca_bcol_iboffload_setup_credit_qp, /* MCA_BCOL_IBOFFLOAD_QP_CREDIT */
mca_bcol_iboffload_setup_large_buff_qp, /* MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF */
/* MCA_BCOL_IBOFFLOAD_QP_LAST */
};

View file

@ -1,127 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* In order to add a new QP you need to do the following steps:
*
* 1) Add a new index, e.g. MCA_BCOL_IBOFFLOAD_QP_NEW_QP,
* to the enum listing all the QPs.
*
* 2) In the setup_qps_fn array, initialize the MCA_BCOL_IBOFFLOAD_QP_NEW_QP
* index with your init function for this QP.
*
* 3) In the init function you added, initialize the following function pointers:
* a) config_qp - fill in the ibv_qp_init_attr structure
* that will be used to create this QP.
*
* b) prepost_recv - specify this pointer if you want receive
* preposting to be executed automatically for your new QP.
*
* c) alloc_resource - called during device activation; if you need any
* device resource (a list of frags, for example) for your new QP,
* this is the right place to allocate it.
*
* d) dealloc_resource - if any resource was allocated dynamically
* by the alloc_resource function, destruct it here.
*
* e) get_preposted_recv - returns a preposted receive for a 'wait task'.
*
* If you don't need any of these functions, initialize the corresponding
* pointer to NULL.
*/
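/*
* A minimal sketch of step 3 above, assuming a hypothetical new index
* MCA_BCOL_IBOFFLOAD_QP_NEW_QP that simply reuses the existing regular-QP
* helpers (the function name below is illustrative only):
*
*   static int mca_bcol_iboffload_setup_new_qp(mca_bcol_iboffload_qp_info_t *qp_info)
*   {
*       qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr;
*       qp_info->prepost_recv = mca_bcol_iboffload_frag_reg_qp_prepost;
*       qp_info->alloc_resource = mca_bcol_iboffload_alloc_reg_qp_resource;
*       qp_info->dealloc_resource = mca_bcol_iboffload_dealloc_reg_qp_resource;
*       qp_info->get_preposted_recv = mca_bcol_iboffload_endpoint_get_preposted_frag;
*       return OMPI_SUCCESS;
*   }
*
* The MCA_BCOL_IBOFFLOAD_QP_NEW_QP entry of setup_qps_fn (see
* bcol_iboffload_qp_info.c) would then point at this function.
*/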
#ifndef MCA_BCOL_IBOFFLOAD_QP_INFO_H
#define MCA_BCOL_IBOFFLOAD_QP_INFO_H
#include "ompi_config.h"
BEGIN_C_DECLS
/* forward declarations */
struct mca_bcol_iboffload_device_t;
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_qp_info_t;
struct mca_bcol_iboffload_endpoint_t;
/* The list of the all required QPs */
enum {
MCA_BCOL_IBOFFLOAD_QP_BARRIER,
MCA_BCOL_IBOFFLOAD_QP_REGULAR,
MCA_BCOL_IBOFFLOAD_QP_SYNC,
MCA_BCOL_IBOFFLOAD_QP_CREDIT,
MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
MCA_BCOL_IBOFFLOAD_QP_LAST
};
typedef enum {
MCA_BCOL_IBOFFLOAD_PP_QP,
MCA_BCOL_IBOFFLOAD_SRQ_QP,
MCA_BCOL_IBOFFLOAD_XRC_QP
} mca_bcol_iboffload_qp_type_t;
struct mca_bcol_iboffload_pp_qp_info_t {
int32_t rd_win;
int32_t rd_rsv;
}; typedef struct mca_bcol_iboffload_pp_qp_info_t mca_bcol_iboffload_pp_qp_info_t;
struct mca_bcol_iboffload_srq_qp_info_t {
int32_t sd_max;
}; typedef struct mca_bcol_iboffload_srq_qp_info_t mca_bcol_iboffload_srq_qp_info_t;
typedef int (*mca_bcol_iboffload_setup_qps_fn_t) (struct mca_bcol_iboffload_qp_info_t*);
typedef int (*mca_bcol_iboffload_prepost_qps_fn_t)
(struct mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, int num_to_prepost);
typedef void (*mca_bcol_iboffload_config_qps_fn_t)
(int qp_index,
struct mca_bcol_iboffload_endpoint_t *ep,
ompi_common_ofacm_base_qp_config_t *qp_config);
typedef int (*mca_bcol_iboffload_alloc_qps_resource_fn_t)
(int qp_index,
struct mca_bcol_iboffload_device_t *device);
typedef int (*mca_bcol_iboffload_dealloc_qps_resource_fn_t)
(int qp_index,
struct mca_bcol_iboffload_device_t *device);
typedef struct mca_bcol_iboffload_frag_t* (*mca_bcol_iboffload_get_preposted_recv_fn_t)
(struct mca_bcol_iboffload_endpoint_t *ep, int qp_index);
struct mca_bcol_iboffload_qp_info_t {
size_t size;
int32_t rd_num;
int32_t rd_low;
int32_t rd_pp_win; /* prepost window = rd_num - rd_low */
int qp_index;
mca_bcol_iboffload_qp_type_t type;
mca_bcol_iboffload_config_qps_fn_t config_qp;
mca_bcol_iboffload_prepost_qps_fn_t prepost_recv;
mca_bcol_iboffload_alloc_qps_resource_fn_t alloc_resource;
mca_bcol_iboffload_dealloc_qps_resource_fn_t dealloc_resource;
mca_bcol_iboffload_get_preposted_recv_fn_t get_preposted_recv;
union {
mca_bcol_iboffload_pp_qp_info_t pp_qp;
mca_bcol_iboffload_srq_qp_info_t srq_qp;
} u;
}; typedef struct mca_bcol_iboffload_qp_info_t mca_bcol_iboffload_qp_info_t;
extern mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST];
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_QP_INFO_H */

View file

@ -1,81 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
static void task_constructor(mca_bcol_iboffload_task_t *task)
{
task->frag = NULL;
task->collfrag = NULL;
task->endpoint = NULL;
task->next_task = NULL;
task->sg_entries = NULL;
task->sg_entries_num = 0;
task->task_list = NULL;
memset(&task->wr, 0, sizeof(task->wr));
memset(&task->element, 0, sizeof(struct mqe_task));
memset(&task->task_mqe_qp_entry, 0, sizeof(struct mqe_qp_entry));
}
static void task_destructor(mca_bcol_iboffload_task_t *task)
{
if (NULL != task->sg_entries) {
free(task->sg_entries);
}
}
OBJ_CLASS_INSTANCE(
mca_bcol_iboffload_task_t,
ompi_free_list_item_t,
task_constructor,
task_destructor);
void
mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx)
{
mca_bcol_iboffload_task_t *calc_task =
(mca_bcol_iboffload_task_t *) item;
calc_task->task_list = (ompi_free_list_t *) ctx;
calc_task->sg_entries_num = 2;
calc_task->sg_entries = (struct ibv_sge *) malloc (2 * sizeof(struct ibv_sge));
}
void
mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx)
{
mca_bcol_iboffload_task_t *iovec_task =
(mca_bcol_iboffload_task_t *) item;
mca_bcol_iboffload_module_t *iboffload_module =
(mca_bcol_iboffload_module_t *) ctx;
int nitems, group_size = iboffload_module->group_size;
nitems = group_size / 2 + group_size % 2;
if (nitems > iboffload_module->device->ib_dev_attr.max_sge) {
nitems = iboffload_module->device->ib_dev_attr.max_sge;
}
iovec_task->sg_entries_num = nitems;
iovec_task->task_list = &iboffload_module->iovec_tasks_free;
iovec_task->sg_entries = (struct ibv_sge *)
malloc(nitems * sizeof(struct ibv_sge));
}
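The sizing rule in mca_bcol_iboffload_iovec_task_init above (half the group size, rounded up, but never more than the device's max_sge) can be checked in isolation. The standalone sketch below simply restates that arithmetic with a few assumed group sizes; it is not part of the original file.

#include <assert.h>

/* ceil(group_size / 2), capped at the device's max_sge (same rule as above) */
static int iovec_sge_count(int group_size, int max_sge)
{
    int nitems = group_size / 2 + group_size % 2;
    return (nitems > max_sge) ? max_sge : nitems;
}

int main(void)
{
    assert(4  == iovec_sge_count(7,  32));   /* odd group size rounds up      */
    assert(8  == iovec_sge_count(16, 32));
    assert(30 == iovec_sge_count(100, 30));  /* clamped by the device maximum */
    return 0;
}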

View file

@ -1,613 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_TASK_H
#define MCA_BCOL_IBOFFLOAD_TASK_H
#include "ompi_config.h"
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include <infiniband/mqe.h>
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"
#include "bcol_iboffload_collfrag.h"
#define SENDWR(task) ((task)->element.post.send_wr)
BEGIN_C_DECLS
/* the mca_bcol_ibv_mwr_task_t name was replaced with mca_bcol_iboffload_task_t */
struct mca_bcol_iboffload_task_t {
ompi_free_list_item_t super;
/* pointer to the memory descriptor associated with the task */
mca_bcol_iboffload_frag_t *frag;
/* pointer to the bcol descriptor,
* we need it for send tasks only because we complete them in an async manner
*/
mca_bcol_iboffload_collfrag_t *collfrag;
/* task to be posted */
struct mqe_task element;
/* allocate ibv_sge structs array - in a CALC case
* for example it will have two entries.
*/
struct ibv_sge *sg_entries;
/* sg_entries array length */
int sg_entries_num;
/* Each task is a member of some free list,
if the pointer is NULL => we assume the task
is a member of the common task list (tasks_free) */
ompi_free_list_t *task_list;
/* Pointer to the next task */
struct mca_bcol_iboffload_task_t *next_task;
/* Pasha: this is a crude workaround for the driver interface;
* the send_wr and recv_wr should be part of mqe_task and not pointers!
*/
union {
struct ibv_m_send_wr send_wr;
struct ibv_recv_wr recv_wr;
} wr;
/* If we'll decide to post a task to a different qp */
struct mqe_qp_entry task_mqe_qp_entry;
/* Pointer to endpoint for this task */
mca_bcol_iboffload_endpoint_t *endpoint;
};
typedef struct mca_bcol_iboffload_task_t mca_bcol_iboffload_task_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_task_t);
/* calc_tasks_free free list init function */
void
mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx);
/* iovec_tasks_free free list init function */
void
mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx);
static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_frag_tolist(
mca_bcol_iboffload_frag_t *frag,
ompi_free_list_t *list)
{
if (NULL != frag) {
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != frag->type);
if (MCA_BCOL_IBOFFLOAD_DUMMY_OWNER != frag->type &&
0 == frag->ref_counter) {
if (MCA_BCOL_IBOFFLOAD_BCOL_OWNER == frag->type) {
OMPI_FREE_LIST_RETURN_MT((&(list[frag->qp_index])),
(ompi_free_list_item_t*) frag);
} else if (MCA_BCOL_IBOFFLOAD_ML_OWNER == frag->type) {
OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
(ompi_free_list_item_t*) frag);
}
}
}
}
static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_recv_frags_toendpoint(
mca_bcol_iboffload_frag_t *frags,
mca_bcol_iboffload_endpoint_t *ep,
int qp_index)
{
mca_bcol_iboffload_frag_t *recv_frag = frags;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
while (NULL != recv_frag) {
assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != recv_frag->type);
if (MCA_BCOL_IBOFFLOAD_ML_OWNER != recv_frag->type) {
opal_list_prepend(&ep->qps[qp_index].preposted_frags,
(opal_list_item_t *) recv_frag);
} else {
OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
(ompi_free_list_item_t*) recv_frag);
}
recv_frag = recv_frag->next;
}
}
/* Wait task allocation and initialization */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_wait_task(mca_bcol_iboffload_module_t *iboffload,
uint32_t source, int num_waits,
mca_bcol_iboffload_frag_t *frags,
int qp_index, struct ibv_qp *qp)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];
/* blocking allocation for send fragment */
OMPI_FREE_LIST_GET_MT(&cm->tasks_free, item);
if (OPAL_UNLIKELY(NULL == item)) {
mca_bcol_iboffload_return_recv_frags_toendpoint(frags, endpoint, qp_index);
return NULL;
}
task = (mca_bcol_iboffload_task_t *) item;
/* set pointer to corresponding recv fragment */
IBOFFLOAD_SET_FRAGS_ON_TASK(frags, task);
task->next_task = NULL;
task->endpoint = endpoint;
/* set opcode */
task->element.opcode = MQE_WR_CQE_WAIT;
task->element.flags = 0; /* any flag could go here; the driver ignores it anyway */
/* set task id */
task->element.wr_id = (uint64_t) (uintptr_t) task;
/* set CQ */
task->element.wait.cq = endpoint->qp_config.init_attr[qp_index].recv_cq;
/* set the number of completions to wait for */
task->element.wait.count = num_waits;
/* set pointer to QP */
if (NULL == qp) { /* NULL means use MQ's QP */
task->element.wait.mqe_qp = NULL;
} else { /* Post wait to the SQ of this QP */
task->task_mqe_qp_entry.next = NULL;
task->task_mqe_qp_entry.qp = qp;
task->element.wait.mqe_qp = &task->task_mqe_qp_entry;
}
IBOFFLOAD_VERBOSE(10, ("Allocating task %p, cq: %p, num waits: %d, qp_index - %d, "
"destination %d for comm rank: %d.\n",
(void *) task, (void *) task->element.wait.cq,
task->element.wait.count, qp_index, source,
endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_prepare_send_task(
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_endpoint_t *endpoint,
int qp_index, ompi_free_list_t *task_list,
mca_bcol_iboffload_collfrag_t *collfrag)
{
ompi_free_list_item_t *item;
mca_bcol_iboffload_task_t *task;
IBOFFLOAD_VERBOSE(10, ("Destination rank - %d, QP index - %d, "
"for comm rank - %d\n", endpoint->index, qp_index,
endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));
/* get item from free list */
OMPI_FREE_LIST_GET_MT(task_list, item);
if (OPAL_UNLIKELY(NULL == item)) {
return NULL;
}
task = (mca_bcol_iboffload_task_t*) item;
task->endpoint = endpoint;
++(collfrag->n_sends);
task->collfrag = collfrag;
task->next_task = NULL;
task->element.wr_id = (uint64_t) (uintptr_t) task;
task->element.post.qp = endpoint->qps[qp_index].qp->lcl_qp;
task->element.opcode = MQE_WR_SEND;
/* define send work request */
SENDWR(task) = &(task->wr.send_wr);
SENDWR(task)->next = NULL;
SENDWR(task)->wr_id = (uint64_t) (uintptr_t) collfrag;
IBOFFLOAD_VERBOSE(10, ("coll_frag - %p.\n", collfrag));
/* Always send IMM on sends! */
task->element.flags = MQE_WR_FLAG_IMM_EXE;
/* Always signal completion */
SENDWR(task)->send_flags = IBV_SEND_SIGNALED;
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_task(
mca_bcol_iboffload_module_t *iboffload,
uint32_t destination, int qp_index,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_collfrag_t *collfrag,
bool enable_inline)
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];
IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n",
qp_index));
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
&cm->tasks_free,
collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
/* no support for multiple frags */
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
/* We cannot post a 0-byte send, but we can do a zero-byte RDMA write with immediate */
if (0 == frag->sg_entry.length) {
SENDWR(task)->imm_data = 0;
SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
SENDWR(task)->wr.rdma.rkey = endpoint->remote_zero_rdma_addr.rkey;
SENDWR(task)->wr.rdma.remote_addr = endpoint->remote_zero_rdma_addr.addr;
} else {
SENDWR(task)->opcode = IBV_WR_SEND;
}
/* single sge */
SENDWR(task)->num_sge = 1;
SENDWR(task)->sg_list = &(frag->sg_entry);
/* Use inline send when it is possible */
if (enable_inline &&
frag->sg_entry.length < cm->max_inline_data) {
IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
SENDWR(task)->send_flags |= IBV_SEND_INLINE;
}
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_vec_task(
mca_bcol_iboffload_module_t *iboffload,
uint32_t destination, int qp_index,
size_t nitems,
struct iovec *buff_iovec,
uint32_t lkey,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_collfrag_t *collfrag,
bool enable_inline)
{
mca_bcol_iboffload_task_t *task;
int i;
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];
IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n",
qp_index));
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
&iboffload->iovec_tasks_free,
collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
/* no support for multiple frags */
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
/* We cannot post a 0-byte send, but we can do a zero-byte RDMA write with immediate */
SENDWR(task)->opcode = IBV_WR_SEND;
assert (task->sg_entries != NULL);
for (i = 0; (size_t) i < nitems; ++i){
task->sg_entries[i].length = buff_iovec[i].iov_len;
task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
task->sg_entries[i].lkey = lkey;
}
/* multiple sge */
SENDWR(task)->num_sge = nitems;
SENDWR(task)->sg_list = (task->sg_entries);
/* Use inline send when it is possible */
if (enable_inline &&
frag->sg_entry.length < cm->max_inline_data) {
IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
SENDWR(task)->send_flags |= IBV_SEND_INLINE;
}
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_vec_task(
uint32_t destination, size_t offset, size_t nitems,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload,
struct iovec *buff_iovec, uint32_t lkey,
mca_bcol_iboffload_collfrag_t *collfrag)
{
int i;
mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_endpoint_t *endpoint =
iboffload->endpoints[destination];
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
coll_request->qp_index,
&iboffload->iovec_tasks_free,
collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
/* no support for multiple frags */
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
SENDWR(task)->imm_data = 0;
SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
for (i = 0; (size_t) i < nitems; ++i){
task->sg_entries[i].length = buff_iovec[i].iov_len;
task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
task->sg_entries[i].lkey = lkey;
}
/* multiple sge */
SENDWR(task)->num_sge = nitems;
SENDWR(task)->sg_list = (task->sg_entries);
IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_task(
uint32_t destination, size_t offset,
mca_bcol_iboffload_frag_t *frag,
mca_bcol_iboffload_module_t *iboffload,
mca_bcol_iboffload_collfrag_t *collfrag)
{
mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_endpoint_t *endpoint =
iboffload->endpoints[destination];
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
coll_request->qp_index,
&cm->tasks_free, collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
/* no support for multiple frags */
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
SENDWR(task)->imm_data = 0;
SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
/* Pasha: I am really not happy with the way we calculate remote addresses.
Why don't we use rbuf + offset? */
SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
/* single sge */
SENDWR(task)->num_sge = 1;
SENDWR(task)->sg_list = &(frag->sg_entry);
IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
return task;
}
/* Pasha: hacking version of calc operation */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_calc_task(mca_bcol_iboffload_module_t *iboffload,
uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
struct ibv_sge *l_operand, struct ibv_sge *r_operand,
mca_bcol_iboffload_collreq_t *coll_request,
bool enable_inline)
/* Some specifications for this function:
* 1) We assume that the lengths of the two operands (ibv_sge structs) are the same.
* 2) We may use the results (ibv_sge structs) from previous
* calc operations, so the frag pointer may be NULL.
*/
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_endpoint_t *endpoint =
iboffload->endpoints[destination];
mca_bcol_iboffload_collfrag_t *collfrag =
(mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
&cm->calc_tasks_free, collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
if (NULL != frag) {
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
} else {
task->frag = NULL;
}
task->sg_entries[0] = *l_operand;
task->sg_entries[1] = *r_operand;
SENDWR(task)->num_sge = 2;
SENDWR(task)->sg_list = task->sg_entries;
SENDWR(task)->opcode = MCA_BCOL_IBOFFLOAD_SEND_CALC;
#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
SENDWR(task)->wr.calc_send.data_type = coll_request->actual_ib_dtype;
SENDWR(task)->wr.calc_send.calc_op = coll_request->actual_ib_op;
#else
SENDWR(task)->wr.calc.data_type = coll_request->actual_ib_dtype;
SENDWR(task)->wr.calc.calc_op = coll_request->actual_ib_op;
#endif
return task;
}
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_calc_task(mca_bcol_iboffload_module_t *iboffload,
uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
struct ibv_sge *l_operand, struct ibv_sge *r_operand,
mca_bcol_iboffload_collreq_t *coll_request,
size_t offset)
/* Some specifications for this function:
* 1) We assume that the lengths of the two operands (ibv_sge structs) are the same.
* 2) We may use the results (ibv_sge structs) from previous
* calc operations, so the frag pointer may be NULL.
*/
{
mca_bcol_iboffload_task_t *task;
mca_bcol_iboffload_endpoint_t *endpoint =
iboffload->endpoints[destination];
mca_bcol_iboffload_collfrag_t *collfrag =
(mca_bcol_iboffload_collfrag_t *)
opal_list_get_last(&coll_request->work_requests);
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
&cm->calc_tasks_free, collfrag);
if (OPAL_UNLIKELY(NULL == task)) {
mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
return NULL;
}
if (NULL != frag) {
IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
} else {
task->frag = NULL;
}
task->sg_entries[0] = *l_operand;
/* Hack - we don't really use it.
task->sg_entries[1] = *r_operand;
*/
/* We use only single entry
SENDWR(task)->num_sge = 2;
*/
SENDWR(task)->num_sge = 1;
SENDWR(task)->sg_list = task->sg_entries;
#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
SENDWR(task)->opcode = IBV_M_WR_CALC_RDMA_WRITE_WITH_IMM;
SENDWR(task)->wr.calc_rdma.data_type = coll_request->actual_ib_dtype;
SENDWR(task)->wr.calc_rdma.calc_op = coll_request->actual_ib_op;
SENDWR(task)->wr.calc_rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
SENDWR(task)->wr.calc_rdma.remote_addr = (uint64_t) (uintptr_t)
((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
#else
IBOFFLOAD_ERROR(("Fatal error: RDMA CALC was called, but the driver does not support this operation"));
return NULL;
#endif
return task;
}
static inline __opal_attribute_always_inline__
int release_frags_on_task(mca_bcol_iboffload_task_t *task,
ompi_free_list_t *list)
{
int rc, qp_index;
mca_bcol_iboffload_frag_t *temp_frag = task->frag;
mca_bcol_iboffload_endpoint_t *endpoint = task->endpoint;
mca_bcol_iboffload_component_t *cm =
&mca_bcol_iboffload_component;
IBOFFLOAD_VERBOSE(10, ("\nCalling release_frags_on_task"));
while (NULL != temp_frag) {
qp_index = temp_frag->qp_index;
--(temp_frag->ref_counter);
/* Return credits */
if (MQE_WR_CQE_WAIT == task->element.opcode) {
++(endpoint->qps[qp_index].rd_wqe);
IBOFFLOAD_VERBOSE(10, ("Return rd_wqe %d pp_win %d",
endpoint->qps[qp_index].rd_wqe,
cm->qp_infos[qp_index].rd_pp_win));
/* Call for recv prepost */
if (endpoint->qps[qp_index].rd_wqe >=
cm->qp_infos[qp_index].rd_pp_win) {
IBOFFLOAD_VERBOSE(10, ("Prepost to endpoint->index - %d, qp_index - %d", endpoint->index, qp_index));
rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
endpoint->qps[qp_index].rd_wqe);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
IBOFFLOAD_ERROR(("QP %d: failed to prepost.\n", qp_index));
return OMPI_ERROR;
}
/* What happens if we cannot prepost? */
}
} else if (MQE_WR_SEND == task->element.opcode) {
++(endpoint->qps[qp_index].sd_wqe);
assert(endpoint->qps[qp_index].sd_wqe <= cm->qp_infos[qp_index].rd_num);
IBOFFLOAD_VERBOSE(10, ("Return sd_wqe %d, qp_index - %d, endpoint - %p",
endpoint->qps[qp_index].sd_wqe, qp_index, endpoint));
} else {
/* We should not arrive to this case */
IBOFFLOAD_ERROR(("Unsupporeted operation"));
return OMPI_ERROR;
}
mca_bcol_iboffload_return_frag_tolist(temp_frag, list);
temp_frag = temp_frag->next;
}
return OMPI_SUCCESS;
}
END_C_DECLS
#endif

View file

@ -1,40 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_bcol_iboffload_CONFIG([should_build])
# ------------------------------------------
# AC_DEFUN([MCA_ompi_bcol_iboffload_POST_CONFIG], [
# ])
# MCA_ompi_bcol_iboffload_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
AC_DEFUN([MCA_ompi_bcol_iboffload_CONFIG],[
AC_CONFIG_FILES([ompi/mca/bcol/iboffload/Makefile])
bcol_ofa_happy="no"
bcol_mlnx_ofed_happy="no"
OPAL_CHECK_OPENFABRICS([bcol_iboffload], [bcol_ofa_happy="yes"])
OPAL_CHECK_MLNX_OPENFABRICS([bcol_iboffload], [bcol_mlnx_ofed_happy="yes"])
AS_IF([test "$bcol_ofa_happy" = "yes" && test "$bcol_mlnx_ofed_happy" = "yes"],
[$1],
[$2])
# substitute in the things needed to build iboffload
AC_SUBST([bcol_iboffload_CFLAGS])
AC_SUBST([bcol_iboffload_CPPFLAGS])
AC_SUBST([bcol_iboffload_LDFLAGS])
AC_SUBST([bcol_iboffload_LIBS])
])dnl

View file

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained

View file

@ -1,57 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2013 Mellanox Technologies. All rights reserved.
# Copyright (c) 2013 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
bcol_ptpcoll.h \
bcol_ptpcoll_utils.h \
bcol_ptpcoll_utils.c \
bcol_ptpcoll_mca.h \
bcol_ptpcoll_mca.c \
bcol_ptpcoll_barrier.c \
bcol_ptpcoll_bcast.c \
bcol_ptpcoll_bcast.h \
bcol_ptpcoll_component.c \
bcol_ptpcoll_fanin.c \
bcol_ptpcoll_fanout.c \
bcol_ptpcoll_module.c \
bcol_ptpcoll_allreduce.h \
bcol_ptpcoll_allreduce.c \
bcol_ptpcoll_reduce.h \
bcol_ptpcoll_reduce.c \
bcol_ptpcoll_allgather.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_ptpcoll_DSO
component_install += mca_bcol_ptpcoll.la
else
component_noinst += libmca_bcol_ptpcoll.la
endif
# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_ptpcoll_la_SOURCES = $(sources)
mca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version
mca_bcol_ptpcoll_la_LIBADD =
noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_ptpcoll_la_SOURCES =$(sources)
libmca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version

View file

@ -1,474 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_EXPORT_H
#define MCA_BCOL_PTPCOLL_EXPORT_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/patterns/net/netpatterns.h"
BEGIN_C_DECLS
#ifdef HAVE_SCHED_YIELD
# include <sched.h>
# define SPIN sched_yield()
#else /* no switch available */
# define SPIN
#endif
/**
* Structure to hold the basic ptpcoll component. First it holds the
* base bcol component, and then holds a bunch of
* ptpcoll-component-specific data (e.g., current MCA parameter
* values).
*/
struct mca_bcol_ptpcoll_component_t {
/** Base coll component */
mca_bcol_base_component_2_0_0_t super;
/** Verbosity level, used only in debug enabled builds */
int verbose;
/** The radix of the K-nomial tree, initialized by an MCA parameter */
int k_nomial_radix;
/** The radix of the Narray tree, initialized by an MCA parameter */
int narray_radix;
/** The radix used for Narray scatter and k-nomial gather for
large-message bcast **/
int narray_knomial_radix;
/** Number of times to poll for specific tag/src */
int num_to_probe;
/*
* bcast small messages algorithm
* 1 - Knomial bcast
* 2 - Narray bcast
*/
int bcast_small_messages_known_root_alg;
/*
* bcast large messages algorithm
* 1 - binomial scatter-gather
* 2 - Narray scatter, k-nomial gather
*/
int bcast_large_messages_known_root_alg;
/*
* barrier algorithm
* 1 - recursive doubling
* 2 - recursive K-ing
*/
int barrier_alg;
int use_brucks_smsg_alltoall_rdma;
};
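/*
 * Usage sketch (not from the original source): the algorithm selectors above
 * are exposed as MCA parameters, presumably registered in bcol_ptpcoll_mca.c.
 * Assuming they follow the usual <framework>_<component>_<name> naming
 * convention, the recursive K-ing barrier and the Narray small-message
 * broadcast could be selected at run time with something like:
 *
 *   mpirun --mca bcol_ptpcoll_barrier_alg 2 \
 *          --mca bcol_ptpcoll_bcast_small_messages_known_root_alg 2 ...
 *
 * The exact parameter names are an assumption, not confirmed by this file.
 */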
struct mca_bcol_ptpcoll_collreq_t {
opal_free_list_item_t super;
int tag;
int num_reqs;
int exchange;
int need_toserv_extra;
int extra_partner_rank;
ompi_request_t **requests;
};
typedef struct mca_bcol_ptpcoll_collreq_t mca_bcol_ptpcoll_collreq_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_collreq_t);
/**
* Convenience typedef
*/
typedef struct mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component_t;
/* Bcast small messages,
known root algorithm */
enum {
PTPCOLL_KNOMIAL = 1,
PTPCOLL_NARRAY
};
/* Bcast large messages,
known root algorithm */
enum {
PTPCOLL_BINOMIAL_SG = 1, /* Binomial scatter-gather */
PTPCOLL_NARRAY_KNOMIAL_SG /* Narray-Knomial scatter-gather */
};
/*
* Implemented function index list
*/
/* barrier */
enum{
FANIN_FAN_OUT_BARRIER_FN,
RECURSIVE_DOUBLING_BARRIER_FN,
N_BARRIER_FNS
};
/* reduce */
enum{
FANIN_REDUCE_FN,
REDUCE_SCATTER_GATHER_FN,
N_REDUCE_FNS
};
enum{
SHORT_DATA_FN_REDUCE,
LONG_DATA_FN_REDUCE,
N_REDUCE_FNS_USED
};
/* all-reduce */
enum{
FANIN_FANOUT_ALLREDUCE_FN,
REDUCE_SCATTER_ALLGATHER_FN,
N_ALLREDUCE_FNS
};
enum{
SHORT_DATA_FN_ALLREDUCE,
LONG_DATA_FN_ALLREDUCE,
N_ALLREDUCE_FNS_USED
};
/*
* N-order tree node description
*/
struct tree_node_t {
/* my rank within the group */
int my_rank;
/* my node type - root, leaf, or interior */
int my_node_type;
/* number of nodes in the tree */
int tree_size;
/* number of parents (0/1) */
int n_parents;
/* number of children */
int n_children;
/* parent rank within the group */
int parent_rank;
/* children ranks within the group */
int *children_ranks;
};
typedef struct tree_node_t tree_node_t;
struct pair_exchange_node_t {
/* number of nodes this node will exchange data with */
int n_exchanges;
/* ranks of nodes involved in data exchange */
int *rank_exchanges;
/* number of extra sources of data - outside largest power of 2 in
* this group */
int n_extra_sources;
/* rank of the extra source */
int rank_extra_source;
/* number of tags needed per stripe */
int n_tags;
/* log 2 of largest full power of 2 for this node set */
int log_2;
/* largest power of 2 that fits in this group */
int n_largest_pow_2;
/* node type */
int node_type;
};
typedef struct pair_exchange_node_t pair_exchange_node_t;
/*
* Barrier request objects
*/
/* enum for phase at which the nb barrier is in */
enum{
NB_BARRIER_INACTIVE,
NB_BARRIER_FAN_IN,
NB_BARRIER_FAN_OUT,
/* done and not started are the same for all practical
* purposes, as the init function always sets this flag
*/
NB_BARRIER_DONE
};
typedef enum {
PTPCOLL_NOT_STARTED = 1,
PTPCOLL_WAITING_FOR_DATA = 1 << 1,
PTPCOLL_SCATTER_STARTED = 1 << 2,
PTPCOLL_GATHER_STARTED = 1 << 3,
PTPCOLL_EXTRA_SEND_STARTED = 1 << 4,
PTPCOLL_ROOT_SEND_STARTED = 1 << 5
} ptpcoll_op_status;
struct mca_bcol_ptpcoll_ml_buffer_desc_t {
void *data_addr; /* buffer address */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
int active_requests; /* keep number of active requests */
ompi_request_t **requests; /* caching pointers to requests */
int data_src; /* used for bcast to cache internal data */
int radix_mask; /* used for bcast to cache internal data */
int radix_mask_pow; /* used for bcast to cache internal data */
int iteration; /* buffer iteration in knomial, binomial, etc. algorithms */
int tag; /* tag number that is attached to this operation */
int status; /* operation status */
/* Fixme: Probably we can get rid of these fields by redesigning
* the reduce implementation
*/
int reduction_status; /* used for reduction to cache internal
reduction status */
bool reduce_init_called;
};
typedef struct mca_bcol_ptpcoll_ml_buffer_desc_t mca_bcol_ptpcoll_ml_buffer_desc_t;
/*
* Information that we need to keep in order to access and
* track local ML memory that is used as source and destination
* for collective operations
*/
struct mca_bcol_ptpcoll_local_mlmem_desc_t {
/* Bank index to release */
uint32_t bank_index_for_release;
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* pointer to buffer descriptors initialized */
mca_bcol_ptpcoll_ml_buffer_desc_t *ml_buf_desc;
};
typedef struct mca_bcol_ptpcoll_local_mlmem_desc_t mca_bcol_ptpcoll_local_mlmem_desc_t;
typedef enum {
PTPCOLL_PROXY = 1,
PTPCOLL_IN_GROUP = 1 << 1,
PTPCOLL_EXTRA = 1 << 2,
PTPCOLL_KN_PROXY = 1 << 3,
PTPCOLL_KN_IN_GROUP = 1 << 4,
PTPCOLL_KN_EXTRA = 1 << 5
} node_type_pow2;
struct mca_bcol_ptpcoll_module_t {
/* base structure */
mca_bcol_base_module_t super;
/* size */
int group_size;
/* size of each memory segment */
size_t segment_size;
/* k_nomial radix */
int k_nomial_radix;
/* caching power of K, for K-nomial operations */
int pow_k;
/* caching power of K number that is smaller or equal to size of group */
int pow_knum;
/* caching power of 2, it is special case for some algorithms */
int pow_2;
/* caching power of 2 number that is closest to size of group */
int pow_2num;
/* type of this node in group of power 2 */
int pow_2type;
/* type of this node in group of K-nomial tree */
int pow_ktype;
/* type of this node in group of narray tree */
int narray_type;
/* size of full narray tree */
int full_narray_tree_size;
/* num leafs on last level */
int full_narray_tree_num_leafs;
/* Nary tree info */
netpatterns_tree_node_t *narray_node;
/* if the rank is in the group, it keeps the extra peer;
if the rank is extra, it keeps the proxy peer.
*/
int proxy_extra_index; /* pow2 algorithm */
int *kn_proxy_extra_index; /* K-nomial algorithm */
int kn_proxy_extra_num; /* number of extra peers, maximum k - 1 */
/* collective tag */
long long collective_tag;
/* tag mask - the pml has a limit on tag size, so need
* to wrap around
*/
uint64_t tag_mask;
/* Caching information about local ml memory.
* Since ptpcoll does not support RDMA operations over pml,
* we don't need to keep any information about remote buffers
*/
mca_bcol_ptpcoll_local_mlmem_desc_t ml_mem;
/* Narray-Knomial scatter-gather */
/* list of extra indexes */
int *narray_knomial_proxy_extra_index;
/* number of extra peers , maximum k - 1*/
int narray_knomial_proxy_num;
/* Narray-Knomial node information array */
netpatterns_narray_knomial_tree_node_t *narray_knomial_node;
/* Knomial exchange tree */
netpatterns_k_exchange_node_t knomial_exchange_tree;
/* knomial allgather tree --- do not remove, we need both:
the different algorithms define recursive k-ing differently
*/
netpatterns_k_exchange_node_t knomial_allgather_tree;
/* Knomial allgather offsets */
int **allgather_offsets;
/* Free lists of outstanding collective operations */
opal_free_list_t collreqs_free;
int log_group_size;
struct iovec *alltoall_iovec;
};
typedef struct mca_bcol_ptpcoll_module_t mca_bcol_ptpcoll_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_module_t);
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC extern mca_bcol_ptpcoll_component_t
mca_bcol_ptpcoll_component;
/*
* coll module functions
*/
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_bcol_ptpcoll_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
/* query to see if the module is available for use on the given
* communicator, and if so, what its priority is.
*/
mca_bcol_base_module_t **
mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);
/* interface function to setup recursive k-ing tree */
int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super);
/* barrier routines */
int bcol_ptpcoll_barrier_recurs_dbl(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_barrier_recurs_knomial(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super);
int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super);
void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_deregister_memory( void * in_ptr,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_free_memory(void *ptr,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_fanin( bcol_function_args_t *input_args,
struct mca_bcol_base_module_t *module);
int bcol_ptpcoll_fanout( bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather routine */
int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather progress */
int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather register */
int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super);
static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_for_match(ompi_request_t **request , int *rc)
{
int matched = 0;
int i;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
*rc = OMPI_SUCCESS;
for (i = 0; i < cm->num_to_probe &&
0 == matched && OMPI_SUCCESS == *rc ; i++) {
*rc = ompi_request_test(request, &matched, MPI_STATUS_IGNORE);
}
return matched;
}
static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_all_for_match(int *n_requests, ompi_request_t **requests , int *rc)
{
int matched = 0;
int i;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
*rc = OMPI_SUCCESS;
assert(*n_requests >= 0);
if (0 == *n_requests) {
return 1;
}
for (i = 0; i < cm->num_to_probe &&
0 == matched && OMPI_SUCCESS == *rc; i++) {
*rc = ompi_request_test_all
(*n_requests, requests, &matched, MPI_STATUS_IGNORE);
}
if (matched) {
*n_requests = 0;
}
return matched;
}
/* Some negative tags are already used by OMPI, so make sure we take a safe offset */
#define PTPCOLL_TAG_OFFSET 100
#define PTPCOLL_TAG_FACTOR 2
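/*
 * Worked example (illustrative only): with the offset and factor above, a
 * collective whose sequence number is 7 computes 100 + 7 * 2 = 114, masks
 * it with the module's tag_mask, and then negates it, so the operation runs
 * on tag -114, outside the non-negative tag range used by application-level
 * point-to-point traffic.
 */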
static inline int lognum(int n){
int count = 1, lognum = 0;
while (count < n) {
count = count << 1;
lognum++;
}
return lognum;
}
END_C_DECLS
#endif /* MCA_BCOL_PTPCOLL_EXPORT_H */
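The lognum() helper above returns the smallest exponent e with 2^e >= n (that is, ceil(log2 n) for n >= 1). The standalone check below is a sketch with assumed sample values, not part of the original file.

#include <assert.h>

/* same definition as in the header above */
static int lognum(int n)
{
    int count = 1, lognum = 0;
    while (count < n) {
        count = count << 1;
        lognum++;
    }
    return lognum;
}

int main(void)
{
    assert(0 == lognum(1));
    assert(3 == lognum(8));   /* exact power of two   */
    assert(4 == lognum(9));   /* rounds up to 2^4 = 16 */
    return 0;
}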

View file

@ -1,605 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_allreduce.h"
/*
* Recursive K-ing allgather
*/
/*
*
* Recursive k-ing algorithm
* Example k=3 n=9
*
*
* Number of exchange steps = log_k(n)
* Number of steps in each exchange step = k (radix)
*
*/
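/*
 * Worked numbers for the example above (a sketch, not from the original
 * source): with k = 3 and n = 9 there are log_3(9) = 2 exchange steps, and
 * within each step the loops below walk over the tree_order - 1 = 2 exchange
 * partners recorded in rank_exchanges[i][j], posting one send and one
 * receive per partner.
 */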
int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int group_size = ptpcoll_module->group_size;
int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */
int tag;
int i, j;
int knt;
int comm_src, comm_dst, src, dst;
int recv_offset, recv_len;
int send_offset, send_len;
uint32_t buffer_index = input_args->buffer_index;
int pow_k, tree_order;
int rc = OMPI_SUCCESS;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int completed = 0; /* initialized */
void *data_buffer = (void*)(
(unsigned char *) input_args->sbuf +
(size_t) input_args->sbuf_offset);
int pack_len = input_args->count * input_args->dtype->super.size;
#if 0
fprintf(stderr,"entering p2p allgather pack_len %d. exchange node: %p\n",pack_len, exchange_node);
#endif
/* initialize the iteration counter */
int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
*iteration = 0;
/* reset active request counter */
*active_requests = 0;
/* keep tag within the limit supported by the pml */
tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
/* k-nomial parameters */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/* let's begin the collective, starting with extra ranks and their
* respective proxies
*/
if( EXTRA_NODE == exchange_node->node_type ) {
/* then I will send to my proxy rank*/
dst = exchange_node->rank_extra_sources_array[0];
/* find rank in the communicator */
comm_dst = group_list[dst];
/* now I need to calculate my own offset */
knt = 0;
for (i = 0 ; i < my_group_index; i++){
knt += list_connected[i];
}
/* send the data to my proxy */
rc = MCA_PML_CALL(isend((void *) ( (unsigned char *) data_buffer +
knt*pack_len),
pack_len * list_connected[my_group_index],
MPI_BYTE,
comm_dst, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10,("Failed to isend data"));
return OMPI_ERROR;
}
++(*active_requests);
/* now I go ahead and post the receive from my proxy */
comm_src = comm_dst;
knt = 0;
for( i =0; i < group_size; i++){
knt += list_connected[i];
}
rc = MCA_PML_CALL(irecv(data_buffer,
knt * pack_len,
MPI_BYTE,
comm_src,
tag , comm, &(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to post ireceive "));
return OMPI_ERROR;
}
++(*active_requests);
/* poll for completion */
/* this polls internally */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(completed){
/* go to buffer release */
goto FINISHED;
}else{
/* save state and hop out
* nothing to save here
*/
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
}else if ( 0 < exchange_node->n_extra_sources ) {
/* I am a proxy for someone */
src = exchange_node->rank_extra_sources_array[0];
/* find the rank in the communicator */
comm_src = group_list[src];
knt = 0;
for(i = 0; i < src; i++){
knt += list_connected[i];
}
/* post the receive */
rc = MCA_PML_CALL(irecv((void *) ( (unsigned char *) data_buffer
+ knt*pack_len),
pack_len * list_connected[src],
MPI_BYTE,
comm_src,
tag , comm, &(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to post ireceive "));
return OMPI_ERROR;
}
++(*active_requests);
/* poll for completion */
/* this routine polls internally */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to -1 indicating we need to
* finish this part first
*/
*iteration = -1;
return ((OMPI_SUCCESS != rc )? OMPI_ERROR : BCOL_FN_STARTED);
}
}
/* we start the recursive k - ing phase */
/* fprintf(stderr,"tree order %d pow_k %d \n",tree_order,pow_k);*/
for( i = 0; i < pow_k; i++) {
for(j = 0; j < (tree_order - 1); j++) {
/* send phase */
dst = exchange_node->rank_exchanges[i][j];
if( dst < 0 ){
continue;
}
comm_dst = group_list[dst];
send_offset = exchange_node->payload_info[i][j].s_offset * pack_len;
send_len = exchange_node->payload_info[i][j].s_len * pack_len;
/* debug print */
/* fprintf(stderr,"sending %d bytes to rank %d at offset %d\n",send_len, */
/* comm_dst,send_offset); */
rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer +
send_offset),
send_len,
MPI_BYTE,
comm_dst, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10,("Failed to isend data"));
return OMPI_ERROR;
}
++(*active_requests);
/* sends are posted */
}
/* Now post the recv's */
for( j = 0; j < (tree_order - 1); j++ ) {
/* recv phase */
src = exchange_node->rank_exchanges[i][j];
if( src < 0 ) {
continue;
}
comm_src = group_list[src];
recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
/* debug print */
/* fprintf(stderr,"recving %d bytes to rank %d at offset %d\n",recv_len, */
/* comm_src,recv_offset); */
/* post the receive */
rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer +
recv_offset),
recv_len,
MPI_BYTE,
comm_src,
tag, comm, &(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to post ireceive "));
return OMPI_ERROR;
}
++(*active_requests);
}
/* finished all send/recv's now poll for completion before
* continuing to next iteration
*/
completed = 0;
/* polling internally on 2*(k - 1) requests */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* only the iteration needs to be tracked
*/
*iteration = i; /* need to pick up here */
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
}
/* finish off the last piece, send the data back to the extra */
if( 0 < exchange_node->n_extra_sources ) {
dst = exchange_node->rank_extra_sources_array[0];
comm_dst = group_list[dst];
knt = 0;
for( i = 0; i < group_size; i++){
knt += list_connected[i];
}
/* debug print */
/*
fprintf(stderr,"sending %d bytes to extra %d \n",pack_len*knt,comm_dst);
*/
rc = MCA_PML_CALL(isend(data_buffer,
pack_len * knt,
MPI_BYTE,
comm_dst, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10,("Failed to isend data"));
return OMPI_ERROR;
}
++(*active_requests);
/* probe for send completion */
completed = 0;
/* polling internally */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to pow_k +1 indicating we need to
* finish progressing the last part
*/
*iteration = pow_k + 1;
return (OMPI_SUCCESS != rc ? OMPI_ERROR : BCOL_FN_STARTED);
}
}
FINISHED:
/* recycle buffer if need be */
return BCOL_FN_COMPLETE;
}
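/*
 * Resume-state sketch (summarizing the sentinels used above and picked up by
 * the progress function below): iteration == -1 means a proxy is still
 * waiting for its extra rank's data, 0 <= iteration < pow_k names the next
 * recursive k-ing step to re-enter, and iteration == pow_k + 1 means only
 * the final send back to the extra rank is still outstanding.
 */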
/* allgather progress function */
int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variables */
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree;
int group_size = ptpcoll_module->group_size;
int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */
int tag;
int i, j;
int knt;
int comm_src, comm_dst, src, dst;
int recv_offset, recv_len;
int send_offset, send_len;
uint32_t buffer_index = input_args->buffer_index;
int pow_k, tree_order;
int rc = OMPI_SUCCESS;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int completed = 0; /* initialized */
void *data_buffer = (void*)(
(unsigned char *) input_args->sbuf +
(size_t) input_args->sbuf_offset);
int pack_len = input_args->count * input_args->dtype->super.size;
/* initialize the counter */
int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
#if 0
fprintf(stderr,"%d: entering p2p allgather progress AR: %d iter: %d\n",my_group_index,*active_requests,
*iteration);
#endif
/* keep tag within the limit supported by the pml */
tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
/* k-nomial tree parameters */
tree_order = exchange_node->tree_order;
pow_k = exchange_node->log_tree_order;
/* let's begin the collective, starting with extra ranks and their
* respective proxies
*/
if( EXTRA_NODE == exchange_node->node_type ) {
/* debug print */
/*fprintf(stderr,"666 \n");*/
/* simply poll for completion */
completed = 0;
/* polling internally */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(completed){
/* go to buffer release */
goto FINISHED;
}else{
/* save state and hop out
* nothing to save here
*/
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
}else if ( 0 < exchange_node->n_extra_sources && (-1 == *iteration)) {
/* I am a proxy for someone */
/* Simply poll for completion */
completed = 0;
/* polling internally */
assert( 1 == *active_requests);
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to -1 indicating we need to
* finish this part first
*/
(*iteration) = -1;
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
/* I may now proceed to the recursive k - ing phase */
*iteration = 0;
}
/* the ordering here between the extra rank and progress active requests
* is critical
*/
/* extra rank */
if( (pow_k + 1) == *iteration ){
/* finish off the last one */
goto PROGRESS_EXTRA;
}
/* active requests must be completed before continuing on to
* recursive k -ing step
* CAREFUL HERE, IS THIS REALLY WHAT YOU WANT??
*/
if( 0 < (*active_requests) ) {
/* then we have something to progress from last step */
/* debug print */
/*
fprintf(stderr,"%d: entering progress AR: %d iter: %d\n",my_group_index,*active_requests,
*iteration);
*/
completed = 0;
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* state hasn't changed
*/
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
++(*iteration);
}
/* we start the recursive k - ing phase */
for( i = *iteration; i < pow_k; i++) {
/* nothing changes here */
for(j = 0; j < (tree_order - 1); j++) {
/* send phase */
dst = exchange_node->rank_exchanges[i][j];
if( dst < 0 ){
continue;
}
comm_dst = group_list[dst];
send_offset = exchange_node->payload_info[i][j].s_offset * pack_len;
send_len = exchange_node->payload_info[i][j].s_len * pack_len;
rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer +
send_offset),
send_len,
MPI_BYTE,
comm_dst, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10,("Failed to isend data"));
return OMPI_ERROR;
}
++(*active_requests);
/* sends are posted */
}
/* Now post the recv's */
for( j = 0; j < (tree_order - 1); j++ ) {
/* recv phase */
src = exchange_node->rank_exchanges[i][j];
if( src < 0 ) {
continue;
}
comm_src = group_list[src];
recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
/* post the receive */
rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer +
recv_offset),
recv_len,
MPI_BYTE,
comm_src,
tag, comm, &(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to post ireceive "));
return OMPI_ERROR;
}
++(*active_requests);
}
/* finished all send/recv's now poll for completion before
* continuing to next iteration
*/
completed = 0;
/* make this non-blocking */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to -1 indicating we need to
* finish this part first
*/
*iteration = i; /* need to pick up here */
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
}
/* finish off the last piece, send the data back to the extra */
if( 0 < exchange_node->n_extra_sources ) {
dst = exchange_node->rank_extra_sources_array[0];
comm_dst = group_list[dst];
knt = 0;
for( i = 0; i < group_size; i++){
knt += list_connected[i];
}
rc = MCA_PML_CALL(isend(data_buffer,
pack_len * knt,
MPI_BYTE,
comm_dst, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10,("Failed to isend data"));
return OMPI_ERROR;
}
++(*active_requests);
/* probe for send completion */
completed = 0;
/* make this non-blocking */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to pow_k +1 indicating we need to
* finish progressing the last part
*/
*iteration = pow_k + 1;
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
}
/* folks need to skip this unless they really are the proxy
* reentering with the intent of progressing the final send
*/
goto FINISHED;
PROGRESS_EXTRA:
/* probe for send completion */
completed = 0;
/* make this non-blocking */
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if(!completed){
/* save state and hop out
* We really do need to block here so set
* the iteration to pow_k +1 indicating we need to
* finish progressing the last part
*/
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
}
FINISHED:
/* recycle buffer if need be */
return BCOL_FN_COMPLETE;
}
/*
* Register allgather functions to the BCOL function table,
* so they can be selected
*/
int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = BCOL_ALLGATHER;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_k_nomial_allgather_init,
bcol_ptpcoll_k_nomial_allgather_progress);
comm_attribs.data_src = DATA_SRC_KNOWN;
inv_attribs.bcol_msg_min = 10000000;
inv_attribs.bcol_msg_max = 10485760; /* range 4 */
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_k_nomial_allgather_init,
bcol_ptpcoll_k_nomial_allgather_progress);
return OMPI_SUCCESS;
}
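The registration above binds the same init/progress pair to two message-size ranges. A third range could be added alongside those calls with the same pattern; the sketch below is illustrative only, and the 20001 to 9999999 byte boundaries are assumed values, not something the original code registered.

    /* hypothetical extra range, same registration pattern as above */
    inv_attribs.bcol_msg_min = 20001;
    inv_attribs.bcol_msg_max = 9999999;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
        bcol_ptpcoll_k_nomial_allgather_init,
        bcol_ptpcoll_k_nomial_allgather_progress);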

Diff for this file is not shown because of its large size. Load diff

View file

@ -1,95 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_ALLREDUCE_H
#define MCA_BCOL_PTPCOLL_ALLREDUCE_H
#include "ompi_config.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
enum {
BLOCK_OFFSET = 0,
LOCAL_REDUCE_SEG_OFFSET,
BLOCK_COUNT,
SEG_SIZE,
NOFFSETS
};
BEGIN_C_DECLS
int bcol_ptpcoll_allreduce_narraying(mca_bcol_ptpcoll_module_t *ptpcoll_module,
const int buffer_index, void *data_buffer,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype, const int
buffer_size, const int relative_group_index);
int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module,
const int buffer_index, void *sbuf,
void *rbuf,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype,
const int relative_group_index,
const int padded_start_byte);
int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
const int buffer_index,
void *sbuf,void *rbuf, int count, struct
ompi_datatype_t *dtype,
const int relative_group_index,
const int padded_start_byte);
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int compute_knomial_allgather_offsets(int group_index, int count, struct
ompi_datatype_t *dtype,int k_radix,int n_exchanges,
int **offsets);
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index,
void *sbuf,
void *rbuf,
struct ompi_op_t *op,
const int count, struct ompi_datatype_t *dtype);
int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index,
void *sbuf,
void *rbuf,
const int count, struct ompi_datatype_t *dtype);
int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super);
#if 0
int knomial_reduce_scatter_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix,
int n_exchanges, int nth_exchange, size_t *recv_offset, size_t
*block_offset, size_t *block_count, size_t *block_size, size_t
*seg_size);
int allgather_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix,
int n_exchanges, int nth_exchange, size_t *send_offset, size_t
*block_offset, size_t *block_count, size_t *block_size, size_t
*seg_size);
#endif
END_C_DECLS
#endif

View file

@ -1,933 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
/*
* Fanin routines - no user data
*/
/********************************************* New Barrier *********************************************/
/*******************************************************************************************************/
/*******************************************************************************************************/
/*************************************** K-nomial ***************************************/
/*****************************************************************************************/
static int bcol_ptpcoll_barrier_recurs_knomial_new(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
uint64_t sequence_number;
mca_bcol_ptpcoll_module_t *ptpcoll_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *my_exchange_node =
&ptpcoll_module->knomial_exchange_tree;
int rc, k, pair_comm_rank, exchange, completed,
tree_order = my_exchange_node->tree_order, tag,
n_extra_sources = my_exchange_node->n_extra_sources,
n_exchange = my_exchange_node->n_exchanges, num_reqs;
ompi_communicator_t *comm =
ptpcoll_module->super.sbgp_partner_module->group_comm;
int *extra_sources_array = NULL,
**rank_exchanges = my_exchange_node->rank_exchanges;
ompi_request_t **requests;
opal_free_list_item_t *item;
mca_bcol_ptpcoll_collreq_t *collreq;
item = opal_free_list_wait (&ptpcoll_module->collreqs_free);
if (OPAL_UNLIKELY(NULL == item)) {
PTPCOLL_ERROR(("Free list waiting failed."));
return OMPI_ERR_OUT_OF_RESOURCE;
}
collreq = (mca_bcol_ptpcoll_collreq_t *) item;
input_args->bcol_opaque_data = (void *) collreq;
requests = collreq->requests;
/* TAG Calculation */
sequence_number = input_args->sequence_num;
/* Keep tag within the limit supported by the pml */
tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* Mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
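/*
 * Worked example (values assumed for illustration only, they are not taken
 * from the original headers): with PTPCOLL_TAG_OFFSET == 100,
 * PTPCOLL_TAG_FACTOR == 10 and tag_mask == 0x3fff, sequence numbers
 * 0, 1, 2 map to tags -100, -110, -120.  The mask keeps the magnitude
 * within the range the PML supports, and the negation keeps collective
 * traffic out of the non-negative tag space used by application code.
 */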
if (0 < n_extra_sources) { /* EXCHANGE_NODE case */
collreq->need_toserv_extra = 1;
extra_sources_array = my_exchange_node->rank_extra_sources_array;
/* I will participate in the exchange (of the algorithm) -
* wait for signal from extra process */
for (k = 0; k < n_extra_sources; ++k) {
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];
rc = MCA_PML_CALL(irecv(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
comm, &(requests[k])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
}
num_reqs = n_extra_sources;
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = num_reqs;
collreq->exchange = 0;
return BCOL_FN_STARTED;
}
} else {
collreq->need_toserv_extra = 0;
}
/* loop over exchange send/recv pairs */
for (exchange = 0; exchange < n_exchange; ++exchange) {
for (k = 0; k < tree_order - 1; ++k) {
/* rank of exchange partner within the group */
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];
assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));
/* send to partner - we will wait for completion, as send
* completion is at the MPI level, and will not
* incur network level completion costs
*/
rc = MCA_PML_CALL(isend(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD,
comm, &(requests[k * 2 + 1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
pair_comm_rank, rank_exchanges[exchange][k]));
/* receive from partner */
rc = MCA_PML_CALL(irecv(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
comm, &(requests[k * 2])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
pair_comm_rank, rank_exchanges[exchange][k]));
}
num_reqs = 2 * (tree_order - 1);
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = num_reqs;
collreq->exchange = exchange + 1;
return BCOL_FN_STARTED;
}
}
/* If non power of 2, may need to send message to "extra" proc */
if (0 < n_extra_sources) { /* EXCHANGE_NODE case */
for (k = 0; k < n_extra_sources; ++k) {
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];
rc = MCA_PML_CALL(isend(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD,
comm, &(requests[k])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
}
num_reqs = n_extra_sources;
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = num_reqs;
collreq->exchange = n_exchange;
collreq->need_toserv_extra = 0;
return BCOL_FN_STARTED;
}
}
opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq);
return BCOL_FN_COMPLETE;
}
static int bcol_ptpcoll_barrier_recurs_knomial_new_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
mca_bcol_ptpcoll_module_t *ptpcoll_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *my_exchange_node =
&ptpcoll_module->knomial_exchange_tree;
int rc, k, tag, pair_comm_rank, exchange,
tree_order = my_exchange_node->tree_order, num_reqs,
n_exchange = my_exchange_node->n_exchanges, completed,
n_extra_sources = my_exchange_node->n_extra_sources;
ompi_communicator_t *comm =
ptpcoll_module->super.sbgp_partner_module->group_comm;
int *extra_sources_array,
**rank_exchanges = my_exchange_node->rank_exchanges;
mca_bcol_ptpcoll_collreq_t *collreq =
(mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;
ompi_request_t **requests = collreq->requests;
num_reqs = collreq->num_reqs;
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
return BCOL_FN_STARTED;
}
/* Continue loop over exchange send/recv pairs */
tag = collreq->tag;
for (exchange = collreq->exchange; exchange < n_exchange; ++exchange) {
for (k = 0; k < tree_order - 1; ++k) {
/* rank of exchange partner within the group */
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];
assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));
/* send to partner - we will wait for completion, as send
* completion is at the MPI level, and will not
* incur network level completion costs
*/
rc = MCA_PML_CALL(isend(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD,
comm, &(requests[k * 2 + 1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
pair_comm_rank, rank_exchanges[exchange][k]));
/* receive from partner */
rc = MCA_PML_CALL(irecv(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
comm, &(requests[k * 2])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
pair_comm_rank, rank_exchanges[exchange][k]));
}
num_reqs = 2 * (tree_order - 1);
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->num_reqs = num_reqs;
collreq->exchange = exchange + 1;
return BCOL_FN_STARTED;
}
}
/* If non power of 2, may need to send message to "extra" proc */
if (collreq->need_toserv_extra) { /* EXCHANGE_NODE case */
extra_sources_array = my_exchange_node->rank_extra_sources_array;
for (k = 0; k < n_extra_sources; ++k) {
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];
rc = MCA_PML_CALL(isend(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD,
comm, &(requests[k])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
}
num_reqs = n_extra_sources;
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->num_reqs = num_reqs;
collreq->exchange = n_exchange;
collreq->need_toserv_extra = 0;
return BCOL_FN_STARTED;
}
}
return BCOL_FN_COMPLETE;
}
/****************************************** Extra node Barrier ******************************************/
static int bcol_ptpcoll_barrier_recurs_knomial_extra_new(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
uint64_t sequence_number;
int rc, tag, pair_comm_rank,
completed, num_reqs = 2;
mca_bcol_ptpcoll_module_t *ptpcoll_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
netpatterns_k_exchange_node_t *my_exchange_node =
&ptpcoll_module->knomial_exchange_tree;
ompi_communicator_t *comm =
ptpcoll_module->super.sbgp_partner_module->group_comm;
int *extra_sources_array = my_exchange_node->rank_extra_sources_array;
ompi_request_t **requests;
opal_free_list_item_t *item;
mca_bcol_ptpcoll_collreq_t *collreq;
item = opal_free_list_wait (&ptpcoll_module->collreqs_free);
if (OPAL_UNLIKELY(NULL == item)) {
PTPCOLL_ERROR(("Free list waiting failed."));
return OMPI_ERR_OUT_OF_RESOURCE;
}
collreq = (mca_bcol_ptpcoll_collreq_t *) item;
input_args->bcol_opaque_data = (void *) collreq;
requests = collreq->requests;
/* TAG Calculation */
sequence_number = input_args->sequence_num;
/* Keep the tag within the limit supported by the PML */
tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* Mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
pair_comm_rank =
ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[0]];
rc = MCA_PML_CALL(isend(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD,
comm, &(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
rc = MCA_PML_CALL(irecv(
NULL, 0, MPI_INT,
pair_comm_rank, tag,
comm, &(requests[1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
return BCOL_FN_STARTED;
}
opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq);
return BCOL_FN_COMPLETE;
}
/*************************************** Recursive-Doubling ***************************************/
/**************************************************************************************************/
static int bcol_ptpcoll_barrier_recurs_dbl_new(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
uint64_t sequence_number;
mca_bcol_ptpcoll_module_t *ptp_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;
int rc, my_extra_partner_comm_rank = 0, exchange, completed,
pair_comm_rank, pair_rank, delta, tag, num_reqs = 0,
my_rank = ptp_module->super.sbgp_partner_module->my_index,
n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;
ompi_request_t **requests;
opal_free_list_item_t *item;
mca_bcol_ptpcoll_collreq_t *collreq;
item = opal_free_list_wait (&ptp_module->collreqs_free);
if (OPAL_UNLIKELY(NULL == item)) {
PTPCOLL_ERROR(("Free list waiting failed."));
return OMPI_ERR_OUT_OF_RESOURCE;
}
collreq = (mca_bcol_ptpcoll_collreq_t *) item;
input_args->bcol_opaque_data = (void *) collreq;
assert(PTPCOLL_EXTRA != ptp_module->pow_2type);
requests = collreq->requests;
/* TAG Calculation */
sequence_number = input_args->sequence_num;
/* keep the tag within the limit supported by the PML */
tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
if (PTPCOLL_PROXY == ptp_module->pow_2type) {
/* I will participate in the exchange - wait for signal from extra
** process */
/*
* recv from extra rank - my_extra_partner_comm_rank
* can use blocking recv, as no other communications
* need to take place.
*/
my_extra_partner_comm_rank =
ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];
collreq->need_toserv_extra = 1;
collreq->extra_partner_rank = my_extra_partner_comm_rank;
rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
my_extra_partner_comm_rank, tag, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for irecv failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = 1;
collreq->exchange = 0;
return BCOL_FN_STARTED;
}
} else {
collreq->need_toserv_extra = 0;
}
/* Loop over exchange send/recv pairs */
delta = 1;
for (exchange = 0; exchange < n_exchange; ++exchange) {
/* rank of exchange partner within the group */
pair_rank = my_rank ^ delta;
/* rank within the communicator */
pair_comm_rank =
ptp_module->super.sbgp_partner_module->group_list[pair_rank];
/* send to partner - we will wait for completion, as send
* completion is at the MPI level, and will not
* incur network level completion costs
*/
rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
++num_reqs;
/* receive from partner */
rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
pair_comm_rank, tag, comm,
&(requests[1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
++num_reqs;
PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
exchange, pair_rank, pair_comm_rank));
/* test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = num_reqs;
collreq->exchange = exchange + 1;
assert(collreq->exchange >= 0);
return BCOL_FN_STARTED;
}
delta <<= 1; /* delta *= 2 */
}
if (PTPCOLL_PROXY == ptp_module->pow_2type) {
/* send - let the extra rank know that we are done */
rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
my_extra_partner_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for isend failed."));
return rc;
}
if (!completed) {
collreq->tag = tag;
collreq->num_reqs = 1;
collreq->need_toserv_extra = 0;
collreq->exchange = n_exchange;
return BCOL_FN_STARTED;
}
}
opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq);
return BCOL_FN_COMPLETE;
}
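/*
 * Illustrative sketch of the pairing used above (example only, not part of
 * the original file): at round r each rank exchanges a zero-byte message
 * with (my_rank XOR 2^r).  With 8 ranks, rank 3 talks to 2, then 1, then 7;
 * after log2(8) = 3 rounds every rank has transitively synchronized with
 * every other rank, which is the barrier condition.
 */
#if 0 /* example only - never compiled */
static void recurs_dbl_pairing_example(int my_rank, int group_size)
{
    /* assumes group_size is an exact power of two */
    for (int delta = 1; delta < group_size; delta <<= 1) {
        int pair_rank = my_rank ^ delta; /* partner for this round */
        PTPCOLL_VERBOSE(10, ("delta %d: rank %d pairs with rank %d",
                             delta, my_rank, pair_rank));
    }
}
#endif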
static int bcol_ptpcoll_barrier_recurs_dbl_new_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
mca_bcol_ptpcoll_module_t *ptp_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;
int rc, exchange, pair_comm_rank, tag,
pair_rank, delta, num_reqs, completed,
my_rank = ptp_module->super.sbgp_partner_module->my_index,
n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;
ompi_request_t **requests;
mca_bcol_ptpcoll_collreq_t *collreq =
(mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;
num_reqs = collreq->num_reqs;
requests = collreq->requests;
/* test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
return BCOL_FN_STARTED;
}
assert(PTPCOLL_EXTRA != ptp_module->pow_2type);
/* Continue loop over exchange send/recv pairs */
num_reqs = 0;
tag = collreq->tag;
exchange = collreq->exchange;
assert(exchange >= 0);
delta = 1 << exchange;
for (; exchange < n_exchange; ++exchange) {
/* rank of exchange partner within the group */
pair_rank = my_rank ^ delta;
/* rank within the communicator */
pair_comm_rank =
ptp_module->super.sbgp_partner_module->group_list[pair_rank];
/* send to partner - we will wait for completion, as send
* completion is at the MPI level, and will not
* incur network level completion costs
*/
rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
pair_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
++num_reqs;
/* receive from partner */
rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
pair_comm_rank, tag, comm,
&(requests[1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
++num_reqs;
PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
exchange, pair_rank, pair_comm_rank));
/* test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
collreq->num_reqs = num_reqs;
collreq->exchange = exchange + 1;
assert(collreq->exchange >= 0);
return BCOL_FN_STARTED;
}
delta <<= 1; /* delta *= 2 */
}
/* if non power of 2, may need to send message to "extra" proc */
if (collreq->need_toserv_extra) {
/* send - let the extra rank know that we are done */
rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
collreq->extra_partner_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("ISend failed."));
return rc;
}
completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for isend failed."));
return rc;
}
if (!completed) {
collreq->num_reqs = 1;
collreq->need_toserv_extra = 0;
collreq->exchange = n_exchange;
return BCOL_FN_STARTED;
}
}
return BCOL_FN_COMPLETE;
}
/****************************************** Extra node Barrier ******************************************/
static int bcol_ptpcoll_barrier_recurs_dbl_extra_new(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
uint64_t sequence_number;
int rc, completed, num_reqs = 2,
tag, my_extra_partner_comm_rank;
ompi_request_t **requests;
opal_free_list_item_t *item;
mca_bcol_ptpcoll_collreq_t *collreq;
mca_bcol_ptpcoll_module_t *ptp_module =
(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;
item = opal_free_list_wait (&ptp_module->collreqs_free);
if (OPAL_UNLIKELY(NULL == item)) {
PTPCOLL_ERROR(("Free list waiting failed."));
return OMPI_ERR_OUT_OF_RESOURCE;
}
collreq = (mca_bcol_ptpcoll_collreq_t *) item;
input_args->bcol_opaque_data = (void *) collreq;
requests = collreq->requests;
/* TAG Calculation */
sequence_number = input_args->sequence_num;
/* Keep the tag within the limit supported by the PML */
tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level flags */
tag = -tag;
/* I will not participate in the exchange - just "register" here by
 * signaling my proxy rank that I have arrived */
my_extra_partner_comm_rank =
ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];
rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
my_extra_partner_comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[0])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Send failed."));
return rc;
}
/* Recv signal that the rest are done - my_extra_partner_comm_rank */
rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
my_extra_partner_comm_rank, tag, comm,
&(requests[1])));
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("IRecv failed."));
return rc;
}
/* Test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
return BCOL_FN_STARTED;
}
opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq);
return BCOL_FN_COMPLETE;
}
/* We use the same progress function for both cases (recursive doubling and K-nomial) */
static int bcol_ptpcoll_barrier_extra_node_progress(
bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
ompi_request_t **requests;
int rc, completed, num_reqs = 2;
mca_bcol_ptpcoll_collreq_t *collreq =
(mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;
requests = collreq->requests;
/* test for completion */
completed =
mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
PTPCOLL_ERROR(("Test for all failed."));
return rc;
}
if (!completed) {
return BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
static int mca_bcol_ptpcoll_barrier_setup(mca_bcol_base_module_t *super, int bcoll_type)
{
netpatterns_k_exchange_node_t *my_exchange_node;
mca_bcol_ptpcoll_module_t * ptpcoll_module =
(mca_bcol_ptpcoll_module_t *) super;
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
comm_attribs.bcoll_type = bcoll_type;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
switch(mca_bcol_ptpcoll_component.barrier_alg) {
case 1:
if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) {
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_barrier_recurs_dbl_extra_new,
bcol_ptpcoll_barrier_extra_node_progress);
break;
}
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_barrier_recurs_dbl_new,
bcol_ptpcoll_barrier_recurs_dbl_new_progress);
break;
case 2:
my_exchange_node = &ptpcoll_module->knomial_exchange_tree;
if (my_exchange_node->n_extra_sources > 0 &&
EXTRA_NODE == my_exchange_node->node_type) {
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_barrier_recurs_knomial_extra_new,
bcol_ptpcoll_barrier_extra_node_progress);
break;
}
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_barrier_recurs_knomial_new,
bcol_ptpcoll_barrier_recurs_knomial_new_progress);
break;
default:
PTPCOLL_ERROR(("Wrong barrier_alg flag value."));
}
return OMPI_SUCCESS;
}
int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super)
{
return mca_bcol_ptpcoll_barrier_setup(super, BCOL_SYNC);
}
int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super)
{
return mca_bcol_ptpcoll_barrier_setup(super, BCOL_BARRIER);
}

The diff for this file is not shown because it is too large.

View file

@ -1,868 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_BCAST_H
#define MCA_BCOL_PTPCOLL_BCAST_H
#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
BEGIN_C_DECLS
int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super);
int bcol_ptpcoll_bcast_k_nomial_anyroot (bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* macros */
#define K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER( \
radix_mask_pow, \
my_group_index, group_size, group_list, \
data_buffer, segment_size, count, tag, \
comm, send_requests, num_pending_sends) \
do { \
int rc = OMPI_SUCCESS; \
int dst; \
int comm_dst; \
int send_size; \
int send_offset; \
int delta; \
int dst_boundary_rank; \
int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0; \
\
while(radix_mask_pow >= 0) { \
/* For each level of tree, do sends */ \
dst = my_group_index ^ radix_mask; \
comm_dst = group_list[dst]; \
\
dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow)); \
\
send_offset = segment_size * dst_boundary_rank; \
/* Pasha: make sure that we handle the corner cases */ \
delta = count - send_offset; \
if (delta <= 0) { \
send_size = 0; /* we still have to send something, otherwise the peer will hang */ \
} else { \
/* the tail case */ \
send_size = (int) \
(delta - (int)segment_size * radix_mask) < 0 ? delta : \
(int)segment_size * radix_mask; \
} \
\
/* Non blocking send .... */ \
PTPCOLL_VERBOSE(9 , \
("Bcast p2s, Isend to %d[%d],count %d,tag %d,addr %p [%p] send_size %d,send_offset %d, radix %d %d",\
dst, comm_dst, count, tag, \
data_buffer, (void *)((unsigned char *)data_buffer + (size_t)send_offset), \
send_size, \
send_offset, \
radix_mask, \
radix_mask_pow \
)); \
rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + (size_t)send_offset), \
send_size, MPI_BYTE, \
comm_dst, tag, \
MCA_PML_BASE_SEND_STANDARD, comm, \
&(send_requests[*num_pending_sends]))); \
PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends])); \
if( OMPI_SUCCESS != rc ) { \
PTPCOLL_VERBOSE(10, ("Failed to isend data")); \
return OMPI_ERROR; \
} \
++(*num_pending_sends); \
radix_mask >>= 1; \
radix_mask_pow--; \
} \
} while(0)
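/*
 * Worked example for the scatter above (numbers assumed for illustration):
 * a group of 8 ranks, my_group_index == 0, radix_mask_pow == 2 on entry
 * (radix_mask == 4), segment size S.  The loop then performs:
 *   radix_mask 4: dst = 0^4 = 4, dst_boundary_rank = 4, send segments 4..7
 *                 (offset 4*S, up to 4*S bytes)
 *   radix_mask 2: dst = 0^2 = 2, dst_boundary_rank = 2, send segments 2..3
 *                 (offset 2*S, up to 2*S bytes)
 *   radix_mask 1: dst = 0^1 = 1, dst_boundary_rank = 1, send segment 1
 *                 (offset S, up to S bytes)
 * The delta check trims (or zeroes) the last block when count does not
 * reach the segment boundary.
 */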
#define NARRAY_SCATTER_NB(narray_node, process_shift, group_size, \
data_buffer, base_block_size, count, tag, comm, send_requests, \
num_pending_sends) \
do { \
int n, rc = OMPI_SUCCESS; \
int dst; \
int comm_dst; \
int offset; \
int size_count = count; \
\
/* Send out data to all relevant children */ \
for (n = 0; n < narray_node->n_children && size_count > 0; n++) { \
\
dst = narray_node->children_ranks[n] + process_shift; \
if (dst >= group_size) { \
dst -= group_size; \
} \
\
comm_dst = group_list[dst]; \
offset = n * base_block_size; \
size_count -= base_block_size; \
if (OPAL_UNLIKELY(size_count < 0)) { \
count = base_block_size + size_count; \
} else { \
count = base_block_size; \
} \
\
/* Non blocking send .... */ \
PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \
dst, comm_dst, count, tag, \
data_buffer)); \
rc = MCA_PML_CALL(isend((void *)((char *)data_buffer + (size_t)offset), count, MPI_BYTE,\
comm_dst, tag, \
MCA_PML_BASE_SEND_STANDARD, comm, \
&(send_requests[*num_pending_sends]))); \
if( OMPI_SUCCESS != rc ) { \
PTPCOLL_VERBOSE(10, ("Failed to isend data")); \
return OMPI_ERROR; \
} \
++(*num_pending_sends); \
} \
} while(0)
#define NARRAY_SCATTER_B(narray_node, process_shift, group_size, \
data_buffer, base_block_size, count, tag, comm, send_requests, \
num_pending_sends, completed) \
do { \
NARRAY_SCATTER_NB(narray_node, process_shift, group_size, \
data_buffer, base_block_size, count, tag, comm, send_requests, \
num_pending_sends); \
if (*num_pending_sends > 0) { \
completed = mca_bcol_ptpcoll_test_all_for_match(num_pending_sends, send_requests, &rc); \
if (OMPI_SUCCESS != rc) { \
return OMPI_ERROR; \
} \
} else { \
completed = 1; \
} \
} while (0)
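/*
 * Worked example for NARRAY_SCATTER_NB/_B (numbers assumed for
 * illustration): with count == 10, base_block_size == 4 and a narray_node
 * that has 3 children, child 0 receives bytes [0,4), child 1 receives
 * [4,8) and child 2 receives the 2-byte remainder [8,10); size_count
 * going negative is what triggers the short final send.
 */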
#define CHECK_IF_ROOT_OR_VROOT(module, i) \
(module->pow_2 == module->ml_mem.ml_buf_desc[i].radix_mask_pow)
/* inline functions */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(
mca_bcol_ptpcoll_module_t *ptpcoll_module,
void *data_buffer, int count, int tag,
int extra_peer, ompi_communicator_t *comm,
int *active_requests, ompi_request_t **requests)
{
int rc = OMPI_SUCCESS;
int completed = 0;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
/* tag is -1 already */
/* send all of the data to the extra peer */
PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra to %d tag %d",
extra_peer, tag));
rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
group_list[extra_peer], tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
++(*active_requests);
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
/* we have to store the iteration number somewhere */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_send_n_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
void *data_buffer, int count, int tag,
int *extra_peers, int num_peers, int skip,
ompi_communicator_t *comm,
int *active_requests, ompi_request_t **requests)
{
int rc = OMPI_SUCCESS;
int completed = 0;
int i;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
/* send all of the data to each extra peer */
for (i = 0; i < num_peers; i++) {
PTPCOLL_VERBOSE(10, ("send_n_extra to %d tag %d",
extra_peers[i], tag));
if (extra_peers[i] == skip) {
PTPCOLL_VERBOSE(10, ("SKIP"));
continue;
}
rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
group_list[extra_peers[i]], tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
++(*active_requests);
}
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
/* we have to store the iteration number somewhere */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_gather_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index, void *data_buffer, int count, int base_block_size)
{
int rc;
int completed = 0; /* not completed */
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int i;
int *iteration =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
void *curr_data_sbuffer = NULL,
*curr_data_rbuffer = NULL;
int radix_mask_pow = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow;
int delta;
int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_gather_anyroot %d %d %d",
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
ptpcoll_module->pow_2,
1 << ptpcoll_module->pow_2));
/* we assume that iteration #iteration has already been completed by the probe */
for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
i < ptpcoll_module->pow_2; i++) {
int pow2 = 1 << i;
int peer_index = my_group_index ^ pow2;
int comm_rank = group_list[peer_index];
int slen, rlen,
send_offset,
recv_offset;
if (i > radix_mask_pow) {
/* *active_requests = 0; */
/* send - receive data from the peer */
slen = rlen = pow2 * base_block_size;
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);
delta = count - recv_offset;
if (delta > 0) {
if (delta < rlen) {
/* recv the tail */
rlen = delta;
}
PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
pow2,
1 << ptpcoll_module->pow_2,
curr_data_rbuffer,
recv_offset,
rlen,
comm_rank));
rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
comm_rank, tag, comm, &requests[*active_requests]));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to receive data"));
return OMPI_ERROR;
}
++(*active_requests);
}
delta = count - send_offset;
if (delta > 0) {
if (delta < slen) {
/* send only the tail */
slen = delta;
}
PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
pow2,
1 << ptpcoll_module->pow_2,
curr_data_sbuffer,
send_offset,
slen,
comm_rank));
rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
comm_rank, tag,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
++(*active_requests);
}
if (*active_requests > 0) {
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
*iteration = i;
/* we have to store the iteration number somewhere */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
}
} else if (i == radix_mask_pow) {
/* only receive data */
rlen = pow2 * base_block_size;
recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);
delta = count - recv_offset;
if (0 >= delta) {
/* we have nothing to receive, skip the iteration */
continue;
}
if (delta < rlen) {
/* recv the tail */
rlen = delta;
}
/* receive data from the peer */
PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
pow2,
1 << ptpcoll_module->pow_2,
curr_data_rbuffer,
recv_offset,
rlen,
comm_rank));
rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
comm_rank, tag, comm, &(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to receive data"));
return OMPI_ERROR;
}
++(*active_requests);
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
*iteration = i;
PTPCOLL_VERBOSE(10, ("Recv was not completed"));
/* we have to store the iteration number somewhere */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
PTPCOLL_VERBOSE(10, ("Recv was completed"));
} else if (i < radix_mask_pow) {
/* Only send data */
slen = pow2 * base_block_size;
send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
delta = count - send_offset;
if (0 >= delta) {
/* we have nothing to send, skip the iteration */
continue;
}
if (delta < slen) {
slen = delta;
}
PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
pow2,
1 << ptpcoll_module->pow_2,
curr_data_sbuffer,
send_offset,
slen,
comm_rank));
rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
comm_rank, tag, MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
++(*active_requests);
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
*iteration = i;
/* we have to store the iteration number somewhere */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
}
}
return BCOL_FN_COMPLETE;
}
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index, void *data_buffer, int count, int base_block_size)
{
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
int rc;
int completed = 0; /* not completed */
int comm_root;
int i;
int *radix_mask_pow =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow);
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_status_public_t status;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int pow2_group_size = ptpcoll_module->pow_2num;
int pow2_distance;
int my_left_boundary_rank;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int group_root_index = 0;
void *curr_data_buffer = NULL;
int tag =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
int recv_count = 0;
int *coll_status =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;
assert(0 == *active_requests);
PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot"));
for (i = 0; i < cm->num_to_probe &&
0 == completed; i++) {
MCA_PML_CALL(iprobe(MPI_ANY_SOURCE, tag,
comm, &completed, &status));
PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d",
tag));
}
/* the function always returns OMPI_SUCCESS, so we don't check return code */
if (0 == completed) {
PTPCOLL_VERBOSE(10, ("IPROBE was not matched"));
/* No data was received, return no match error */
return BCOL_FN_NOT_STARTED;
}
comm_root = status.MPI_SOURCE;
PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is %d", comm_root));
/* For proxy we have to check if we got something from extra node */
if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) {
if (group_list[ptpcoll_module->proxy_extra_index] == comm_root) {
PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on communicator is extra node %d",
comm_root));
/* scatter the data among other peer in the pow2 group */
*radix_mask_pow = ptpcoll_module->pow_2;
pow2_distance = ptpcoll_module->pow_2 - 1;
curr_data_buffer = data_buffer;
recv_count = count;
goto PR_SCATTHER;
}
}
/* Find group index for communicator root of the data */
group_root_index = get_group_index_and_distance_for_binomial
(my_group_index, comm_root, pow2_group_size, group_list, &pow2_distance);
if (OPAL_UNLIKELY(group_root_index < 0)) {
PTPCOLL_ERROR(("Fatal error, no group root index found, my id %d, pow2_g_size %d comm_root %d",
my_group_index, pow2_group_size, comm_root));
return OMPI_ERROR;
}
PTPCOLL_VERBOSE(10, ("Group root index is %d distance is %d",
group_root_index, pow2_distance));
/* Use group_root_index to calculate the */
/* Post receive that will fetch the data */
/* Pasha: Who is packing data ?
Should I assume that we get contiguous buffer ?
Or should I pack by myself
===================================================================================================
=== On this stage I assume that data is contiguous. So I use MPI_BYTE datatype and COUNT = size ===
===================================================================================================
*/
recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */
my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance );
curr_data_buffer = (void *)((unsigned char *)data_buffer +
(size_t) base_block_size * my_left_boundary_rank);
*radix_mask_pow = pow2_distance;
pow2_distance--;
PR_SCATTHER:
PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], "
"recv_count %d, tag %d, addr %p, offset %d, pow2_distace %d",
comm_root, group_root_index, recv_count,
tag, curr_data_buffer,
my_group_index * base_block_size, pow2_distance));
rc = MCA_PML_CALL(recv(curr_data_buffer, recv_count, MPI_BYTE,
comm_root, tag, comm, MPI_STATUS_IGNORE));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to receive data"));
return OMPI_ERROR;
}
PTPCOLL_VERBOSE(10, ("Bcast, Data was received"));
/* Sending forward the data over K-nomial tree */
*coll_status = PTPCOLL_SCATTER_STARTED;
K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
pow2_distance,
my_group_index, group_size, group_list,
data_buffer, base_block_size,
count, tag, comm, requests,
active_requests);
/* Since the next step (gather) does not really require
completion on scatter , we may return complete */
return BCOL_FN_COMPLETE;
}
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_binomial_root_to_src(int group_root, int my_rank,
int pow2_size, int group_size, int *distance)
{
int root, relative_rank, src,
pow2_distance = 0, i;
if (group_root < pow2_size) {
root = group_root;
} else {
/* the source of the data is an extra node;
   the real root is represented by some rank from
   the pow2 group */
root = group_root - pow2_size;
/* shortcut for the case when my rank is root for the group */
if (my_rank == root) {
*distance = -1;
return group_root;
}
}
relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size :
my_rank - root;
for (i = 1; i < pow2_size; i<<=1, pow2_distance++) {
if (relative_rank & i) {
src = my_rank ^ i;
if (src >= pow2_size)
src -= pow2_size;
*distance = pow2_distance;
return src;
}
}
/* error case */
*distance = -1;
return -1;
}
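/*
 * Illustrative example (assumed values): with pow2_size == 8 and
 * group_root == 0, relative_rank equals my_rank and the loop clears the
 * lowest set bit to find the source: rank 5 (101b) receives from rank 4
 * at distance 0, rank 6 (110b) from rank 4 at distance 1, and rank 4
 * (100b) from the root at distance 2.  A group_root >= pow2_size means
 * the data originated at an extra node and is proxied by rank
 * (group_root - pow2_size).
 */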
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index, void *data_buffer, int count, int base_block_size)
{
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
int rc;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int tmp_radix_mask_pow =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow - 1;
int tag =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
int *status =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;
PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot"));
if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
requests, &rc)) {
PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
PTPCOLL_VERBOSE(10, ("Bcast, Data was received"));
/* Forward the data over the binomial tree */
*status = PTPCOLL_SCATTER_STARTED;
K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
tmp_radix_mask_pow,
my_group_index, group_size, group_list,
data_buffer, base_block_size,
count, tag, comm, requests,
active_requests);
return BCOL_FN_COMPLETE;
}
#define NARRAY_BLOCK_SIZE(size, module, level_size) \
((size + (module)->full_narray_tree_num_leafs - 1) / \
(module)->full_narray_tree_num_leafs) * \
((module)->full_narray_tree_num_leafs / \
((0 == level_size) ? \
mca_bcol_ptpcoll_component.narray_knomial_radix : \
level_size))
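/*
 * Worked example (assumed values): with count == 1000,
 * full_narray_tree_num_leafs == 8 and level_size == 2 the macro yields
 * ceil(1000/8) * (8/2) == 125 * 4 == 500 bytes per block at that level;
 * a level_size of 0 falls back to the configured narray_knomial_radix.
 */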
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
int buffer_index, void *data_buffer, int count, int process_shift,
int relative_group_index)
{
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
int rc;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
int *status =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;
int scatter_count = 0;
int offset = 0;
int base_block_size = 0;
void *curr_data_buffer = NULL;
PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_narray_test_and_scatter_known_root"));
if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
requests, &rc)) {
PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
/* Forward the data over the N-array tree */
*status = PTPCOLL_SCATTER_STARTED;
if(0 == relative_group_index) {
scatter_count = count;
} else {
scatter_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
ptpcoll_module->narray_knomial_node[relative_group_index].level_size);
}
offset = scatter_count *
ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level;
/* make sure that we do not overrun memory */
if (OPAL_UNLIKELY(offset + scatter_count > count)) {
scatter_count = count - offset;
}
PTPCOLL_VERBOSE(10, ("Bcast, Data was received %d %d %d",
scatter_count,
ptpcoll_module->narray_knomial_node[relative_group_index].level_size,
ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level));
curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset);
/* calculating scatter block size for next level of tree */
base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
ptpcoll_module->narray_knomial_node[relative_group_index].level_size *
mca_bcol_ptpcoll_component.narray_knomial_radix);
PTPCOLL_VERBOSE(10, ("scatter_known_rootaaa %d %d %d %d %d",scatter_count, offset, base_block_size,
ptpcoll_module->narray_knomial_node[relative_group_index].level_size /mca_bcol_ptpcoll_component.narray_knomial_radix,
ptpcoll_module->full_narray_tree_num_leafs));
NARRAY_SCATTER_NB((&ptpcoll_module->narray_knomial_node[relative_group_index]),
process_shift, ptpcoll_module->full_narray_tree_size,
curr_data_buffer, base_block_size, scatter_count, tag, comm,
requests, active_requests);
/* Bummer, I tried to prevent this, special case for virtual root */
if(0 == relative_group_index) {
if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
requests, &rc)) {
PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
*status = PTPCOLL_ROOT_SEND_STARTED;
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
}
return BCOL_FN_COMPLETE;
}
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_knomial_gather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
const int buffer_index, void *data_buffer, const int count,
const int relative_group_index)
{
int completed = 0; /* not completed */
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int blocks_in_step =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask;
int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
int group_size = ptpcoll_module->full_narray_tree_size;
int i, k,
rc,
len, slen, rlen,
peer, group_peer;
size_t s_offset,
r_offset;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **requests =
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
netpatterns_narray_knomial_tree_node_t *narray_node =
&ptpcoll_module->narray_knomial_node[relative_group_index];
netpatterns_k_exchange_node_t *k_node =
&narray_node->k_node;
mca_bcol_ptpcoll_component_t *cm =
&mca_bcol_ptpcoll_component;
size_t base_block_size =
NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size);
PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_narray_knomial_gather %d %d %d %d %d %d %d",
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
base_block_size, count, narray_node->level_size,
relative_group_index, k_node->n_exchanges, tag));
/* we assume that iteration #iteration has already been completed by the probe */
for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) {
len = base_block_size * blocks_in_step;
for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
group_peer = my_group_index +
(k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
if (group_peer >= group_size) {
group_peer -= group_size;
} else if (group_peer < 0) {
group_peer += group_size;
}
peer = group_list[group_peer];
r_offset = (size_t)k_node->rank_exchanges[i][k] / blocks_in_step *
len;
/* check that we do not run past the message boundary */
if (OPAL_UNLIKELY(r_offset + len > (size_t)count)) {
rlen = count - r_offset;
if (OPAL_UNLIKELY(rlen <= 0)) {
continue;
}
} else {
rlen = len;
}
PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p offset %d len %d %d %d tag %d",
peer, data_buffer, r_offset, rlen, len, blocks_in_step, tag));
rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + r_offset),
rlen, MPI_BYTE,
peer, tag, comm, &requests[*active_requests]));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to receive data"));
return OMPI_ERROR;
}
++(*active_requests);
}
for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
group_peer = my_group_index +
(k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
if (group_peer >= group_size) {
group_peer -= group_size;
} else if (group_peer < 0) {
group_peer += group_size;
}
peer = group_list[group_peer];
s_offset = (size_t)narray_node->rank_on_level / blocks_in_step *
len;
/* check that we do not run past the message boundary */
if (OPAL_UNLIKELY(s_offset + len > (size_t)count)) {
slen = count - s_offset;
if (OPAL_UNLIKELY(slen <= 0)) {
continue;
}
} else {
slen = len;
}
PTPCOLL_VERBOSE(10, ("Send data from %d, addr %p offset %d len %d %d %d tag %d",
peer, data_buffer, s_offset, slen, len, blocks_in_step, tag));
rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + s_offset),
slen, MPI_BYTE,
peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[*active_requests])));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
++(*active_requests);
}
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
if (0 == completed) {
/* cache data for next iteration */
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration =
i; /* why not to store step for next iteration ?! */
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask =
blocks_in_step * cm->narray_knomial_radix;
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
}
return BCOL_FN_COMPLETE;
}
END_C_DECLS
#endif

View file

@ -1,174 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll.h"
#include "ompi/mca/bcol/base/base.h"
#include "bcol_ptpcoll_mca.h"
#include "bcol_ptpcoll_utils.h"
/*
* Public string showing the bcol ptpcoll V2 component version number
*/
const char *mca_bcol_ptpcoll_component_version_string =
"Open MPI bcol - ptpcoll collective MCA component version " OMPI_VERSION;
/*
* Local functions
*/
static int ptpcoll_open(void);
static int ptpcoll_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component = {
/* First, fill in the super */
{
/* First, the mca_component_t struct containing meta
information about the component itself */
.bcol_version = {
MCA_BCOL_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "ptpcoll",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = ptpcoll_open,
.mca_close_component = ptpcoll_close,
.mca_register_component_params = mca_bcol_ptpcoll_register_mca_params,
},
/* Initialization / querying functions */
.collm_init_query = mca_bcol_ptpcoll_init_query,
.collm_comm_query = mca_bcol_ptpcoll_comm_query,
.init_done = false,
.need_ordering = false,
},
/* component specific */
};
static void
collreq_construct(mca_bcol_ptpcoll_collreq_t *collreq)
{
collreq->requests = NULL;
}
static void
collreq_destruct(mca_bcol_ptpcoll_collreq_t *collreq)
{
if (NULL != collreq->requests) {
free(collreq->requests);
}
}
OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_collreq_t,
opal_free_list_item_t,
collreq_construct,
collreq_destruct);
/*
* Open the component
*/
static int ptpcoll_open(void)
{
return OMPI_SUCCESS;
}
/*
* Close the component
*/
static int ptpcoll_close(void)
{
return OMPI_SUCCESS;
}
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_bcol_ptpcoll_init_query(bool enable_progress_threads,
bool enable_mpi_threads)
{
/* at this stage there is no reason to disqualify this component */
/* done */
return OMPI_SUCCESS;
}
/* memory management routines */
/* allocate memory - this is a no-op function intended to work with
* mpool2, which will use malloc for allocation, if no other allocator
* is available.
*/
void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module)
{
/* do nothing */
return NULL;
}
/*
* register memory - nothing to do
*/
int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module)
{
/* nothing to do */
return OMPI_SUCCESS;
}
/* deregister memory - nothing to do
*/
int bcol_ptpcoll_deregister_memory( void * in_ptr,
struct mca_bcol_base_module_t *bcol_module)
{
/* nothing to do */
return OMPI_SUCCESS;
}
/* free memory - since we don't allocate, we also don't free */
int bcol_ptpcoll_free_memory(void *ptr,
struct mca_bcol_base_module_t *bcol_module)
{
/* nothing to do */
return OMPI_SUCCESS;
}

View file

@ -1,28 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h"
/*
* Fanin routines - no user data
*/
int bcol_ptpcoll_fanin( bcol_function_args_t *input_args,
struct mca_bcol_base_module_t *module)
{
/* local variable */
int ret=OMPI_SUCCESS;
/* mca_bcol_ptpcoll_module_t *ptp_module=(mca_bcol_ptpcoll_module_t *) module; */
/* done */
return ret;
}

View file

@ -1,30 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h"
/*
* Fanin routines - no user data
*/
int bcol_ptpcoll_fanout( bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
/* local variable */
int ret = OMPI_SUCCESS;
/* TBD:
mca_bcol_ptpcoll_module_t *ptp_module=(mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
*/
/* done */
return ret;
}

View file

@ -1,197 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include "bcol_ptpcoll_mca.h"
#include "bcol_ptpcoll.h"
/*
* Local flags
*/
enum {
REGINT_NEG_ONE_OK = 0x01,
REGINT_GE_ZERO = 0x02,
REGINT_GE_ONE = 0x04,
REGINT_NONZERO = 0x08,
REGINT_MAX = 0x88
};
enum {
REGSTR_EMPTY_OK = 0x01,
REGSTR_MAX = 0x88
};
#if 0 /* Pasha: we will be need this function in future */
/*
* utility routine for string parameter registration
*/
static int reg_string(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
const char* default_value, char **storage,
int flags)
{
int index;
*storage = default_value;
index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) {
opal_output(0, "Bad parameter value for parameter \"%s\"",
param_name);
return OMPI_ERR_BAD_PARAM;
}
return OMPI_SUCCESS;
}
#endif
/*
* utility routine for integer parameter registration
*/
static int reg_int(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
int default_value, int *storage, int flags)
{
int index;
*storage = default_value;
index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
return OMPI_SUCCESS;
}
if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
(0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
(0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
opal_output(0, "Bad parameter value for parameter \"%s\"",
param_name);
return OMPI_ERR_BAD_PARAM;
}
return OMPI_SUCCESS;
}
static int reg_bool(const char* param_name,
const char* deprecated_param_name,
const char* param_desc,
bool default_value, bool *storage)
{
int index;
*storage = default_value;
index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, storage);
if (0 > index) {
return index;
}
if (NULL != deprecated_param_name) {
(void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
deprecated_param_name,
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}
return OMPI_SUCCESS;
}
int mca_bcol_ptpcoll_register_mca_params(void)
{
int ret, tmp;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
ret = OMPI_SUCCESS;
#define CHECK(expr) do {\
tmp = (expr); \
if (OMPI_SUCCESS != tmp) ret = tmp; \
} while (0)
CHECK(reg_int("priority", NULL,
"PTPCOLL component priority"
"(from 0(low) to 90 (high))", 90, &cm->super.priority, 0));
CHECK(reg_int("verbose", NULL,
"Output some verbose PTPCOLL information "
"(0 = no output, nonzero = output)", 0, &cm->verbose, REGINT_GE_ZERO));
CHECK(reg_int("k_nomial_radix", NULL,
"The radix of K-Nomial Tree "
"(starts from 2)", 2, &cm->k_nomial_radix, REGINT_GE_ONE));
CHECK(reg_int("narray_radix", NULL,
"The radix of Narray Tree "
"(starts from 2)", 2, &cm->narray_radix, REGINT_GE_ONE));
CHECK(reg_int("narray_knomial_radix", NULL,
"The radix of Narray/Knomial Tree for scatther-gather type algorithms"
"(starts from 2)", 2, &cm->narray_knomial_radix, REGINT_GE_ONE));
CHECK(reg_int("num_to_probe", NULL,
"Number of probe operation in single source data check"
"(starts from 8)", 8, &cm->num_to_probe, REGINT_GE_ONE));
CHECK(reg_int("bcast_small_msg_known_root_alg", NULL,
"Algorithm selection for bcast small messages known root"
"(1 - K-nomial, 2 - N-array)", 1, &cm->bcast_small_messages_known_root_alg,
REGINT_GE_ZERO));
CHECK(reg_int("bcast_large_msg_known_root_alg", NULL,
"Algorithm selection for bcast large messages known root"
"(1 - Binomial scatther-gather, 2 - N-array scather, K-nomial gather)",
1, &cm->bcast_large_messages_known_root_alg, REGINT_GE_ZERO));
CHECK(reg_int("barrier_alg", NULL,
"Algorithm selection for Barrier"
"(1 - Recursive doubling, 2 - Recursive K-ing)",
1, &cm->barrier_alg, REGINT_GE_ZERO));
/* register parameters controlling message fragmentation */
CHECK(reg_int("min_frag_size", NULL,
"Minimum fragment size",
getpagesize(), &cm->super.min_frag_size, REGINT_GE_ONE));
CHECK(reg_int("max_frag_size", NULL,
"Maximum fragment size",
FRAG_SIZE_NO_LIMIT, &cm->super.max_frag_size, REGINT_NONZERO));
CHECK(reg_bool("can_use_user_buffers", NULL,
"User memory can be used by the collective algorithms",
1, &cm->super.can_use_user_buffers));
return ret;
}
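/*
 * Editor's sketch (not part of the removed component): the REGINT_* flags
 * above encode simple range constraints that reg_int() enforces after
 * registering a parameter. The standalone helper below mirrors those checks
 * so the flag semantics are easier to see; all names are illustrative only.
 */
static int ptpcoll_sketch_int_value_ok(int value, int flags)
{
    if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == value) {
        return 1;                      /* -1 is accepted early, meaning "no limit" */
    }
    if ((0 != (flags & REGINT_GE_ZERO) && value < 0) ||
        (0 != (flags & REGINT_GE_ONE)  && value < 1) ||
        (0 != (flags & REGINT_NONZERO) && 0 == value)) {
        return 0;                      /* violates the requested constraint */
    }
    return 1;
}
/* Example: a radix of 0 fails REGINT_GE_ONE, while -1 with
 * REGINT_NEG_ONE_OK set is accepted even if REGINT_GE_ONE is also given. */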

View file

@ -1,20 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_MCA_H
#define MCA_BCOL_PTPCOLL_MCA_H
#include "ompi_config.h"
BEGIN_C_DECLS
int mca_bcol_ptpcoll_register_mca_params(void);
END_C_DECLS
#endif

View file

@ -1,760 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/util/show_help.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/pml/pml.h" /* need this for the max tag size */
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
#include "bcol_ptpcoll_bcast.h"
#include "bcol_ptpcoll_allreduce.h"
#include "bcol_ptpcoll_reduce.h"
#define BCOL_PTP_CACHE_LINE_SIZE 128
/*
* Local functions
*/
static int alloc_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int rc = OMPI_SUCCESS, i = 0;
netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
int n_exchanges = k_node->n_exchanges;
/* Precalculate the allreduce offsets */
if (0 < k_node->n_exchanges) {
ptpcoll_module->allgather_offsets = (int **) calloc (n_exchanges, sizeof(int *));
if (!ptpcoll_module->allgather_offsets) {
return OMPI_ERROR;
}
for (i = 0; i < n_exchanges ; i++) {
ptpcoll_module->allgather_offsets[i] = (int *) calloc (NOFFSETS, sizeof(int));
if (!ptpcoll_module->allgather_offsets[i]){
return OMPI_ERROR;
}
}
}
return rc;
}
static int free_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int rc = OMPI_SUCCESS, i = 0;
netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
int n_exchanges = k_node->n_exchanges;
if (ptpcoll_module->allgather_offsets) {
for (i=0; i < n_exchanges; i++) {
free (ptpcoll_module->allgather_offsets[i]);
}
}
free(ptpcoll_module->allgather_offsets);
ptpcoll_module->allgather_offsets = NULL;
return rc;
}
static void
mca_bcol_ptpcoll_module_construct(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
uint64_t i;
/* Pointer to component */
ptpcoll_module->narray_node = NULL;
ptpcoll_module->allgather_offsets = NULL;
ptpcoll_module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_ptpcoll_component;
ptpcoll_module->super.list_n_connected = NULL;
ptpcoll_module->super.hier_scather_offset = 0;
/* no header support in ptp */
ptpcoll_module->super.header_size = 0;
/* No network context */
ptpcoll_module->super.network_context = NULL;
/* set the upper limit on the tag */
i = 2;
ptpcoll_module->tag_mask = 1;
while ( i <= (uint64_t) mca_pml.pml_max_tag && i > 0) {
i <<= 1;
}
ptpcoll_module->ml_mem.ml_buf_desc = NULL;
ptpcoll_module->tag_mask = i - 1;
}
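/*
 * Editor's sketch (not part of the removed code): the loop in the module
 * constructor above derives a power-of-two tag mask from the PML's maximum
 * tag. The standalone helper below reproduces that arithmetic so the intent
 * is easier to see; the name is illustrative only.
 */
#include <stdint.h>

static uint64_t ptpcoll_sketch_tag_mask(uint64_t pml_max_tag)
{
    uint64_t i = 2;

    /* double until we pass the largest tag the PML supports */
    while (i <= pml_max_tag && i > 0) {
        i <<= 1;
    }
    /* every bit below that power of two is usable for collective tags */
    return i - 1;
}
/* Example: for pml_max_tag = 2^31 - 1 the loop stops at i = 2^31,
 * so the mask is 0x7fffffff, i.e. the full positive tag range. */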
static void
mca_bcol_ptpcoll_module_destruct(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int i;
mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem;
if (NULL != ml_mem->ml_buf_desc) {
/* Release the memory structs that were caching ML memory data */
uint32_t i, j, ci;
for (i = 0; i < ml_mem->num_banks; i++) {
for (j = 0; j < ml_mem->num_buffers_per_bank; j++) {
ci = i * ml_mem->num_buffers_per_bank + j;
if (NULL != ml_mem->ml_buf_desc[ci].requests) {
free(ml_mem->ml_buf_desc[ci].requests);
}
}
}
/* release the buffer descriptor */
free(ml_mem->ml_buf_desc);
ml_mem->ml_buf_desc = NULL;
}
if (NULL != ptpcoll_module->allgather_offsets) {
free_allreduce_offsets_array(ptpcoll_module);
}
if (NULL != ptpcoll_module->narray_node) {
for (i = 0; i < ptpcoll_module->group_size; i++) {
if (NULL != ptpcoll_module->narray_node[i].children_ranks) {
free(ptpcoll_module->narray_node[i].children_ranks);
}
}
free(ptpcoll_module->narray_node);
ptpcoll_module->narray_node = NULL;
}
OBJ_DESTRUCT(&ptpcoll_module->collreqs_free);
if (NULL != ptpcoll_module->super.list_n_connected) {
free(ptpcoll_module->super.list_n_connected);
ptpcoll_module->super.list_n_connected = NULL;
}
for (i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++){
OPAL_LIST_DESTRUCT((&ptpcoll_module->super.bcol_fns_table[i]));
}
if (NULL != ptpcoll_module->kn_proxy_extra_index) {
free(ptpcoll_module->kn_proxy_extra_index);
ptpcoll_module->kn_proxy_extra_index = NULL;
}
if (NULL != ptpcoll_module->alltoall_iovec) {
free(ptpcoll_module->alltoall_iovec);
ptpcoll_module->alltoall_iovec = NULL;
}
if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) {
free(ptpcoll_module->narray_knomial_proxy_extra_index);
ptpcoll_module->narray_knomial_proxy_extra_index = NULL;
}
if (NULL != ptpcoll_module->narray_knomial_node) {
for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) {
netpatterns_cleanup_narray_knomial_tree (ptpcoll_module->narray_knomial_node + i);
}
free(ptpcoll_module->narray_knomial_node);
ptpcoll_module->narray_knomial_node = NULL;
}
netpatterns_cleanup_recursive_knomial_allgather_tree_node(&ptpcoll_module->knomial_allgather_tree);
netpatterns_cleanup_recursive_knomial_tree_node(&ptpcoll_module->knomial_exchange_tree);
}
OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_module_t,
mca_bcol_base_module_t,
mca_bcol_ptpcoll_module_construct,
mca_bcol_ptpcoll_module_destruct);
static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base_addr, uint32_t num_banks,
uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k)
{
uint32_t i, j, ci;
mca_bcol_ptpcoll_ml_buffer_desc_t *tmp_desc = NULL;
int k_nomial_radix = mca_bcol_ptpcoll_component.k_nomial_radix;
int pow_k_val = (0 == pow_k) ? 1 : pow_k;
int num_to_alloc =
((k_nomial_radix - 1) * pow_k_val * 2 + 1 > mca_bcol_ptpcoll_component.narray_radix) ?
(k_nomial_radix - 1) * pow_k_val * 2 + 1 :
mca_bcol_ptpcoll_component.narray_radix * 2;
*desc = (mca_bcol_ptpcoll_ml_buffer_desc_t *)calloc(num_banks * num_buffers_per_bank,
sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t));
if (NULL == *desc) {
PTPCOLL_ERROR(("Failed to allocate memory"));
return OMPI_ERROR;
}
tmp_desc = *desc;
for (i = 0; i < num_banks; i++) {
for (j = 0; j < num_buffers_per_bank; j++) {
ci = i * num_buffers_per_bank + j;
tmp_desc[ci].bank_index = i;
tmp_desc[ci].buffer_index = j;
/* *2 is for gather session +1 for extra peer */
tmp_desc[ci].requests = (ompi_request_t **)
calloc(num_to_alloc, sizeof(ompi_request_t *));
if (NULL == tmp_desc[ci].requests) {
PTPCOLL_ERROR(("Failed to allocate memory for requests"));
return OMPI_ERROR;
}
/*
* ptpcoll does not have any header, but other bcols may. So
* we need to take that into account.
*/
tmp_desc[ci].data_addr = (void *)
((unsigned char*)base_addr + ci * size_buffer + header_size);
PTPCOLL_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr));
/* init reduce implementation flags */
tmp_desc[ci].reduce_init_called = false;
tmp_desc[ci].reduction_status = 0;
}
}
return OMPI_SUCCESS;
}
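/*
 * Editor's sketch (not part of the removed code): the address arithmetic
 * used by init_ml_buf_desc() above. Every (bank, buffer) pair maps to one
 * contiguous slot of size_buffer bytes inside the ML block, and the payload
 * starts header_size bytes into that slot. Names are illustrative only.
 */
static void *ptpcoll_sketch_buffer_addr(void *base_addr, uint32_t bank,
                                        uint32_t buffer,
                                        uint32_t num_buffers_per_bank,
                                        uint32_t size_buffer,
                                        uint32_t header_size)
{
    uint32_t ci = bank * num_buffers_per_bank + buffer;   /* flat buffer index */
    return (void *) ((unsigned char *) base_addr +
                     (size_t) ci * size_buffer + header_size);
}
/* Example: 2 banks x 4 buffers of 4096 bytes with a 64-byte header:
 * bank 1, buffer 2 -> ci = 6 -> base + 6 * 4096 + 64 = base + 24640. */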
static void mca_bcol_ptpcoll_set_small_msg_thresholds(struct mca_bcol_base_module_t *super)
{
mca_bcol_ptpcoll_module_t *ptpcoll_module =
(mca_bcol_ptpcoll_module_t *) super;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
/* Subtract out the maximum header size when calculating the thresholds. This
* will account for the headers used by the basesmuma component. If we do not
* take these headers into account we may overrun our buffer. */
/* Set the Allgather threshold equals to a ML buff size */
super->small_message_thresholds[BCOL_ALLGATHER] =
(ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) /
ompi_comm_size(ptpcoll_module->super.sbgp_partner_module->group_comm);
/* Set the Bcast threshold, all Bcast algorithms have the same threshold */
super->small_message_thresholds[BCOL_BCAST] =
(ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX);
/* Set the Alltoall threshold, the Ring algorithm sets some limitation */
super->small_message_thresholds[BCOL_ALLTOALL] =
(ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / 2;
/* Set the Allreduce threshold, the NARRAY algorithm sets some limitation */
super->small_message_thresholds[BCOL_ALLREDUCE] =
(ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / ptpcoll_module->k_nomial_radix;
/* Set the Reduce threshold, the NARRAY algorithm sets some limitation */
super->small_message_thresholds[BCOL_REDUCE] =
(ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / cm->narray_radix;
}
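/*
 * Editor's sketch (not part of the removed code): every threshold above is
 * derived from the usable part of one ML buffer. The helper below mirrors
 * the allgather case with plain integers; the concrete numbers in the
 * comment are assumptions for illustration only, not component defaults.
 */
static int ptpcoll_sketch_allgather_threshold(int size_buffer, int header_max,
                                              int comm_size)
{
    /* each rank's contribution must fit after the worst-case bcol header */
    return (size_buffer - header_max) / comm_size;
}
/* Example (assumed values): size_buffer = 65536, header_max = 128,
 * comm_size = 8 -> allgather threshold = 65408 / 8 = 8176 bytes, while the
 * bcast threshold would be the full 65408 bytes. */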
/*
* Cache information about ML memory
*/
static int mca_bcol_ptpcoll_cache_ml_memory_info(struct mca_bcol_base_memory_block_desc_t *payload_block,
uint32_t data_offset,
struct mca_bcol_base_module_t *bcol,
void *reg_data)
{
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) bcol;
mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem;
struct mca_bcol_base_memory_block_desc_t *desc = payload_block;
int group_size = ptpcoll_module->super.sbgp_partner_module->group_size;
PTPCOLL_VERBOSE(10, ("mca_bcol_ptpcoll_init_buffer_memory was called"));
/* cache ml mem desc tunings locally */
ml_mem->num_banks = desc->num_banks;
ml_mem->num_buffers_per_bank = desc->num_buffers_per_bank;
ml_mem->size_buffer = desc->size_buffer;
PTPCOLL_VERBOSE(10, ("ML buffer configuration num banks %d num_per_bank %d size %d base addr %p",
desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, desc->block->base_addr));
/* Set first bank index for release */
ml_mem->bank_index_for_release = 0;
if (OMPI_SUCCESS != init_ml_buf_desc(&ml_mem->ml_buf_desc,
desc->block->base_addr,
ml_mem->num_banks,
ml_mem->num_buffers_per_bank,
ml_mem->size_buffer,
data_offset,
group_size,
ptpcoll_module->pow_k)) {
PTPCOLL_VERBOSE(10, ("Failed to allocate rdma memory descriptor\n"));
return OMPI_ERROR;
}
PTPCOLL_VERBOSE(10, ("ptpcoll_module = %p, ml_mem_desc = %p.\n",
ptpcoll_module));
return OMPI_SUCCESS;
}
/*
* Load ptpcoll bcol functions
*/
static void load_func(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int fnc;
/* reset everything to NULL */
for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
/*ptpcoll_module->super.bcol_function_table[fnc] = NULL;*/
ptpcoll_module->super.bcol_function_table[fnc] = NULL;
ptpcoll_module->super.bcol_function_init_table[fnc] = NULL;
}
ptpcoll_module->super.bcol_function_init_table[BCOL_BARRIER] = bcol_ptpcoll_barrier_init;
ptpcoll_module->super.bcol_function_init_table[BCOL_BCAST] = bcol_ptpcoll_bcast_init;
ptpcoll_module->super.bcol_function_init_table[BCOL_ALLREDUCE] = bcol_ptpcoll_allreduce_init;
ptpcoll_module->super.bcol_function_init_table[BCOL_ALLGATHER] = bcol_ptpcoll_allgather_init;
ptpcoll_module->super.bcol_function_table[BCOL_BCAST] = bcol_ptpcoll_bcast_k_nomial_anyroot;
ptpcoll_module->super.bcol_function_init_table[BCOL_ALLTOALL] = NULL;
ptpcoll_module->super.bcol_function_init_table[BCOL_SYNC] = mca_bcol_ptpcoll_memsync_init;
ptpcoll_module->super.bcol_function_init_table[BCOL_REDUCE] = bcol_ptpcoll_reduce_init;
/* ML memory cacher */
ptpcoll_module->super.bcol_memory_init = mca_bcol_ptpcoll_cache_ml_memory_info;
/* Set thresholds */
ptpcoll_module->super.set_small_msg_thresholds = mca_bcol_ptpcoll_set_small_msg_thresholds;
/* setup recursive k-ing tree */
ptpcoll_module->super.k_nomial_tree = mca_bcol_ptpcoll_setup_knomial_tree;
}
int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super)
{
mca_bcol_ptpcoll_module_t *p2p_module = (mca_bcol_ptpcoll_module_t *) super;
int rc = 0;
rc = netpatterns_setup_recursive_knomial_allgather_tree_node(
p2p_module->super.sbgp_partner_module->group_size,
p2p_module->super.sbgp_partner_module->my_index,
mca_bcol_ptpcoll_component.k_nomial_radix,
super->list_n_connected,
&p2p_module->knomial_allgather_tree);
return rc;
}
/* Calculate the size of the largest complete radix-ary tree that fits within the group */
static int calc_full_tree_size(int radix, int group_size, int *num_leafs)
{
int level_cnt = 1;
int total_cnt = 0;
while( total_cnt < group_size ) {
total_cnt += level_cnt;
level_cnt *= radix;
}
if (total_cnt > group_size) {
*num_leafs = level_cnt / radix;
return total_cnt - level_cnt / radix;
} else {
*num_leafs = level_cnt;
return group_size;
}
}
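/*
 * Editor's worked example (not part of the removed code): tracing
 * calc_full_tree_size(radix = 3, group_size = 10):
 *
 *   level_cnt: 1 -> 3 -> 9 -> 27      total_cnt: 1 -> 4 -> 13
 *
 * total_cnt (13) overshoots the group, so the last level is dropped: the
 * function returns 13 - 27/3 = 4 (the largest complete 3-ary tree that
 * fits) and *num_leafs is set to 27/3 = 9, the number of child slots
 * available directly below that complete tree for the remaining processes.
 */
#include <assert.h>
static void ptpcoll_sketch_tree_size_example(void)
{
    int num_leafs = 0;
    assert(4 == calc_full_tree_size(3, 10, &num_leafs));
    assert(9 == num_leafs);
}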
/* Setup N-array scatter Knomial-gather static information */
static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int rc, i, peer;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
ptpcoll_module->full_narray_tree_size = calc_full_tree_size(
cm->narray_knomial_radix,
ptpcoll_module->group_size,
&ptpcoll_module->full_narray_tree_num_leafs);
ptpcoll_module->narray_knomial_proxy_extra_index = (int *)
malloc(sizeof(int) * (cm->narray_knomial_radix));
if (NULL == ptpcoll_module->narray_knomial_proxy_extra_index) {
PTPCOLL_ERROR(("Failed to allocate memory"));
goto Error;
}
ptpcoll_module->narray_knomial_node = calloc(
ptpcoll_module->full_narray_tree_size,
sizeof(netpatterns_narray_knomial_tree_node_t));
if(NULL == ptpcoll_module->narray_knomial_node) {
goto Error;
}
PTPCOLL_VERBOSE(10 ,("My type is proxy, full tree size = %d [%d]",
ptpcoll_module->full_narray_tree_size,
cm->narray_knomial_radix
));
if (ptpcoll_module->super.sbgp_partner_module->my_index <
ptpcoll_module->full_narray_tree_size) {
if (ptpcoll_module->super.sbgp_partner_module->my_index <
ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) {
ptpcoll_module->narray_type = PTPCOLL_PROXY;
for (i = 0; i < cm->narray_knomial_radix; i++) {
peer =
ptpcoll_module->super.sbgp_partner_module->my_index *
cm->narray_knomial_radix + i +
ptpcoll_module->full_narray_tree_size;
if (peer >= ptpcoll_module->group_size) {
break;
}
ptpcoll_module->narray_knomial_proxy_extra_index[i] = peer;
}
ptpcoll_module->narray_knomial_proxy_num = i;
} else {
ptpcoll_module->narray_type = PTPCOLL_IN_GROUP;
}
/* Setting node info */
for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) {
rc = netpatterns_setup_narray_knomial_tree(
cm->narray_knomial_radix,
i,
ptpcoll_module->full_narray_tree_size,
&ptpcoll_module->narray_knomial_node[i]);
if(OMPI_SUCCESS != rc) {
goto Error;
}
}
} else {
ptpcoll_module->narray_type = PTPCOLL_EXTRA;
ptpcoll_module->narray_knomial_proxy_extra_index[0] =
(ptpcoll_module->super.sbgp_partner_module->my_index -
ptpcoll_module->full_narray_tree_size) /
cm->narray_knomial_radix;
}
return OMPI_SUCCESS;
Error:
if (NULL != ptpcoll_module->narray_knomial_node) {
free(ptpcoll_module->narray_knomial_node);
}
if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) {
free(ptpcoll_module->narray_knomial_proxy_extra_index);
}
return OMPI_ERROR;
}
/* Setup N-array static information */
static int load_narray_tree(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int rc, i;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
ptpcoll_module->narray_node = calloc(ptpcoll_module->group_size,
sizeof(netpatterns_tree_node_t));
if(NULL == ptpcoll_module->narray_node ) {
goto Error;
}
for(i = 0; i < ptpcoll_module->group_size; i++) {
rc = netpatterns_setup_narray_tree(
cm->narray_radix,
i,
ptpcoll_module->group_size,
&ptpcoll_module->narray_node[i]);
if(OMPI_SUCCESS != rc) {
goto Error;
}
}
return OMPI_SUCCESS;
Error:
if (NULL != ptpcoll_module->narray_node) {
free(ptpcoll_module->narray_node);
}
return OMPI_ERROR;
}
static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int i;
mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
ptpcoll_module->k_nomial_radix =
cm->k_nomial_radix > ptpcoll_module->group_size ?
ptpcoll_module->group_size :
cm->k_nomial_radix;
ptpcoll_module->pow_k = pow_k_calc(ptpcoll_module->k_nomial_radix,
ptpcoll_module->group_size,
&ptpcoll_module->pow_knum);
ptpcoll_module->kn_proxy_extra_index = (int *)
malloc(sizeof(int) * (ptpcoll_module->k_nomial_radix - 1));
if (NULL == ptpcoll_module->kn_proxy_extra_index) {
PTPCOLL_ERROR(("Failed to allocate memory"));
goto Error;
}
/* Setting peer type for K-nomial algorithm*/
if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_knum ) {
if (ptpcoll_module->super.sbgp_partner_module->my_index <
ptpcoll_module->group_size - ptpcoll_module->pow_knum) {
for (i = 0;
i < (ptpcoll_module->k_nomial_radix - 1) &&
ptpcoll_module->super.sbgp_partner_module->my_index *
(ptpcoll_module->k_nomial_radix - 1) +
i + ptpcoll_module->pow_knum < ptpcoll_module->group_size
; i++) {
ptpcoll_module->pow_ktype = PTPCOLL_KN_PROXY;
ptpcoll_module->kn_proxy_extra_index[i] =
ptpcoll_module->super.sbgp_partner_module->my_index *
(ptpcoll_module->k_nomial_radix - 1) +
i + ptpcoll_module->pow_knum;
PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_knum = %d [%d] my extra %d",
ptpcoll_module->pow_knum,
ptpcoll_module->pow_k,
ptpcoll_module->kn_proxy_extra_index[i]));
}
ptpcoll_module->kn_proxy_extra_num = i;
} else {
PTPCOLL_VERBOSE(10 ,("My type is in group, pow_knum = %d [%d]", ptpcoll_module->pow_knum,
ptpcoll_module->pow_k));
ptpcoll_module->pow_ktype = PTPCOLL_KN_IN_GROUP;
}
} else {
ptpcoll_module->pow_ktype = PTPCOLL_KN_EXTRA;
ptpcoll_module->kn_proxy_extra_index[0] = (ptpcoll_module->super.sbgp_partner_module->my_index -
ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1);
PTPCOLL_VERBOSE(10 ,("My type is extra , pow_knum = %d [%d] my proxy %d",
ptpcoll_module->pow_knum,
ptpcoll_module->pow_k,
ptpcoll_module->kn_proxy_extra_index[0]));
}
return OMPI_SUCCESS;
Error:
if (NULL != ptpcoll_module->kn_proxy_extra_index) {
free(ptpcoll_module->kn_proxy_extra_index);
}
return OMPI_ERROR;
}
static int load_binomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
ptpcoll_module->pow_2 = pow_k_calc(2,
ptpcoll_module->group_size,
&ptpcoll_module->pow_2num);
assert(ptpcoll_module->pow_2num == 1 << ptpcoll_module->pow_2);
assert(ptpcoll_module->pow_2num <= ptpcoll_module->group_size);
/* Setting peer type for binary algorithm*/
if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_2num ) {
if (ptpcoll_module->super.sbgp_partner_module->my_index <
ptpcoll_module->group_size - ptpcoll_module->pow_2num) {
PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_2num = %d [%d]", ptpcoll_module->pow_2num,
ptpcoll_module->pow_2));
ptpcoll_module->pow_2type = PTPCOLL_PROXY;
ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index +
ptpcoll_module->pow_2num;
} else {
PTPCOLL_VERBOSE(10 ,("My type is in group, pow_2num = %d [%d]", ptpcoll_module->pow_2num,
ptpcoll_module->pow_2));
ptpcoll_module->pow_2type = PTPCOLL_IN_GROUP;
}
} else {
PTPCOLL_VERBOSE(10 ,("My type is extra , pow_2num = %d [%d]", ptpcoll_module->pow_2num,
ptpcoll_module->pow_2));
ptpcoll_module->pow_2type = PTPCOLL_EXTRA;
ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index -
ptpcoll_module->pow_2num;
}
return OMPI_SUCCESS;
}
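/*
 * Editor's worked example (not part of the removed code): how
 * load_binomial_info() classifies ranks for a group of 6 processes.
 * pow_k_calc(2, 6, &pow_2num) yields pow_2 = 2 and pow_2num = 4, so the
 * recursive-doubling core is ranks 0..3 and the two leftover ranks fold in
 * through proxies:
 *
 *   ranks 0, 1 -> PTPCOLL_PROXY     (paired with extra ranks 4 and 5)
 *   ranks 2, 3 -> PTPCOLL_IN_GROUP  (participate directly)
 *   ranks 4, 5 -> PTPCOLL_EXTRA     (their proxies are ranks 0 and 1)
 */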
static int load_recursive_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
int rc = OMPI_SUCCESS;
rc = netpatterns_setup_recursive_knomial_tree_node(
ptpcoll_module->group_size,
ptpcoll_module->super.sbgp_partner_module->my_index,
mca_bcol_ptpcoll_component.k_nomial_radix,
&ptpcoll_module->knomial_exchange_tree);
return rc;
}
static int bcol_ptpcoll_collreq_init(opal_free_list_item_t *item, void* ctx)
{
mca_bcol_ptpcoll_module_t *ptpcoll_module= (mca_bcol_ptpcoll_module_t *) ctx;
mca_bcol_ptpcoll_collreq_t *collreq = (mca_bcol_ptpcoll_collreq_t *) item;
switch(mca_bcol_ptpcoll_component.barrier_alg) {
case 1:
collreq->requests = (ompi_request_t **)
calloc(2, sizeof(ompi_request_t *));
break;
case 2:
collreq->requests = (ompi_request_t **)
calloc(2 * ptpcoll_module->k_nomial_radix, sizeof(ompi_request_t *));
break;
}
if (NULL == collreq->requests) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
/* query to see if the module is available for use on the given
* communicator, and if so, what its priority is. This is where
* the backing shared-memory file is created.
*/
mca_bcol_base_module_t **mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp,
int *num_modules)
{
int rc;
/* local variables */
struct ompi_communicator_t *comm = sbgp->group_comm;
mca_bcol_ptpcoll_module_t *ptpcoll_module = NULL;
mca_bcol_base_module_t **ptpcoll_modules = NULL;
int iovec_size;
/* initialize local variables */
*num_modules = 0;
/*
* This is activated only for intra-communicators
*/
if (OMPI_COMM_IS_INTER(comm) ) {
return NULL;
}
/* allocate and initialize a ptpcoll module */
ptpcoll_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *));
if (NULL == ptpcoll_modules) {
return NULL;
}
ptpcoll_module = OBJ_NEW(mca_bcol_ptpcoll_module_t);
if (NULL == ptpcoll_module) {
free(ptpcoll_modules);
return NULL;
}
/* At this stage we support only a single module */
ptpcoll_modules[*num_modules] = &(ptpcoll_module->super);
(*num_modules)++;
/* set the subgroup */
ptpcoll_module->super.sbgp_partner_module = sbgp;
/* caching some useful information */
ptpcoll_module->group_size =
ptpcoll_module->super.sbgp_partner_module->group_size;
rc = load_binomial_info(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
PTPCOLL_VERBOSE(10, ("Failed to load knomial info"));
goto CLEANUP;
}
rc = load_knomial_info(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
PTPCOLL_VERBOSE(10, ("Failed to load knomial info"));
goto CLEANUP;
}
rc = load_narray_tree(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
PTPCOLL_VERBOSE(10, ("Failed to load narray tree"));
goto CLEANUP;
}
rc = load_narray_knomial_tree(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
PTPCOLL_VERBOSE(10, ("Failed to load narray-knomila tree"));
goto CLEANUP;
}
rc = load_recursive_knomial_info(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
PTPCOLL_VERBOSE(10, ("Failed to load recursive knomial tree"));
goto CLEANUP;
}
/* creating collfrag free list */
OBJ_CONSTRUCT(&ptpcoll_module->collreqs_free, opal_free_list_t);
rc = opal_free_list_init (&ptpcoll_module->collreqs_free,
sizeof(mca_bcol_ptpcoll_collreq_t),
BCOL_PTP_CACHE_LINE_SIZE,
OBJ_CLASS(mca_bcol_ptpcoll_collreq_t),
0, BCOL_PTP_CACHE_LINE_SIZE,
256 /* free_list_num */,
-1 /* free_list_max, -1 = infinite */,
32 /* free_list_inc */,
NULL, 0, NULL,
bcol_ptpcoll_collreq_init,
ptpcoll_module);
if (OMPI_SUCCESS != rc) {
goto CLEANUP;
}
load_func(ptpcoll_module);
rc = alloc_allreduce_offsets_array(ptpcoll_module);
if (OMPI_SUCCESS != rc) {
goto CLEANUP;
}
/* Allocating iovec for PTP alltoall */
iovec_size = ptpcoll_module->group_size / 2 + ptpcoll_module->group_size % 2;
ptpcoll_module->alltoall_iovec = (struct iovec *) malloc(sizeof(struct iovec)
* iovec_size);
ptpcoll_module->log_group_size = lognum(ptpcoll_module->group_size);
rc = mca_bcol_base_bcol_fns_table_init(&(ptpcoll_module->super));
if (OMPI_SUCCESS != rc) {
goto CLEANUP;
}
/* Zero copy is supported */
ptpcoll_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
/* return */
return ptpcoll_modules;
CLEANUP:
OBJ_RELEASE(ptpcoll_module);
free(ptpcoll_modules);
return NULL;
}

View file

@ -1,405 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_reduce.h"
#include "bcol_ptpcoll_utils.h"
static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
#define NARRAY_RECV_NB(narray_node, process_shift, group_size, \
recv_buffer, pack_len, tag, comm, recv_requests, \
num_pending_recvs) \
do { \
int n, rc = OMPI_SUCCESS; \
int dst; \
int comm_dst; \
int offset = 0 ; \
\
/* Receive data from all relevant children */ \
for (n = 0; n < narray_node->n_children; n++) { \
\
dst = narray_node->children_ranks[n] + process_shift; \
if (dst >= group_size) { \
dst -= group_size; \
} \
comm_dst = group_list[dst]; \
\
/* Non blocking send .... */ \
PTPCOLL_VERBOSE(1 , ("Reduce, Irecv data to %d[%d], count %d, tag %d, addr %p", \
dst, comm_dst, pack_len, tag, \
data_buffer)); \
rc = MCA_PML_CALL(irecv((void *)((unsigned char*)recv_buffer + offset), pack_len, MPI_BYTE, \
comm_dst, tag, comm, \
&(recv_requests[*num_pending_recvs]))); \
if( OMPI_SUCCESS != rc ) { \
PTPCOLL_VERBOSE(10, ("Failed to start non-blocking receive")); \
return OMPI_ERROR; \
} \
++(*num_pending_recvs); \
offset += pack_len; \
} \
} while(0)
static inline int narray_reduce(void *data_buffer, void *recv_buffer,
int nrecvs, int count,
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
int *reduction_status) {
int pack_len = count * dtype->super.size;
int i = 0;
void *source_buffer = NULL, *result_buffer = NULL;
source_buffer = data_buffer;
result_buffer = recv_buffer;
for (i = 0; i < nrecvs; i++) {
ompi_op_reduce(op, (void*)((unsigned char*) source_buffer) ,
(void*)((unsigned char*) result_buffer),
count,dtype);
source_buffer = (void *)((unsigned char*)recv_buffer
+ (i+1) * pack_len);
}
*reduction_status = 1;
return OMPI_SUCCESS;
}
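/*
 * Editor's sketch (not part of the removed code): narray_reduce() above
 * folds the local contribution and every child's buffer into the first
 * slot of the receive buffer. The integer-sum version below shows the same
 * accumulation pattern without the ompi_op machinery; names are
 * illustrative only.
 */
static void ptpcoll_sketch_narray_sum(const int *local, int *recv_slots,
                                      int nrecvs, int count)
{
    int child, c;

    /* slot 0 first absorbs the local data ... */
    for (c = 0; c < count; c++) {
        recv_slots[c] += local[c];
    }
    /* ... and then every other child's slot */
    for (child = 1; child < nrecvs; child++) {
        for (c = 0; c < count; c++) {
            recv_slots[c] += recv_slots[child * count + c];
        }
    }
    /* the reduced vector now lives in recv_slots[0 .. count-1] */
}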
static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
int tag = -1;
int rc;
int group_size = ptpcoll_module->group_size;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
uint32_t buffer_index = input_args->buffer_index;
struct ompi_op_t *op = input_args->op;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **send_request =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];
ompi_request_t **recv_requests =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1];
void *data_buffer = NULL;
void *src_buffer = (void *) (
(unsigned char *)input_args->sbuf +
(size_t)input_args->sbuf_offset);
void *recv_buffer = (void *) (
(unsigned char *)input_args->rbuf +
(size_t)input_args->rbuf_offset);
int count = input_args->count;
struct ompi_datatype_t *dtype = input_args->dtype;
int pack_len = input_args->count * input_args->dtype->super.size;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int matched = false;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int relative_group_index = 0;
netpatterns_tree_node_t *narray_node = NULL;
bool not_sent = false;
int parent_rank = -1, comm_parent_rank = -1;
int group_root_index = input_args->root;
if (!ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called) {
bcol_ptpcoll_reduce_narray(input_args, const_args);
}
/*
* By default the src buffer is the data buffer,
* only after reduction, the recv buffer becomes the
* data buffer
*/
data_buffer = src_buffer;
relative_group_index = my_group_index - group_root_index;
if (relative_group_index < 0) {
relative_group_index +=group_size;
}
/* keep tag within the limit supported by the pml */
tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level tags */
tag = -tag;
narray_node = &ptpcoll_module->narray_node[relative_group_index];
PTPCOLL_VERBOSE(3, ("reduce, Narray tree Progress"));
PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_reduce_narray, buffer index: %d "
"tag: %d "
"tag_mask: %d "
"sn: %d "
"root: %d [%d]"
"buff: %p ",
buffer_index, tag,
ptpcoll_module->tag_mask, input_args->sequence_num,
input_args->root_flag, input_args->root_route->rank,
data_buffer));
/*
Check if the data was received
*/
if (0 != *active_requests) {
matched = mca_bcol_ptpcoll_test_all_for_match
(active_requests, recv_requests, &rc);
if (OMPI_SUCCESS != rc) {
return OMPI_ERROR;
}
/* All data was received, then do a reduction*/
if(matched) {
narray_reduce(data_buffer, recv_buffer, narray_node->n_children, count, dtype, op,
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status);
/*
* The reduction result is in the recv buffer, so it is the new data
* buffer
*/
data_buffer = recv_buffer;
/* The reduction only completed just now, so the send to the parent has not been posted yet */
not_sent = true;
} else {
PTPCOLL_VERBOSE(10, ("reduce root is started"));
return BCOL_FN_STARTED;
}
}
/* I'm root, I'm done */
if (input_args->root_flag) {
return BCOL_FN_COMPLETE;
}
PTPCOLL_VERBOSE(1,("Testing Sending Match"));
/* If send was not posted */
/* Manju: Leaf node should never post in the progress logic */
if (not_sent) {
parent_rank =
ptpcoll_module->narray_node[relative_group_index].parent_rank +
group_root_index;
if (parent_rank >= group_size) {
parent_rank -= group_size;
}
comm_parent_rank = group_list[parent_rank];
PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank));
rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE,
comm_parent_rank,
tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
}
if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) {
PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
/* Data has not been sent. Return that the collective has been started
* because we MUST call test on this request once it is finished to
* ensure that it is properly freed. */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args)
{
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
int tag;
int rc;
int group_size = ptpcoll_module->group_size;
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
uint32_t buffer_index = input_args->buffer_index;
struct ompi_op_t *op = input_args->op;
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
ompi_request_t **recv_requests =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1];
ompi_request_t **send_request =
&ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];
void *data_buffer = NULL;
void *src_buffer = (void *) (
(unsigned char *)input_args->sbuf +
(size_t)input_args->sbuf_offset);
void *recv_buffer = (void *) (
(unsigned char *)input_args->rbuf +
(size_t)input_args->rbuf_offset);
int count = input_args->count;
struct ompi_datatype_t *dtype = input_args->dtype;
int pack_len = input_args->count * input_args->dtype->super.size;
int *active_requests =
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
int matched = true;
int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
int group_root_index = -1;
int relative_group_index = 0;
netpatterns_tree_node_t *narray_node = NULL;
int parent_rank = -1, comm_parent_rank = -1;
/* This is the first function that should be called, not the progress
 * function. The fragmentation code calls progress first, so progress
 * redirects here. The flag indicates whether we have entered this code.
 */
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called = true;
PTPCOLL_VERBOSE(1, ("Reduce, Narray tree"));
/* reset active request counter */
(*active_requests) = 0;
/* keep tag within the limit supported by the pml */
tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
/* mark this as a collective tag, to avoid conflict with user-level tags */
tag = -tag;
PTPCOLL_VERBOSE(1, ("bcol_ptpcoll_reduce_narray, buffer index: %d "
"tag: %d "
"tag_mask: %d "
"sn: %d "
"root: %d "
"buff: %p ",
buffer_index, tag,
ptpcoll_module->tag_mask, input_args->sequence_num,
input_args->root_flag,
src_buffer));
/* Compute Root Index Shift */
group_root_index = input_args->root;
relative_group_index = my_group_index - group_root_index;
if (relative_group_index < 0) {
relative_group_index += group_size;
}
narray_node = &ptpcoll_module->narray_node[relative_group_index];
if (0 == narray_node->n_children) {
PTPCOLL_VERBOSE(10, ("I'm leaf of the data"));
/*
 * I'm a leaf of the tree,
 * just send my data up to the parent
 */
data_buffer = src_buffer;
goto NARRAY_SEND_DATA;
}
/* Not leaf, either an internal node or root */
NARRAY_RECV_NB(narray_node, group_root_index, group_size,
recv_buffer, pack_len, tag, comm, recv_requests,
active_requests);
/* We have not done reduction, yet */
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status = 0;
/* We can not block, so run a couple of tests for data arrival */
matched = mca_bcol_ptpcoll_test_all_for_match
(active_requests, recv_requests, &rc);
/* Check if received the data */
if(matched) {
narray_reduce(src_buffer, recv_buffer, narray_node->n_children,
count, dtype, op, &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status);
PTPCOLL_VERBOSE(1, ("Reduce, received data from all childrend "));
data_buffer = recv_buffer;
} else {
PTPCOLL_VERBOSE(1, ("reduce root is started"));
return BCOL_FN_STARTED;
}
/* I'm root, I'm done */
if (input_args->root_flag) {
return BCOL_FN_COMPLETE;
}
NARRAY_SEND_DATA:
/*
* Send the data (reduce in case of internal nodes, or just data in
* case of leaf nodes) to the parent
*/
narray_node = &ptpcoll_module->narray_node[relative_group_index];
parent_rank =
ptpcoll_module->narray_node[relative_group_index].parent_rank +
group_root_index;
if (parent_rank >= group_size) {
parent_rank -= group_size;
}
comm_parent_rank = group_list[parent_rank];
PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank));
rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE,
comm_parent_rank,
tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request));
if( OMPI_SUCCESS != rc ) {
PTPCOLL_VERBOSE(10, ("Failed to send data"));
return OMPI_ERROR;
}
/* We can not block, so run a couple of tests for send completion */
if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) {
PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
/* The send has not completed yet, report that the collective has started */
return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
}
return BCOL_FN_COMPLETE;
}
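/*
 * Editor's sketch (not part of the removed code): both reduce entry points
 * above rotate group indices so that the tree is always rooted at index 0.
 * The helpers below show that rotation in isolation; the tree shape itself
 * comes from netpatterns and is not reproduced here.
 */
static int ptpcoll_sketch_to_tree_index(int my_index, int root, int group_size)
{
    int rel = my_index - root;                      /* index inside the rooted tree */
    return (rel < 0) ? rel + group_size : rel;
}

static int ptpcoll_sketch_to_group_rank(int tree_index, int root, int group_size)
{
    int rank = tree_index + root;                   /* back to a real group rank */
    return (rank >= group_size) ? rank - group_size : rank;
}
/* Example: group_size = 8, root = 5, my_index = 2 -> tree index 5; a
 * hypothetical tree-level parent index of 2 maps back to group rank 7. */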
int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super)
{
mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;
PTPCOLL_VERBOSE(1,("Initialization Reduce - Narray"));
comm_attribs.bcoll_type = BCOL_REDUCE;
comm_attribs.comm_size_min = 0;
comm_attribs.comm_size_max = 1024 * 1024;
comm_attribs.waiting_semantics = NON_BLOCKING;
inv_attribs.bcol_msg_min = 0;
inv_attribs.bcol_msg_max = 20000; /* range 1 */
inv_attribs.datatype_bitmap = 0xffffffff;
inv_attribs.op_types_bitmap = 0xffffffff;
comm_attribs.data_src = DATA_SRC_KNOWN;
mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
bcol_ptpcoll_reduce_narray,
bcol_ptpcoll_reduce_narray_progress);
comm_attribs.data_src = DATA_SRC_KNOWN;
return OMPI_SUCCESS;
}

View file

@ -1,25 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_REDUCE_H
#define MCA_BCOL_PTPCOLL_REDUCE_H
#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
BEGIN_C_DECLS
int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super);
END_C_DECLS
#endif /* MCA_BCOL_PTPCOLL_REDUCE_H */

View file

@ -1,139 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
/*
* Return the largest power of K that does not exceed the number (as an exponent), and optionally K^power itself
*/
int pow_k_calc(int k, int number, int *out_number)
{
int power = 0;
int n = 1;
while (n < number) {
n *= k;
++power;
}
if (n > number) {
n /= k;
--power;
}
if (NULL != out_number) {
*out_number = n;
}
return power;
}
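/*
 * Editor's worked example (not part of the removed code):
 *
 *   pow_k_calc(3, 20, &n) -> returns 2, n = 9   (3^2 is the largest power of 3 <= 20)
 *   pow_k_calc(2, 16, &n) -> returns 4, n = 16  (exact powers are returned as-is)
 */
#include <assert.h>
static void ptpcoll_sketch_pow_k_example(void)
{
    int n = 0;
    assert(2 == pow_k_calc(3, 20, &n) && 9 == n);
    assert(4 == pow_k_calc(2, 16, &n) && 16 == n);
}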
/*
* Communicator rank to group index conversion function for K-nomial tree.
* Complexity: (K-1) * log_K(N)
*
* Input:
* my_group_index - my process index in the group
* comm_source - the communicator rank of the source of data
* radix - radix of K-nomial tree
* group_size - the size of my group
* group_array[] - one to one map from group index to communicator rank
*
* Output:
* Group index for comm_source.
*/
int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source,
int group_size, int *group_array, int *pow_distance)
{
int group_index;
int i;
*pow_distance = 0;
for (i = 1; i < group_size; i<<=1, (*pow_distance)++) {
group_index = my_group_index ^ i;
if (comm_source == group_array[group_index]) {
return group_index;
}
}
*pow_distance = -1;
return -1;
}
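/*
 * Editor's worked example (not part of the removed code): for the binomial
 * variant above, the only possible peers of group index 5 in a group of 8
 * are the ranks reachable by flipping a single bit:
 *
 *   5 ^ 1 = 4 (distance 0),  5 ^ 2 = 7 (distance 1),  5 ^ 4 = 1 (distance 2)
 *
 * The identity group_array below is an assumption made purely for this
 * illustration (group index == communicator rank).
 */
#include <assert.h>
static void ptpcoll_sketch_binomial_example(void)
{
    int group_array[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int dist = -1;
    assert(7 == get_group_index_and_distance_for_binomial(5, 7, 8,
                                                          group_array, &dist));
    assert(1 == dist);
}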
int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix,
int group_size, int *group_array, int *pow_distance)
{
int group_index;
int offset = 1; /* offset equal to 1 (radix_power) */
int radix_power = 1; /* radix power 0 */
*pow_distance = 0;
/*
* Go through the range of possible offsets from my rank;
* for each offset we calculate k-nomial tree root.
*/
while(offset < group_size) {
/* K-nomial tree root calculation for the offset */
if (offset % (radix * radix_power)) {
group_index = my_group_index - offset;
/* wrap around if the group is negative */
if (group_index < 0) {
group_index += group_size;
}
PTPCOLL_VERBOSE(10, ("Checking %d", group_index));
if (comm_source == group_array[group_index]) {
return group_index;
}
offset += radix_power;
} else {
/* we are done with this section of the tree, go to the next one */
radix_power *= radix;
(*pow_distance)++;
}
}
/* No source was found, return -1 */
*pow_distance = -1;
return -1;
}
int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array)
{
int group_index;
int radix_power = 1; /* radix power 0 */
int offset = 1; /* offset equal to 1 (radix_power) */
/*
* Go through the range of possible offsets from my rank;
* for each offset we calculate k-nomial tree root.
*/
while(offset < group_size) {
/* K-nomial tree root calculation for the offset */
if (offset % (radix * radix_power)) {
group_index = my_group_index - offset;
/* wrap around if the group is negative */
if (group_index < 0) {
group_index += group_size;
}
if (comm_source == group_array[group_index]) {
return group_index;
}
offset += radix_power;
} else {
/* we are done with this section of the tree, go to the next one */
radix_power *= radix;
}
}
/* No source was found, return -1 */
return -1;
}

View file

@ -1,80 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_UTILS_H
#define MCA_BCOL_PTPCOLL_UTILS_H
#include "ompi_config.h"
#include "ompi/mca/rte/rte.h"
BEGIN_C_DECLS
/*
* Return the largest power of K that does not exceed the number
*/
int pow_k_calc(int k, int number, int *out_number);
/*
* Communicator rank to group index conversion function for K-nomial tree.
*/
int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array);
/* the same as above, but returns more information */
int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix,
int group_size, int *group_array, int *pow_distance);
int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source,
int group_size, int *group_array, int *pow_distance);
/*
* Error and debug Macros/Functions
*/
static inline int mca_bcol_ptpcoll_err(const char* fmt, ...)
{
va_list list;
int ret;
va_start(list, fmt);
ret = vfprintf(stderr, fmt, list);
va_end(list);
return ret;
}
#define PTPCOLL_ERROR(args) \
do { \
mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_ptpcoll_err args; \
mca_bcol_ptpcoll_err("\n"); \
} while(0)
#if OPAL_ENABLE_DEBUG
#define PTPCOLL_VERBOSE(level, args) \
do { \
if (mca_bcol_ptpcoll_component.verbose >= level) { \
mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_ptpcoll_err args; \
mca_bcol_ptpcoll_err("\n"); \
} \
} while(0)
#else
#define PTPCOLL_VERBOSE(level, args)
#endif
END_C_DECLS
#endif

View file

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained

View file

@ -1,89 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_LFLAGS = -Pcoll_ml_config_yy
LEX_OUTPUT_ROOT = lex.coll_ml_config_yy
dist_ompidata_DATA = \
mca-coll-ml.config \
help-mpi-coll-ml.txt
sources = coll_ml.h \
coll_ml_inlines.h \
coll_ml_module.c \
coll_ml_allocation.h \
coll_ml_allocation.c \
coll_ml_barrier.c \
coll_ml_bcast.c \
coll_ml_colls.h \
coll_ml_component.c \
coll_ml_copy_fns.c \
coll_ml_descriptors.c \
coll_ml_functions.h \
coll_ml_hier_algorithms.c \
coll_ml_hier_algorithms_setup.c \
coll_ml_hier_algorithms_bcast_setup.c \
coll_ml_hier_algorithms_allreduce_setup.c \
coll_ml_hier_algorithms_reduce_setup.c \
coll_ml_hier_algorithms_common_setup.c \
coll_ml_hier_algorithms_common_setup.h \
coll_ml_hier_algorithms_allgather_setup.c \
coll_ml_hier_algorithm_memsync_setup.c \
coll_ml_custom_utils.h \
coll_ml_custom_utils.c \
coll_ml_progress.c \
coll_ml_reduce.c \
coll_ml_allreduce.c \
coll_ml_allgather.c \
coll_ml_mca.h \
coll_ml_mca.c \
coll_ml_lmngr.h \
coll_ml_lmngr.c \
coll_ml_hier_algorithms_barrier_setup.c \
coll_ml_select.h \
coll_ml_select.c \
coll_ml_memsync.c \
coll_ml_lex.h \
coll_ml_lex.l \
coll_ml_config.c \
coll_ml_config.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
component_noinst =
component_install =
if MCA_BUILD_ompi_coll_ml_DSO
component_install += mca_coll_ml.la
else
component_noinst += libmca_coll_ml.la
endif
# See ompi/mca/btl/ml/Makefile.am for an explanation of
# libmca_common_ml.la.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_ml_la_SOURCES = $(sources)
mca_coll_ml_la_LDFLAGS = -module -avoid-version
mca_coll_ml_la_LIBADD =
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_ml_la_SOURCES =$(sources)
libmca_coll_ml_la_LDFLAGS = -module -avoid-version
maintainer-clean-local:
rm -f coll_ml_lex.c

The file diff is not shown because it is too large. Load diff

View file

@ -1,633 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include <stdlib.h>
#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "coll_ml.h"
#include "coll_ml_select.h"
#include "coll_ml_allocation.h"
static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
bool rcontig = coll_op->full_message.recv_data_continguous;
int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm);
void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
(uintptr_t)coll_op->full_message.n_bytes_delivered);
void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
(size_t)coll_op->variable_fn_params.rbuf_offset);
if (rcontig) {
memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled);
} else {
mca_coll_ml_convertor_unpack(src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled,
&coll_op->fragment_data.message_descriptor->recv_convertor);
}
return OMPI_SUCCESS;
}
static inline void copy_data (mca_coll_ml_collective_operation_progress_t *coll_op, rank_properties_t *rank_props, int soffset) {
bool rcontig = coll_op->fragment_data.message_descriptor->recv_data_continguous;
size_t total_bytes = coll_op->fragment_data.message_descriptor->n_bytes_total;
size_t pack_len = coll_op->fragment_data.fragment_size;
int doffset = rank_props->rank;
void *dest, *src;
src = (void *) ((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
(size_t)coll_op->variable_fn_params.rbuf_offset + soffset * pack_len);
if (rcontig) {
dest = (void *) ((uintptr_t) coll_op->full_message.dest_user_addr +
(uintptr_t) coll_op->fragment_data.offset_into_user_buffer +
doffset * total_bytes);
memcpy(dest, src, pack_len);
} else {
size_t position;
opal_convertor_t *recv_convertor =
&coll_op->fragment_data.message_descriptor->recv_convertor;
position = (size_t) coll_op->fragment_data.offset_into_user_buffer +
doffset * total_bytes;
opal_convertor_set_position(recv_convertor, &position);
mca_coll_ml_convertor_unpack(src, pack_len, recv_convertor);
}
}
static int mca_coll_ml_allgather_noncontiguous_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int i, j, n_level_one_sbgps;
size_t soffset;
mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;
sub_group_params_t *array_of_all_subgroup_ranks = topo_info->array_of_all_subgroups;
n_level_one_sbgps = array_of_all_subgroup_ranks->level_one_index;
for (i = 0 ; i < n_level_one_sbgps; i++) {
/* determine where in the source buffer the data can be found */
soffset = array_of_all_subgroup_ranks[i].index_of_first_element;
for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks; j++, ++soffset) {
copy_data (coll_op, array_of_all_subgroup_ranks[i].rank_data + j, soffset);
}
}
return OMPI_SUCCESS;
}
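/*
 * Editor's note (not part of the removed code): for the contiguous case,
 * copy_data() above places each peer's fragment at
 *
 *   dest_user_addr + offset_into_user_buffer + rank * n_bytes_total
 *
 * i.e. the user buffer is carved into one n_bytes_total-sized slice per
 * rank and successive fragments fill each slice left to right. With
 * assumed sizes of n_bytes_total = 1024 and 256-byte fragments, the
 * fragment that starts at user-buffer offset 512 lands, for rank 3, at
 * byte 3 * 1024 + 512 = 3584 of the receive buffer.
 */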
/* Allgather dependencies seem easy, everyone needs to work from the "bottom up".
* Following Pasha, I too will put in the simplest dependency graph and change it later
* when we add hierarchy. Basically, allgather has the same dependency profile as the
* sequential broadcast except that there is only a single ordering of tasks.
*/
static int mca_coll_ml_allgather_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int fn_idx, h_level, my_index, root;
mca_sbgp_base_module_t *sbgp;
mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info;
fn_idx = coll_op->sequential_routine.current_active_bcol_fn;
h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level;
sbgp = topo->component_pairs[h_level].
subgroup_module;
my_index = sbgp->my_index;
/* In the case of allgather, the local leader is always the root */
root = 0;
if (my_index == root) {
coll_op->variable_fn_params.root_flag = true;
coll_op->variable_fn_params.root_route = NULL;
} else {
coll_op->variable_fn_params.root_flag = false;
coll_op->variable_fn_params.root_route = &topo->route_vector[root];
}
return OMPI_SUCCESS;
}
static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
/* local variables */
int ret;
size_t frag_len, dt_size;
const void *buf;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
mca_coll_ml_collective_operation_progress_t *new_op;
mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous;
ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);
/* Keep the pipeline filled with fragments */
while (coll_op->fragment_data.message_descriptor->n_active <
coll_op->fragment_data.message_descriptor->pipeline_depth) {
/* If an active fragment happens to have completed the collective during
* a hop into the progress engine, then don't launch a new fragment,
* instead break and return.
*/
if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
== coll_op->fragment_data.message_descriptor->n_bytes_total) {
break;
}
/* Get an ml buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
if (NULL == src_buffer_desc) {
/* If there exist outstanding fragments, then break out
* and let an active fragment deal with this later,
* there are no buffers available.
*/
if (0 < coll_op->fragment_data.message_descriptor->n_active) {
return OMPI_SUCCESS;
} else {
/* The fragment is already on the list and
 * we still have no ml resources.
 * Return busy */
if (coll_op->pending & REQ_OUT_OF_MEMORY) {
ML_VERBOSE(10,("Out of resources %p", coll_op));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
coll_op->pending |= REQ_OUT_OF_MEMORY;
opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
(opal_list_item_t *)coll_op);
ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
}
/* Get a new collective descriptor and initialize it */
new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
coll_op->fragment_data.message_descriptor->src_user_addr,
coll_op->fragment_data.message_descriptor->dest_user_addr,
coll_op->fragment_data.message_descriptor->n_bytes_total,
coll_op->fragment_data.message_descriptor->n_bytes_scheduled);
new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;
/* set the task setup callback */
new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
/*
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
src_buffer_desc->buffer_index, src_buffer_desc);
*/
/* We need this address for pointer arithmetic in memcpy */
buf = coll_op->fragment_data.message_descriptor->src_user_addr;
if (!scontig) {
frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER];
mca_coll_ml_convertor_get_send_frag_size(
ml_module, &frag_len,
coll_op->fragment_data.message_descriptor);
mca_coll_ml_convertor_pack(
(void *) ((uintptr_t) src_buffer_desc->data_addr +
frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
frag_len, &coll_op->fragment_data.message_descriptor->send_convertor);
} else {
/* calculate new frag length, there are some issues here */
frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total -
coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
coll_op->fragment_data.fragment_size ?
coll_op->fragment_data.message_descriptor->n_bytes_total -
coll_op->fragment_data.message_descriptor->n_bytes_scheduled :
coll_op->fragment_data.fragment_size);
/* everybody copies in, based on the new values */
memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr +
frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset +
frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
(void *) ((uintptr_t) buf + (uintptr_t)
coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len);
}
new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;
/* update the number of bytes scheduled */
new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
/* everyone needs an unpack function */
new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
new_op->fragment_data.fragment_size = frag_len;
new_op->fragment_data.buffer_desc = src_buffer_desc;
/* Setup fragment specific data */
++(new_op->fragment_data.message_descriptor->n_active);
ML_VERBOSE(10, ("Start more, My index %d ",
new_op->fragment_data.buffer_desc->buffer_index));
/* this is a bit buggy */
ML_SET_VARIABLE_PARAMS_BCAST(
new_op,
OP_ML_MODULE(new_op),
frag_len /* yes, we have consistent units, so this makes sense */,
MPI_BYTE /* we fragment according to buffer size
* we don't reduce the data thus we needn't
* keep "whole" datatypes, we may freely
* fragment without regard for multiples
* of any specific datatype
*/,
src_buffer_desc,
0,
0,
frag_len,
src_buffer_desc->data_addr);
/* initialize first coll */
ret = new_op->sequential_routine.seq_task_setup(new_op);
if (OMPI_SUCCESS != ret) {
ML_VERBOSE(3, ("Fragment failed to initialize itself"));
return ret;
}
new_op->variable_fn_params.buffer_size = frag_len;
new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
new_op->variable_fn_params.root = 0;
MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);
/* append this collective !! */
OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
opal_list_append(&mca_coll_ml_component.sequential_collectives,
(opal_list_item_t *)new_op);
OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int mca_coll_ml_allgather_start (const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
ompi_request_t **req)
{
size_t pack_len, sdt_size;
int ret, n_fragments = 1, comm_size;
mca_coll_ml_topology_t *topo_info;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
mca_coll_ml_component_t *cm = &mca_coll_ml_component;
mca_coll_ml_collective_operation_progress_t *coll_op;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
ptrdiff_t lb, extent;
bool scontig, rcontig, in_place = false;
/* check for in place setting */
if (MPI_IN_PLACE == sbuf) {
in_place = true;
sdtype = rdtype;
scount = rcount;
}
/* scontig could be != to rcontig */
scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount);
rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount);
comm_size = ompi_comm_size(comm);
ML_VERBOSE(10, ("Starting allgather"));
assert(NULL != sdtype);
/* Calculate size of the data,
* at this stage, only contiguous data is supported */
/* this is valid for allgather */
ompi_datatype_type_size(sdtype, &sdt_size);
pack_len = scount * sdt_size;
if (in_place) {
sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len;
}
/* Allocate collective schedule and pack message */
/* this is the total ending message size that will need to fit in the ml-buffer */
if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) {
/* The len of the message can not be larger than ML buffer size */
ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer));
assert(pack_len * comm_size <= ml_module->payload_block->size_buffer);
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
/* change 1 */
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
sbuf, rbuf, pack_len, 0 /* offset for first pack */);
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
src_buffer_desc->buffer_index, src_buffer_desc);
coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
/* task setup callback function */
coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
/* change 2 */
if (!scontig) {
coll_op->full_message.n_bytes_scheduled =
mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
&coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);
mca_coll_ml_convertor_pack(
(void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len *
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
pack_len, &coll_op->full_message.send_convertor);
} else {
/* change 3 */
memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len *
(coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
sbuf, pack_len);
coll_op->full_message.n_bytes_scheduled = pack_len;
}
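/* Each rank deposits its contribution at a rank-specific slot inside the
 * shared ML buffer. Illustrative example (assumed layout values): with
 * pack_len = 1024, hier_layout_info[0].offset = 0 and level_one_index = 2,
 * the local data lands at byte 2048 of the ML buffer, leaving room for the
 * other ranks' slots on either side. */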
if (!rcontig) {
mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
&coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
}
if (coll_op->coll_schedule->topo_info->ranks_contiguous) {
coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data;
} else {
coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
}
/* whole ml-buffer is used to send AND receive */
coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;
/* we can set the initial offset here */
coll_op->variable_fn_params.sbuf_offset = 0;
coll_op->variable_fn_params.rbuf_offset = 0;
coll_op->variable_fn_params.count = scount;
coll_op->fragment_data.fragment_size =
coll_op->full_message.n_bytes_scheduled;
/* For small CINCO, we may use the native datatype */
coll_op->variable_fn_params.dtype = sdtype;
coll_op->variable_fn_params.buffer_size = pack_len;
coll_op->variable_fn_params.root = 0;
} else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) {
/* calculate the number of fragments and the size of each frag */
size_t n_dts_per_frag, frag_len;
int pipeline_depth = mca_coll_ml_component.pipeline_depth;
/* Calculate the number of fragments required for this message; be careful with the integer division! */
frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ?
pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]);
n_dts_per_frag = frag_len / sdt_size;
n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag);
pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);
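/* Worked example with assumed values: pack_len = 10000 bytes, sdt_size = 8,
 * small_message_thresholds[BCOL_ALLGATHER] = 4096. Then frag_len = 4096,
 * n_dts_per_frag = 512, n_fragments = (10000 + 4095) / 4096 = 3 (a ceiling
 * division), and with a configured pipeline depth of 4 the effective depth
 * becomes min(3, 4) = 3. */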
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
/* change 4 */
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
sbuf, rbuf, pack_len,
0 /* offset for first pack */);
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
src_buffer_desc->buffer_index, src_buffer_desc);
topo_info = coll_op->coll_schedule->topo_info;
/* task setup callback function */
coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
if (!scontig) {
coll_op->full_message.send_converter_bytes_packed =
mca_coll_ml_convertor_prepare(
sdtype, scount, NULL,
&coll_op->full_message.dummy_convertor,
MCA_COLL_ML_NET_STREAM_SEND);
coll_op->full_message.dummy_conv_position = 0;
mca_coll_ml_convertor_get_send_frag_size(
ml_module, &frag_len,
&coll_op->full_message);
/* change 5 */
mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
&coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);
mca_coll_ml_convertor_pack(
(void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
(topo_info->hier_layout_info[0].offset +
topo_info->hier_layout_info[0].level_one_index)),
frag_len, &coll_op->full_message.send_convertor);
} else {
/* change 6 */
memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
(topo_info->hier_layout_info[0].offset +
topo_info->hier_layout_info[0].level_one_index)),
sbuf, frag_len);
}
if (!rcontig) {
mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
&coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
}
coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
/* hopefully this doesn't royally screw things up; the idea here is that the
* whole ml-buffer is used to send and receive
*/
coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;
/* we can set the initial offset here */
coll_op->variable_fn_params.sbuf_offset = 0;
coll_op->variable_fn_params.rbuf_offset = 0;
coll_op->fragment_data.buffer_desc = src_buffer_desc;
coll_op->fragment_data.fragment_size = frag_len;
coll_op->fragment_data.message_descriptor->n_active = 1;
coll_op->full_message.n_bytes_scheduled = frag_len;
coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress;
coll_op->full_message.pipeline_depth = pipeline_depth;
coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
/* remember this is different for frags !! Caused data corruption when
* not properly set. Need to be sure you have consistent units.
*/
coll_op->variable_fn_params.count = frag_len;
coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in
* units of bytes. This means that
* all of our arithmetic is done
* in terms of bytes
*/
coll_op->variable_fn_params.root = 0;
coll_op->variable_fn_params.frag_size = frag_len;
coll_op->variable_fn_params.buffer_size = frag_len;
} else {
/* change 7 */
ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case."));
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER],
sbuf, rbuf, pack_len, 0 /* offset for first pack */);
topo_info = coll_op->coll_schedule->topo_info;
if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) {
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL);
} else {
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc);
}
/* not sure if I really need this here */
coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
coll_op->process_fn = NULL;
/* probably the most important piece */
coll_op->variable_fn_params.sbuf = sbuf;
coll_op->variable_fn_params.rbuf = rbuf;
coll_op->variable_fn_params.sbuf_offset = 0;
coll_op->variable_fn_params.rbuf_offset = 0;
coll_op->variable_fn_params.count = scount;
coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the
* native datatype and actual count
*/
coll_op->variable_fn_params.root = 0;
/* you still need to copy in your own data into the rbuf */
/* don't need to do this if you have in place data */
if (!in_place) {
memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len);
}
}
coll_op->full_message.send_count = scount;
coll_op->full_message.recv_count = rcount;
coll_op->full_message.send_data_continguous = scontig;
coll_op->full_message.recv_data_continguous = rcontig;
ompi_datatype_get_extent(sdtype, &lb, &extent);
coll_op->full_message.send_extent = (size_t) extent;
ompi_datatype_get_extent(rdtype, &lb, &extent);
coll_op->full_message.recv_extent = (size_t) extent;
/* Fill in the function arguments */
coll_op->variable_fn_params.sequence_num =
OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
coll_op->variable_fn_params.hier_factor = comm_size;
MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);
ret = mca_coll_ml_launch_sequential_collective (coll_op);
if (OMPI_SUCCESS != ret) {
ML_VERBOSE(10, ("Failed to launch"));
return ret;
}
*req = &coll_op->full_message.super;
return OMPI_SUCCESS;
}
int mca_coll_ml_allgather(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
ompi_request_t *req;
int ret;
ML_VERBOSE(10, ("Starting blocking allgather"));
ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module, &req);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
ret = ompi_request_wait (&req, MPI_STATUS_IGNORE);
ML_VERBOSE(10, ("Blocking allgather is complete"));
return ret;
}
int mca_coll_ml_allgather_nb(const void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
ompi_request_t **req,
mca_coll_base_module_t *module)
{
int ret;
ML_VERBOSE(10, ("Starting non-blocking allgather"));
ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module, req);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
ML_VERBOSE(10, ("Non-blocking allgather started"));
return ret;
}


@ -1,213 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdlib.h>
#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_allocation.h"
mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block(struct mca_coll_ml_component_t *ml_component,
mca_bcol_base_memory_block_desc_t *ml_memblock)
{
mca_bcol_base_memory_block_desc_t *ret = NULL;
mca_bcol_base_memory_block_desc_t *memory_block = NULL;
mca_coll_ml_lmngr_t *memory_manager = NULL;
if (ml_memblock) {
ML_ERROR(("Memory already allocated - expecting NULL pointer"));
return ret;
}
memory_block = (mca_bcol_base_memory_block_desc_t*) calloc(1, sizeof(mca_bcol_base_memory_block_desc_t));
if (NULL == memory_block){
ML_ERROR(("Couldn't allocate memory for ml_memblock"));
return ret;
}
memory_manager = &ml_component->memory_manager;
memory_block->block = mca_coll_ml_lmngr_alloc(memory_manager);
memory_block->size_block = memory_manager->list_block_size;
if (!memory_block->block){
ML_VERBOSE(1, ("lmngr failed."));
free(memory_block);
return NULL;
}
return memory_block;
}
void mca_coll_ml_free_block (mca_bcol_base_memory_block_desc_t *ml_memblock)
{
if (!ml_memblock)
return;
if (ml_memblock->buffer_descs){
free(ml_memblock->buffer_descs);
}
mca_coll_ml_lmngr_free(ml_memblock->block);
free(ml_memblock->bank_release_counters);
free(ml_memblock->ready_for_memsync);
free(ml_memblock->bank_is_busy);
free(ml_memblock);
}
int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
uint32_t num_buffers,
uint32_t num_banks,
uint32_t buffer_size,
int32_t data_offset,
opal_list_t *bcols_in_use)
{
int ret = OMPI_SUCCESS;
uint32_t bank_loop, buff_loop;
uint64_t addr_offset = 0;
mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,*pbuff_desc = NULL;
if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
return OMPI_ERR_BAD_PARAM;
}
if (NULL == ml_memblock){
ML_ERROR(("Memory block not initialized"));
ret = OMPI_ERROR;
goto exit_ERROR;
}
if (ml_memblock->size_block < (num_buffers * num_banks * buffer_size) ){
ML_ERROR(("Not enough memory for all buffers and banks in the memory block"));
ret = OMPI_ERROR;
goto exit_ERROR;
}
pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*) malloc(sizeof(mca_bcol_base_payload_buffer_desc_t)
* num_banks * num_buffers);
if (NULL == pbuff_descs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(bank_loop = 0; bank_loop < num_banks; bank_loop++)
for(buff_loop = 0; buff_loop < num_buffers; buff_loop++){
pbuff_desc = &pbuff_descs[bank_loop*num_buffers + buff_loop];
pbuff_desc->base_data_addr = (void *)
((char *)ml_memblock->block->base_addr + addr_offset);
pbuff_desc->data_addr = (void *)
((char *)pbuff_desc->base_data_addr + (size_t)data_offset);
addr_offset+=buffer_size;
pbuff_desc->buffer_index = BUFFER_INDEX(bank_loop,num_buffers,buff_loop);
pbuff_desc->bank_index=bank_loop;
pbuff_desc->generation_number=0;
}
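/* Layout sketch with assumed values: num_banks = 2, num_buffers = 4,
 * buffer_size = 8192. Descriptors are laid out bank-major, so the buffer at
 * (bank 1, buffer 2) is the 7th descriptor, starts 6 * 8192 = 49152 bytes
 * into the block, and (judging from the index decomposition in
 * mca_coll_ml_alloc_buffer below) gets buffer_index 1 * 4 + 2 = 6. */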
/* Initialize ml memory block */
/* gvm FIX: This counter, when zero, indicates that the bank is ready for
* recycling. It is initialized to the number of bcol components, as each bcol
* is responsible for releasing the buffers of a bank. This initialization will
* misbehave, for example, when multiple interfaces are present and more than
* one bcol module of the same component type is in use.
*/
ml_memblock->bank_release_counters = (uint32_t *) calloc(num_banks, sizeof(uint32_t));
if (NULL == ml_memblock->bank_release_counters) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
ml_memblock->ready_for_memsync = (bool *) calloc(num_banks, sizeof(bool));
if (NULL == ml_memblock->ready_for_memsync) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
ml_memblock->bank_is_busy = (bool *) calloc(num_banks, sizeof(bool));
if (NULL == ml_memblock->bank_is_busy) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit_ERROR;
}
/* Set index for first bank to sync */
ml_memblock->memsync_counter = 0;
/* use first bank and first buffer */
ml_memblock->next_free_buffer = 0;
ml_memblock->block_addr_offset = addr_offset;
ml_memblock->num_buffers_per_bank = num_buffers;
ml_memblock->num_banks = num_banks;
ml_memblock->size_buffer = buffer_size;
ml_memblock->buffer_descs = pbuff_descs;
return ret;
exit_ERROR:
/* Free all buffer descriptors */
if (pbuff_descs){
free(pbuff_descs);
}
return ret;
}
mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer (mca_coll_ml_module_t *module)
{
uint64_t bindex;
uint32_t bank, buffer, num_buffers;
mca_bcol_base_memory_block_desc_t *ml_memblock = module->payload_block;
mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,
*ml_membuffer = NULL;
/* Return a buffer */
num_buffers = ml_memblock->num_buffers_per_bank;
pbuff_descs = ml_memblock->buffer_descs;
bindex = ml_memblock->next_free_buffer;
buffer = bindex % num_buffers;
bank = bindex/num_buffers;
ML_VERBOSE(10, ("ML allocator: allocating buffer index %d, bank index %d", buffer, bank));
/* First buffer in a bank: check whether the bank is free before using it */
if (0 == buffer) {
if(!ml_memblock->bank_is_busy[bank]) {
/* the bank is free, mark it busy */
ml_memblock->bank_is_busy[bank] = true;
ML_VERBOSE(10, ("ML allocator: reset bank %d to value %d", bank,
ml_memblock->bank_release_counters[bank]));
} else {
/* the bank is busy, return NULL and upper layer will handle it */
ML_VERBOSE(10, ("No free payload buffers are available for use."
" Next memory bank is still used by one of bcols"));
return NULL;
}
}
assert(true == ml_memblock->bank_is_busy[bank]);
ml_membuffer = &pbuff_descs[bindex];
ML_VERBOSE(10, ("ML allocator: ml buffer index %d", bindex));
/* Compute next free buffer */
buffer = (buffer == num_buffers - 1) ? 0 : buffer + 1;
if (0 == buffer) {
bank = (bank == ml_memblock->num_banks - 1) ? 0 : bank + 1;
}
ml_memblock->next_free_buffer = BUFFER_INDEX(bank,num_buffers,buffer);
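/* Index arithmetic example (hypothetical sizes): with num_buffers_per_bank = 4
 * and num_banks = 2, bindex 6 decomposes to buffer 6 % 4 = 2 in bank 6 / 4 = 1.
 * After handing that buffer out, next_free_buffer advances to buffer 3 of
 * bank 1 (bindex 7) and then wraps around to buffer 0 of bank 0. */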
return ml_membuffer;
}


@ -1,111 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_ML_ALLOC_H
#define MCA_ML_ALLOC_H
#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/sys/atomic.h"
#include "opal/mca/mpool/base/base.h"
#include "coll_ml_lmngr.h"
/*
Returns a block of memory from mpool
ARGS:
IN ml_component: component descriptor
OUT ml_memblock: block_addr - Starting address of the memory block
size - Size of the block
register_info - Register information passed from the mpool
Return
On Success: Returns a pointer to the allocated memory block descriptor
On Failure: Returns NULL
*/
struct mca_coll_ml_component_t;
struct mca_coll_ml_module_t;
mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block(
struct mca_coll_ml_component_t *ml_component,
struct mca_bcol_base_memory_block_desc_t *ml_memblock
);
/* Allocate the memory from mpool */
/* Register the memory block with bcols */
void mca_coll_ml_free_block(
mca_bcol_base_memory_block_desc_t *ml_memblock
);
/*
Initialize the memory block, map it into buffers and memory banks, and
initialize the buffer descriptors.
IN ml_memblock: Memory block descriptor
IN num_buffers: number of buffers
IN num_banks: number of banks
Return
On Success: OMPI_SUCCESS
On Failure: OMPI_ERROR
*/
int mca_coll_ml_initialize_block(
mca_bcol_base_memory_block_desc_t *ml_memblock,
uint32_t num_buffers,
uint32_t num_banks,
uint32_t buffer_size,
int32_t data_offset,
opal_list_t *bcols_in_use
);
/* Map blocks into buffers and banks */
/* Initialize the descriptors */
/*
Allocate a memory buffer from the block
IN ml_memblock: Memory block descriptor
OUT ml_membuffer: Buffer allocated for data from the block
Return
On Success: OMPI_SUCCESS
On Failure: OMPI_ERROR
*/
mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer(
struct mca_coll_ml_module_t *module);
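/* A rough sketch of the intended call order, as suggested by the comments
 * above (not a definitive contract): mca_coll_ml_allocate_block() obtains the
 * raw region from the list manager, mca_coll_ml_initialize_block() carves it
 * into banks and per-buffer descriptors, mca_coll_ml_alloc_buffer() then hands
 * out individual payload buffers at collective start time, and
 * mca_coll_ml_free_block() tears everything down. */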
int mca_coll_ml_free_buffer(
mca_bcol_base_memory_block_desc_t *ml_memblock,
struct mca_bcol_base_payload_buffer_desc_t *ml_membuffer
);
/*
Register the memory block with bcol component
IN ml_memblock: Memory block descriptor
OUT registrations (ml_memblock)
Return
On Success: OMPI_SUCCESS
On Failure: OMPI_ERROR
*/
int mca_coll_ml_register_block_bcol(
mca_bcol_base_memory_block_desc_t *ml_memblock
);
#endif /* MCA_ML_ALLOC_H */


@ -1,553 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include <stdlib.h>
#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "coll_ml.h"
#include "coll_ml_select.h"
#include "coll_ml_allocation.h"
static int mca_coll_ml_allreduce_small_unpack(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int ret;
/* need to put in more */
int count = coll_op->variable_fn_params.count;
ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype;
void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
(uintptr_t)coll_op->fragment_data.offset_into_user_buffer);
void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
(size_t)coll_op->variable_fn_params.rbuf_offset);
ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest,
(char *) src);
if (ret < 0) {
return OMPI_ERROR;
}
ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, rbuf addr %p, rbuf offset %d.",
src, coll_op->variable_fn_params.sbuf_offset, dest,
coll_op->variable_fn_params.rbuf_offset));
return OMPI_SUCCESS;
}
static int mca_coll_ml_allreduce_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int fn_idx, h_level, my_index, root;
mca_sbgp_base_module_t *sbgp;
mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info;
fn_idx = coll_op->sequential_routine.current_active_bcol_fn;
h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level;
sbgp = topo->component_pairs[h_level].subgroup_module;
my_index = sbgp->my_index;
/* In the case of allreduce, the local leader is always the root */
root = 0;
if (my_index == root) {
coll_op->variable_fn_params.root_flag = true;
coll_op->variable_fn_params.root_route = NULL;
} else {
coll_op->variable_fn_params.root_flag = false;
coll_op->variable_fn_params.root_route = &topo->route_vector[root];
}
/* NTH: This was copied from the old allreduce launcher. */
if (0 < fn_idx) {
coll_op->variable_fn_params.sbuf = coll_op->variable_fn_params.rbuf;
coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.rbuf;
}
return OMPI_SUCCESS;
}
static int mca_coll_ml_allreduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
/* local variables */
const void *buf;
size_t dt_size;
int ret, frag_len, count;
ptrdiff_t lb, extent;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
mca_coll_ml_collective_operation_progress_t *new_op;
mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent);
if (ret < 0) {
return OMPI_ERROR;
}
dt_size = (size_t) extent;
/* Keep the pipeline filled with fragments */
while (coll_op->fragment_data.message_descriptor->n_active <
coll_op->fragment_data.message_descriptor->pipeline_depth) {
/* If an active fragment happens to have completed the collective during
* a hop into the progress engine, then don't launch a new fragment,
* instead break and return.
*/
if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
== coll_op->fragment_data.message_descriptor->n_bytes_total) {
break;
}
/* Get an ml buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
if (NULL == src_buffer_desc) {
/* If there exist outstanding fragments, then break out
* and let an active fragment deal with this later,
* there are no buffers available.
*/
if (0 < coll_op->fragment_data.message_descriptor->n_active) {
return OMPI_SUCCESS;
}
/* It is useless to call progress from here: ml progress can't be
* executed, so the ml memsync call will not complete and no memory
* will be recycled. Instead we put the element on the list and
* progress it later, once memsync recycles some memory. */
/* The fragment is already on the list and
* we still have no ml resources;
* return busy */
if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
coll_op->pending |= REQ_OUT_OF_MEMORY;
opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
(opal_list_item_t *)coll_op);
ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
} else {
ML_VERBOSE(10,("Out of resources %p", coll_op));
}
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
/* Get a new collective descriptor and initialize it */
new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allreduce_functions[coll_op->fragment_data.current_coll_op],
coll_op->fragment_data.message_descriptor->src_user_addr,
coll_op->fragment_data.message_descriptor->dest_user_addr,
coll_op->fragment_data.message_descriptor->n_bytes_total,
coll_op->fragment_data.message_descriptor->n_bytes_scheduled);
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
src_buffer_desc->buffer_index, src_buffer_desc);
new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;
/* set the task setup callback */
new_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup;
/* We need this address for pointer arithmetic in memcpy */
buf = coll_op->fragment_data.message_descriptor->src_user_addr;
/* calculate the number of data types in this packet */
count = (coll_op->fragment_data.message_descriptor->n_bytes_total -
coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
(size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_ALLREDUCE] ?
(coll_op->fragment_data.message_descriptor->n_bytes_total -
coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size :
(size_t) coll_op->variable_fn_params.count);
/* calculate the fragment length */
frag_len = count*dt_size;
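/* Example with assumed sizes: n_bytes_total = 6000, n_bytes_scheduled = 4096,
 * small_message_thresholds[BCOL_ALLREDUCE] = 4096, dt_size = 4 and a
 * per-fragment count of 1024. The 1904 remaining bytes fall below the
 * threshold, so count = 1904 / 4 = 476 elements and frag_len = 1904 bytes;
 * otherwise the full per-fragment count would be reused. */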
ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count,
(char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t)
coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
if (ret < 0) {
return OMPI_ERROR;
}
/* unpack the result from the ML buffer */
new_op->process_fn = mca_coll_ml_allreduce_small_unpack;
/* Setup fragment specific data */
new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
new_op->fragment_data.buffer_desc = src_buffer_desc;
new_op->fragment_data.fragment_size = frag_len;
(new_op->fragment_data.message_descriptor->n_active)++;
ML_SET_VARIABLE_PARAMS_BCAST(
new_op,
OP_ML_MODULE(new_op),
count,
MPI_BYTE,
src_buffer_desc,
0,
0,
frag_len,
src_buffer_desc->data_addr);
/* Fill in bcast specific arguments */
/* TBD: remove buffer_size */
new_op->variable_fn_params.buffer_size = frag_len;
new_op->variable_fn_params.count = count;
new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
new_op->variable_fn_params.op = coll_op->variable_fn_params.op;
new_op->variable_fn_params.dtype = coll_op->variable_fn_params.dtype;
new_op->variable_fn_params.root = 0;
new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);
ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d",
new_op->variable_fn_params.buffer_size,
new_op->fragment_data.fragment_size,
new_op->fragment_data.message_descriptor->n_bytes_scheduled));
/* initialize first coll */
ret = new_op->sequential_routine.seq_task_setup(new_op);
if (OMPI_SUCCESS != ret) {
ML_VERBOSE(3,("Fragment failed to initialize itself"));
return ret;
}
/* append this collective !! */
OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
opal_list_append(&mca_coll_ml_component.sequential_collectives,
(opal_list_item_t *)new_op);
OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int parallel_allreduce_start(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_ml_module_t *ml_module,
ompi_request_t **req,
int small_data_allreduce,
int large_data_allreduce)
{
int ret, n_fragments = 1, frag_len,
pipeline_depth, n_dts_per_frag ;
ptrdiff_t lb, extent;
size_t pack_len, dt_size;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
mca_coll_ml_collective_operation_progress_t *coll_op;
mca_coll_ml_component_t *cm = &mca_coll_ml_component;
bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count);
if (MPI_IN_PLACE == sbuf) {
sbuf = rbuf;
}
ret = ompi_datatype_get_extent(dtype, &lb, &extent);
if (ret < 0) {
return OMPI_ERROR;
}
dt_size = (size_t) extent;
pack_len = count * dt_size;
ML_VERBOSE(1,("The allreduce requested %d enable fragmentation %d ",
pack_len,
cm->enable_fragmentation));
if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) {
/* The len of the message can not be larger than ML buffer size */
assert(pack_len <= ml_module->payload_block->size_buffer);
ML_VERBOSE(1,("Using small data allreduce (threshold = %d)",
ml_module->small_message_thresholds[BCOL_ALLREDUCE]));
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (OPAL_UNLIKELY(NULL == src_buffer_desc)) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allreduce_functions[small_data_allreduce],
sbuf, rbuf, pack_len, 0);
coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
coll_op->variable_fn_params.count = count;
ret = ompi_datatype_copy_content_same_ddt(dtype, count,
(void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf);
if (ret < 0){
return OMPI_ERROR;
}
/* unpack function */
coll_op->process_fn = mca_coll_ml_allreduce_small_unpack;
} else if (cm->enable_fragmentation || !contiguous) {
ML_VERBOSE(1,("Using Fragmented Allreduce"));
/* fragment the data */
/* guard against datatypes larger than the ML buffer threshold */
if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) {
ML_ERROR(("Sorry, but we don't support datatypes that large"));
return OMPI_ERROR;
}
/* calculate the number of data types that can fit per ml-buffer */
n_dts_per_frag = ml_module->small_message_thresholds[BCOL_ALLREDUCE] / dt_size;
/* calculate the number of fragments */
n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */
/* calculate the actual pipeline depth */
pipeline_depth = n_fragments < cm->pipeline_depth ? n_fragments : cm->pipeline_depth;
/* calculate the fragment size */
frag_len = n_dts_per_frag * dt_size;
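/* Worked example (assumed values): count = 3000 elements of 8 bytes with a
 * 4096-byte threshold gives n_dts_per_frag = 512, n_fragments =
 * (3000 + 511) / 512 = 6 and frag_len = 4096 bytes; the last fragment only
 * carries the 440 remaining elements, and the pipeline depth is clamped to
 * min(6, cm->pipeline_depth). */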
/* allocate an ml buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allreduce_functions[small_data_allreduce],
sbuf, rbuf, pack_len, 0 /* offset for first pack */);
/* task setup callback function */
coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup;
coll_op->process_fn = mca_coll_ml_allreduce_small_unpack;
coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;
coll_op->fragment_data.message_descriptor->n_active = 1;
coll_op->full_message.n_bytes_scheduled = frag_len;
coll_op->full_message.fragment_launcher = mca_coll_ml_allreduce_frag_progress;
coll_op->full_message.pipeline_depth = pipeline_depth;
coll_op->fragment_data.current_coll_op = small_data_allreduce;
coll_op->fragment_data.fragment_size = frag_len;
coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */
coll_op->variable_fn_params.buffer_size = frag_len;
/* copy into the ml-buffer */
ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag,
(char *) src_buffer_desc->data_addr, (char *) sbuf);
if (ret < 0) {
return OMPI_ERROR;
}
} else {
ML_VERBOSE(1,("Using zero-copy ptp allreduce"));
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_allreduce_functions[large_data_allreduce],
sbuf, rbuf, pack_len, 0);
coll_op->variable_fn_params.userbuf =
coll_op->variable_fn_params.sbuf = sbuf;
coll_op->variable_fn_params.rbuf = rbuf;
/* The ML buffer is used for testing. Later, when we
* switch to use knem/mmap/portals this should be replaced
* appropriately
*/
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
coll_op->variable_fn_params.count = count;
}
MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index,
src_buffer_desc);
/* set the offset */
coll_op->variable_fn_params.sbuf_offset = 0;
coll_op->variable_fn_params.rbuf_offset = 0;
/* Fill in the function arguments */
coll_op->variable_fn_params.sequence_num =
OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
coll_op->sequential_routine.current_active_bcol_fn = 0;
coll_op->variable_fn_params.dtype = dtype;
coll_op->variable_fn_params.op = op;
coll_op->variable_fn_params.root = 0;
coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; /* invoked after each level in sequential
* progress call
*/
MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);
ret = mca_coll_ml_launch_sequential_collective (coll_op);
if (ret != OMPI_SUCCESS) {
ML_VERBOSE(10, ("Failed to launch"));
return ret;
}
*req = &coll_op->full_message.super;
return OMPI_SUCCESS;
}
int mca_coll_ml_allreduce(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module;
ompi_request_t *req;
int ret;
if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) {
/* coll/ml does not handle non-commutative operations at this time. fall back
* to another collective module */
return ml_module->fallback.coll_allreduce (sbuf, rbuf, count, dtype, op, comm,
ml_module->fallback.coll_allreduce_module);
}
ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm,
(mca_coll_ml_module_t *) module, &req,
ML_SMALL_DATA_ALLREDUCE,
ML_LARGE_DATA_ALLREDUCE);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
ML_ERROR(("Failed to launch"));
return ret;
}
ompi_request_wait_completion(req);
ompi_request_free(&req);
ML_VERBOSE(10, ("Blocking NB allreduce is done"));
return OMPI_SUCCESS;
}
int mca_coll_ml_allreduce_nb(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
struct ompi_communicator_t *comm,
ompi_request_t **req,
mca_coll_base_module_t *module)
{
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module;
int ret;
if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) {
/* coll/ml does not handle non-commutative operations at this time. fall back
* to another collective module */
return ml_module->fallback.coll_iallreduce (sbuf, rbuf, count, dtype, op, comm, req,
ml_module->fallback.coll_iallreduce_module);
}
ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm,
(mca_coll_ml_module_t *) module, req,
ML_SMALL_DATA_ALLREDUCE,
ML_LARGE_DATA_ALLREDUCE);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
ML_ERROR(("Failed to launch"));
return ret;
}
ML_VERBOSE(10, ("Blocking NB allreduce is done"));
return OMPI_SUCCESS;
}
int mca_coll_ml_allreduce_dispatch(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
int rc;
bool use_extra_topo;
ompi_request_t *req;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
use_extra_topo = (count > 1) ?
!ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] :
!ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE];
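/* In other words: fall back to the extra topology whenever the capability
 * matrix does not mark this (op, datatype) pair as supported for the relevant
 * element-count class (single element vs. multiple elements). */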
if (use_extra_topo) {
rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
op, comm, ml_module, &req,
ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,
ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE);
} else {
rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
op, comm, ml_module, &req,
ML_SMALL_DATA_ALLREDUCE,
ML_LARGE_DATA_ALLREDUCE);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
ML_ERROR(("Failed to launch"));
return rc;
}
ompi_request_wait_completion(req);
ompi_request_free(&req);
return OMPI_SUCCESS;
}
int mca_coll_ml_allreduce_dispatch_nb(const void *sbuf, void *rbuf, int count,
ompi_datatype_t *dtype, ompi_op_t *op,
ompi_communicator_t *comm,
ompi_request_t **req,
mca_coll_base_module_t *module)
{
int rc;
bool use_extra_topo;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
use_extra_topo = (count > 1) ?
!ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] :
!ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE];
if (use_extra_topo) {
rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
op, comm, ml_module, req,
ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,
ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE);
} else {
rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
op, comm, ml_module, req,
ML_SMALL_DATA_ALLREDUCE,
ML_LARGE_DATA_ALLREDUCE);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
ML_ERROR(("Failed to launch"));
return rc;
}
return OMPI_SUCCESS;
}


@ -1,146 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/coll/coll.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"
static void mca_coll_ml_barrier_task_setup(
mca_coll_ml_task_status_t *task_status,
int index, mca_coll_ml_compound_functions_t *func)
{
task_status->rt_num_dependencies = func->num_dependencies;
task_status->rt_num_dependent_tasks = func->num_dependent_tasks;
task_status->rt_dependent_task_indices = func->dependent_task_indices;
}
static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module,
ompi_request_t **req)
{
opal_free_list_item_t *item;
mca_coll_ml_collective_operation_progress_t *coll_op;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
/* allocate an ml buffer for signaling purposes */
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
/* Blocking call on fragment allocation (Maybe we want to make it non blocking ?) */
item = opal_free_list_wait (&(ml_module->coll_ml_collective_descriptors));
coll_op = (mca_coll_ml_collective_operation_progress_t *) item;
assert(NULL != coll_op);
ML_VERBOSE(10, ("Get coll request %p", coll_op));
MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL, ml_module->coll_ml_barrier_function);
coll_op->fragment_data.buffer_desc = src_buffer_desc;
coll_op->dag_description.num_tasks_completed = 0;
coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
coll_op->variable_fn_params.sequence_num =
OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
/* Pointer to a coll finalize function */
coll_op->process_fn = NULL;
(*req) = &coll_op->full_message.super;
OMPI_REQUEST_INIT((*req), false);
(*req)->req_status._cancelled = 0;
(*req)->req_state = OMPI_REQUEST_ACTIVE;
(*req)->req_status.MPI_ERROR = OMPI_SUCCESS;
/* Set order info if there is a bcol that needs ordering */
MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1);
return mca_coll_ml_generic_collectives_launcher(coll_op, mca_coll_ml_barrier_task_setup);
}
/**
* Hierarchical blocking barrier
*/
int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rc;
ompi_request_t *req;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
#if OPAL_ENABLE_DEBUG
static int barriers_count = 0;
#endif
ML_VERBOSE(10, ("Barrier num %d start.", ++barriers_count));
rc = mca_coll_ml_barrier_launch(ml_module, &req);
if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
ML_ERROR(("Failed to launch a barrier."));
return rc;
}
/* Blocking barrier */
ompi_request_wait_completion(req);
ompi_request_free(&req);
ML_VERBOSE(10, ("Barrier num %d was done.", barriers_count));
return OMPI_SUCCESS;
}
/**
* Hierarchical non-blocking barrier
*/
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
ompi_request_t **req,
mca_coll_base_module_t *module)
{
int rc;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
#if OPAL_ENABLE_DEBUG
static int barriers_count = 0;
#endif
ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count));
rc = mca_coll_ml_barrier_launch(ml_module, req);
if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
ML_ERROR(("Failed to launch a barrier."));
return rc;
}
ML_VERBOSE(10, ("IBarrier num %d was done.", barriers_count));
return OMPI_SUCCESS;
}


@ -1,849 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include <unistd.h>
#include <sys/uio.h>
#include "opal/threads/mutex.h"
#include "opal/sys/atomic.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_colls.h"
#include "coll_ml_allocation.h"
#define ML_BUFFER_ALLOC_WAIT(ml, buffer) \
do { \
buffer = mca_coll_ml_alloc_buffer(ml); \
while (NULL == buffer) { \
opal_progress(); \
buffer = mca_coll_ml_alloc_buffer(ml); \
} \
} while (0)
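/* This is the same spin-until-available pattern written out inline in several
 * other functions in this component: opal_progress() keeps being driven until
 * the ML allocator returns a payload buffer, so the wait blocks the caller but
 * still advances outstanding communication. */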
#define COLL_ML_SETUP_ORDERING_INFO(op, last, prev) \
do { \
/* Don't change order of commands !!!! */ \
(op)->prev_frag = prev; \
(op)->fragment_data.message_descriptor->last_started_frag = last; \
/* op->next_to_process_frag = NULL; */ \
} while (0)
#define ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, op, coll_index, root, \
total_len, frag_len, buf, ml_buff_desc) \
do { \
op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, \
ml_module->coll_ml_bcast_functions[coll_index], \
buf, buf, \
total_len, \
0 /* offset for first pack */); \
if (OPAL_LIKELY(frag_len > 0)) { \
if (ompi_comm_rank(ml_module->comm) == root) { \
/* single frag, pack the data */ \
memcpy((void *)(uintptr_t)(ml_buff_desc)->data_addr, \
buf, frag_len); \
/* No unpack for root */ \
op->process_fn = NULL; \
} else { \
op->process_fn = mca_coll_ml_bcast_small_unpack_data; \
} \
} \
op->full_message.n_bytes_scheduled = frag_len; \
} while (0)
#define SMALL_BCAST 0
#define LARGE_BCAST (SMALL_BCAST + 1)
/* bcast data unpack */
static int mca_coll_ml_bcast_converter_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = 0;
mca_coll_ml_collective_operation_progress_t *next_op;
mca_coll_ml_module_t *ml_module =
(mca_coll_ml_module_t *) coll_op->coll_module;
size_t max_index =
ml_module->payload_block->num_banks * ml_module->payload_block->num_buffers_per_bank;
bool is_first = true;
int ret;
/* Check if the fragment was delivered in order */
if (coll_op->fragment_data.buffer_desc->buffer_index !=
coll_op->fragment_data.message_descriptor->next_expected_index) {
mca_coll_ml_collective_operation_progress_t *prev_coll_op = coll_op->prev_frag;
assert(NULL == prev_coll_op->next_to_process_frag);
/* make sure that the previous fragment holds a pointer to the
out-of-order one */
prev_coll_op->next_to_process_frag = coll_op;
assert(!(coll_op->pending & REQ_OUT_OF_ORDER));
coll_op->pending |= REQ_OUT_OF_ORDER;
/* we will unpack it later */
ML_VERBOSE(10, ("Get %d expecting %d previous %d",
coll_op->fragment_data.buffer_desc->buffer_index,
coll_op->fragment_data.message_descriptor->next_expected_index,
prev_coll_op->fragment_data.buffer_desc->buffer_index));
return ORTE_ERR_NO_MATCH_YET;
}
do {
iov.iov_len = coll_op->fragment_data.fragment_size;
iov.iov_base = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);
ML_VERBOSE(10, ("Data unpack with convertern index %d",
coll_op->fragment_data.buffer_desc->buffer_index));
opal_convertor_unpack(&coll_op->fragment_data.message_descriptor->recv_convertor,
&iov, &iov_count, &max_data);
/* update next index */
++coll_op->fragment_data.message_descriptor->next_expected_index;
if (coll_op->fragment_data.message_descriptor->next_expected_index >= max_index) {
coll_op->fragment_data.message_descriptor->next_expected_index = 0;
}
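/* next_expected_index cycles 0 .. max_index-1, i.e. over every buffer index in
 * the payload block (num_banks * num_buffers_per_bank). Fragments whose
 * buffer_index does not match are chained onto the previous fragment via
 * next_to_process_frag (see the check above) and unpacked here once the
 * in-order fragment arrives. */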
/* Return to queue if the packet is done;
the exception is the first packet, which we release later.
*/
next_op = coll_op->next_to_process_frag;
coll_op->next_to_process_frag = NULL;
if ((!is_first) &&
(0 != coll_op->fragment_data.offset_into_user_buffer)) {
assert(coll_op->pending & REQ_OUT_OF_ORDER);
coll_op->pending ^= REQ_OUT_OF_ORDER;
/* Pasha: On one hand - I'm not sure that conceptually this is the right place to call buffer recycling. Potentially,
coll_ml_fragment_completion_processing() sounds like the right place for out-of-order unpack/sync handling.
* On the other hand - non-contiguous data is not super common and we would like to minimize the effect on the critical path
* for non-contiguous datatypes. */
ret = mca_coll_ml_buffer_recycling(coll_op);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return OMPI_ERROR;
}
CHECK_AND_RECYCLE(coll_op);
}
coll_op = next_op;
is_first = false;
} while (NULL != coll_op);
return OMPI_SUCCESS;
}
static int mca_coll_ml_bcast_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
void * dest = (void *)((uintptr_t) coll_op->full_message.dest_user_addr +
(uintptr_t) coll_op->full_message.n_bytes_delivered);
void * src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);
memcpy(dest, src, coll_op->fragment_data.fragment_size);
return OMPI_SUCCESS;
}
static int mca_coll_ml_bcast_large_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
void * dest = (void *)((uintptr_t) coll_op->fragment_data.message_descriptor->dest_user_addr +
(uintptr_t) coll_op->fragment_data.offset_into_user_buffer);
void * src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);
memcpy(dest, src, coll_op->fragment_data.fragment_size);
return OMPI_SUCCESS;
}
static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
/* local variables */
int ret, frag_len;
size_t max_data = 0;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
mca_coll_ml_collective_operation_progress_t *new_op = NULL;
mca_coll_ml_task_setup_fn_t task_setup = NULL;
mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
/* Keep the pipeline filled with fragments */
while (coll_op->fragment_data.message_descriptor->n_active <
mca_coll_ml_component.pipeline_depth) {
/* If an active fragment happens to have completed the collective during
* a hop into the progress engine, then don't launch a new fragment,
* instead break and return.
*/
if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
== coll_op->fragment_data.message_descriptor->n_bytes_total) {
break;
}
/* Get an ml buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
if (OPAL_UNLIKELY(NULL == src_buffer_desc)) {
/* If there exist outstanding fragments, then break out
* and let an active fragment deal with this later,
* there are no buffers available.
*/
if (0 < coll_op->fragment_data.message_descriptor->n_active) {
return OMPI_SUCCESS;
}
/* It is useless to call progress from here: ml progress can't be
* executed, so the ml memsync call will not complete and no memory
* will be recycled. Instead we put the element on the list and
* progress it later, once memsync recycles some memory. */
/* The fragment is already on the list and
* we still have no ml resources;
* return busy */
if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
coll_op->pending |= REQ_OUT_OF_MEMORY;
opal_list_append(&ml_module->waiting_for_memory_list,
(opal_list_item_t *)coll_op);
}
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
/* Get a new collective descriptor and initialize it */
new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag
(ml_module, coll_op);
/* Compute the fragment size for this pass */
frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST);
/* Decide based on global flag, not variable one */
if (coll_op->fragment_data.message_descriptor->root) {
struct iovec iov;
uint32_t iov_count = 1;
/* OBJ_RETAIN(new_op->variable_fn_params.dtype); */
iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr;
iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST];
assert(0 != iov.iov_len);
max_data = ml_module->small_message_thresholds[BCOL_BCAST];
opal_convertor_pack(&new_op->fragment_data.message_descriptor->send_convertor,
&iov, &iov_count, &max_data);
new_op->process_fn = NULL;
new_op->variable_fn_params.root_flag = true;
new_op->variable_fn_params.root_route = NULL;
task_setup = OP_ML_MODULE(new_op)->
coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]->
task_setup_fn[COLL_ML_ROOT_TASK_FN];
} else {
new_op->process_fn = mca_coll_ml_bcast_converter_unpack_data;
new_op->variable_fn_params.root_flag = false;
new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;
task_setup = OP_ML_MODULE(new_op)->
coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]->
task_setup_fn[COLL_ML_GENERAL_TASK_FN];
max_data = ml_module->small_message_thresholds[BCOL_BCAST];
mca_coll_ml_convertor_get_send_frag_size(
ml_module, &max_data,
new_op->fragment_data.message_descriptor);
}
new_op->fragment_data.message_descriptor->n_bytes_scheduled += max_data;
new_op->fragment_data.fragment_size = max_data;
new_op->fragment_data.buffer_desc = src_buffer_desc;
/* Setup fragment specific data */
++(new_op->fragment_data.message_descriptor->n_active);
COLL_ML_SETUP_ORDERING_INFO(new_op, new_op,
new_op->fragment_data.message_descriptor->last_started_frag);
ML_VERBOSE(10, ("Start more, My index %d my prev %d",
new_op->fragment_data.buffer_desc->buffer_index,
new_op->prev_frag->fragment_data.buffer_desc->buffer_index));
ML_SET_VARIABLE_PARAMS_BCAST(
new_op,
OP_ML_MODULE(new_op),
frag_len,
MPI_BYTE,
src_buffer_desc,
0,
0,
frag_len,
src_buffer_desc->data_addr);
/* TBD: remove buffer_size */
new_op->variable_fn_params.buffer_size = coll_op->variable_fn_params.buffer_size;
new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
/* Set order info for the new frag if there is a bcol that needs ordering */
MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);
/* Launch this collective !! */
ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
ML_ERROR(("Failed to launch"));
return ret;
}
}
return OMPI_SUCCESS;
}
static int mca_coll_ml_bcast_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
/* local variables */
int ret;
int frag_len, current_coll_op = coll_op->fragment_data.current_coll_op;
size_t dt_size;
void *buf;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
mca_coll_ml_collective_operation_progress_t *new_op = NULL;
mca_coll_ml_task_setup_fn_t task_setup = NULL;
ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);
/* Keep the pipeline filled with fragments */
while (coll_op->fragment_data.message_descriptor->n_active <
coll_op->fragment_data.message_descriptor->pipeline_depth) {
/* If an active fragment happens to have completed the collective during
* a hop into the progress engine, then don't launch a new fragment,
* instead break and return.
*/
if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
== coll_op->fragment_data.message_descriptor->n_bytes_total) {
break;
}
/* Get an ml buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
if (NULL == src_buffer_desc) {
/* If there exist outstanding fragments, then break out
* and let an active fragment deal with this later,
* there are no buffers available.
*/
if (0 < coll_op->fragment_data.message_descriptor->n_active) {
return OMPI_SUCCESS;
}
/* It is useless to call progress from here: ml progress can't be
* executed, so the ml memsync call will not complete and no memory
* will be recycled. Instead we put the element on the list and
* progress it later, once memsync recycles some memory. */
/* The fragment is already on the list and
* we still have no ml resources;
* return busy */
if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
coll_op->pending |= REQ_OUT_OF_MEMORY;
opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
(opal_list_item_t *) coll_op);
} else {
ML_VERBOSE(10,("Out of resources %p", coll_op));
}
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
/* Get a new collective descriptor and initialize it */
new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag
(OP_ML_MODULE(coll_op), coll_op);
/* We need this address for pointer arithmetic in memcpy */
buf = coll_op->fragment_data.message_descriptor->dest_user_addr;
frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST);
/* Decide based on global flag, not variable one */
if (coll_op->fragment_data.message_descriptor->root) {
memcpy((void *)(uintptr_t)src_buffer_desc->data_addr,
(void *) ((uintptr_t) buf + (uintptr_t) coll_op->
fragment_data.message_descriptor->n_bytes_scheduled) , frag_len);
/* No unpack for root */
new_op->process_fn = NULL;
new_op->variable_fn_params.root_flag = true;
new_op->variable_fn_params.root_route = NULL;
task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]->
task_setup_fn[COLL_ML_ROOT_TASK_FN];
} else {
new_op->process_fn = mca_coll_ml_bcast_large_unpack_data;
new_op->variable_fn_params.root_flag = false;
new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;
task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]->
task_setup_fn[COLL_ML_GENERAL_TASK_FN];
}
/* Setup fragment specific data */
new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
new_op->fragment_data.buffer_desc = src_buffer_desc;
new_op->fragment_data.fragment_size = frag_len;
new_op->fragment_data.message_descriptor->n_active++;
ML_SET_VARIABLE_PARAMS_BCAST(
new_op,
OP_ML_MODULE(new_op),
frag_len,
MPI_BYTE,
src_buffer_desc,
0,
0,
frag_len,
src_buffer_desc->data_addr);
/* Fill in bcast specific arguments */
/* TBD: remove buffer_size */
new_op->variable_fn_params.buffer_size = coll_op->variable_fn_params.buffer_size;
new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
/* Set order info for the new frag if there is a bcol that needs ordering */
MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);
ML_VERBOSE(10, ("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d",
new_op->variable_fn_params.buffer_size ,
new_op->fragment_data.fragment_size,
new_op->fragment_data.message_descriptor->n_bytes_scheduled));
/* Launch this collective !! */
ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
ML_VERBOSE(10, ("Failed to launch"));
return ret;
}
}
return OMPI_SUCCESS;
}
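/* Fragmentation appears to be enabled when the function selected for
 * LARGE_BCAST lies below ML_BCAST_LARGE_DATA_KNOWN in the algorithm
 * enumeration, i.e. a small-data (fragmenting) variant was configured for
 * large messages. */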
#define BCAST_FRAGMENTATION_IS_ENABLED(module) \
(module->bcast_fn_index_table[LARGE_BCAST] < ML_BCAST_LARGE_DATA_KNOWN)
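/* Start an ML broadcast. Depending on the datatype layout and message size
 * this routine chooses between copy-in for small contiguous messages, a
 * fragmented pipeline for large contiguous messages (when fragmentation is
 * enabled), zero copy for large contiguous messages, and a convertor-based
 * path for non-contiguous datatypes. The full-message request is returned
 * through req. */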
static inline __opal_attribute_always_inline__
int parallel_bcast_start(void *buf, int count, struct ompi_datatype_t *dtype,
int root, mca_coll_base_module_t *module, ompi_request_t **req)
{
size_t pack_len = 0;
size_t dt_size = 0;
bool contig = false;
int bcast_index, n_fragments = 1;
mca_coll_ml_collective_operation_progress_t * coll_op = NULL;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
mca_coll_ml_task_setup_fn_t task_setup;
OPAL_PTRDIFF_TYPE lb, extent;
/* actual starting place of the user buffer (lb added) */
void *actual_buf;
ML_VERBOSE(10, ("Starting bcast, mca_coll_ml_bcast_uknown_root buf: %p", buf));
ompi_datatype_type_size(dtype, &dt_size);
pack_len = count * dt_size;
/* Setup data buffer */
ML_BUFFER_ALLOC_WAIT(ml_module, src_buffer_desc);
/* Get information about memory layout */
contig = opal_datatype_is_contiguous_memory_layout((opal_datatype_t *)dtype, count);
ompi_datatype_get_extent (dtype, &lb, &extent);
actual_buf = (void *) ((uintptr_t) buf + lb);
/* Allocate collective schedule and pack message */
if (contig) {
if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) {
assert(pack_len <= ml_module->payload_block->size_buffer);
bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST];
ML_VERBOSE(10, ("Contig + small message %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len,
pack_len, actual_buf, src_buffer_desc);
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype,
src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer,
(src_buffer_desc->data_addr));
} else if (BCAST_FRAGMENTATION_IS_ENABLED(ml_module)) {
            /* We moved the fragmentation decision from communicator creation time to
               runtime, since for large messages the latency is not so critical */
size_t n_dts_per_frag;
int frag_len, pipeline_depth = mca_coll_ml_component.pipeline_depth;
bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST];
ML_VERBOSE(10, ("Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
/* Calculate the number of fragments required for this message */
frag_len = (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_BCAST] ?
pack_len : (size_t) ml_module->small_message_thresholds[BCOL_BCAST]);
n_dts_per_frag = frag_len/dt_size;
n_fragments = (pack_len + dt_size*n_dts_per_frag - 1)/(dt_size*n_dts_per_frag);
pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);
ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len,
frag_len, actual_buf, src_buffer_desc);
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, (frag_len/dt_size), dtype,
src_buffer_desc, 0, 0, frag_len, (src_buffer_desc->data_addr));
coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_progress;
coll_op->full_message.pipeline_depth = pipeline_depth;
/* Initialize fragment specific information */
coll_op->fragment_data.current_coll_op = bcast_index;
/* coll_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; */
coll_op->fragment_data.fragment_size = frag_len;
coll_op->fragment_data.message_descriptor->n_active++;
/* should be removed */
coll_op->variable_fn_params.buffer_size = frag_len;
ML_VERBOSE(10, ("Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d",
coll_op->variable_fn_params.buffer_size,
coll_op->fragment_data.fragment_size));
} else {
bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST];
ML_VERBOSE(10, ("Contig + zero copy %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_bcast_functions[bcast_index],
actual_buf, actual_buf, pack_len,
0 /* offset for first pack */);
/* For large messages (bcast) this points to userbuf */
            /* Pasha: temporary workaround for basesmuma, userbuf should
               be removed */
coll_op->variable_fn_params.userbuf = buf;
coll_op->process_fn = NULL;
coll_op->full_message.n_bytes_scheduled = pack_len;
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype,
src_buffer_desc, 0, 0,
ml_module->payload_block->size_buffer, buf);
}
} else {
/* Non contiguous data type */
bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST];
ML_VERBOSE(10, ("NON Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_bcast_functions[bcast_index],
actual_buf, actual_buf, pack_len,
0 /* offset for first pack */);
if (OPAL_LIKELY(pack_len > 0)) {
size_t max_data = 0;
if (ompi_comm_rank(ml_module->comm) == root) {
struct iovec iov;
uint32_t iov_count = 1;
opal_convertor_copy_and_prepare_for_send(
ompi_mpi_local_convertor,
&dtype->super, count, buf, 0,
&coll_op->full_message.send_convertor);
opal_convertor_get_packed_size(&coll_op->full_message.send_convertor,
&coll_op->full_message.send_converter_bytes_packed);
coll_op->full_message.n_bytes_total =
coll_op->full_message.send_converter_bytes_packed;
iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr;
iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST];
max_data = ml_module->small_message_thresholds[BCOL_BCAST];
opal_convertor_pack(&coll_op->full_message.send_convertor,
&iov, &iov_count, &max_data);
coll_op->process_fn = NULL;
coll_op->full_message.n_bytes_scheduled = max_data;
                /* We need to prepare the data for future pipelined communication */
coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress;
coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth;
coll_op->full_message.root = true;
} else {
opal_convertor_copy_and_prepare_for_send(
ompi_mpi_local_convertor,
&dtype->super, count, NULL, 0,
&coll_op->full_message.dummy_convertor);
                /* In the non-root case we use it for the #bytes remaining to receive */
opal_convertor_get_packed_size(&coll_op->full_message.dummy_convertor,
&coll_op->full_message.send_converter_bytes_packed);
opal_convertor_copy_and_prepare_for_recv(
ompi_mpi_local_convertor,
&dtype->super, count, buf, 0,
&coll_op->full_message.recv_convertor);
opal_convertor_get_unpacked_size(&coll_op->full_message.recv_convertor,
&coll_op->full_message.recv_converter_bytes_packed);
coll_op->full_message.root = false;
coll_op->full_message.n_bytes_total =
coll_op->full_message.recv_converter_bytes_packed;
coll_op->process_fn = mca_coll_ml_bcast_converter_unpack_data;
coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress;
coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth;
max_data = ml_module->small_message_thresholds[BCOL_BCAST];
coll_op->full_message.dummy_conv_position = 0;
mca_coll_ml_convertor_get_send_frag_size(
ml_module, &max_data,
&coll_op->full_message);
coll_op->full_message.n_bytes_scheduled = max_data;
}
}
coll_op->fragment_data.current_coll_op = bcast_index;
coll_op->fragment_data.message_descriptor->n_active++;
coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled;
/* Set initial index */
coll_op->full_message.next_expected_index = src_buffer_desc->buffer_index;
/* Prepare linking information for future frags */
COLL_ML_SETUP_ORDERING_INFO(coll_op, coll_op, NULL);
/* Since the data is already packed we will use MPI_BYTE and byte count as datatype */
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, coll_op->full_message.n_bytes_scheduled, MPI_BYTE,
src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer,(src_buffer_desc->data_addr));
n_fragments = (coll_op->full_message.n_bytes_total +
ml_module->small_message_thresholds[BCOL_BCAST] - 1) / ml_module->small_message_thresholds[BCOL_BCAST];
}
coll_op->variable_fn_params.hier_factor = 1;
coll_op->fragment_data.buffer_desc = src_buffer_desc;
/* Set order info if there is a bcol needs ordering */
MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);
if (ompi_comm_rank(ml_module->comm) == root) {
coll_op->full_message.root =
coll_op->variable_fn_params.root_flag = true;
coll_op->variable_fn_params.root_route = NULL;
task_setup = ml_module->coll_ml_bcast_functions[bcast_index]->
task_setup_fn[COLL_ML_ROOT_TASK_FN];
} else {
coll_op->full_message.root =
coll_op->variable_fn_params.root_flag = false;
coll_op->variable_fn_params.root_route =
(NULL == coll_op->coll_schedule->topo_info->route_vector ?
NULL : &coll_op->coll_schedule->topo_info->route_vector[root]);
task_setup = ml_module->coll_ml_bcast_functions[bcast_index]->
task_setup_fn[COLL_ML_GENERAL_TASK_FN];
}
*req = &coll_op->full_message.super;
return mca_coll_ml_generic_collectives_launcher(coll_op, task_setup);
}
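/* Blocking broadcast entry point: launch the collective and wait on the
 * full-message request. */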
int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int ret;
ompi_request_t *req;
ret = parallel_bcast_start(buf, count, dtype, root, module, &req);
if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
ML_VERBOSE(10, ("Failed to launch"));
return ret;
}
/* Blocking bcast */
ompi_request_wait_completion(req);
ompi_request_free(&req);
ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known"));
return OMPI_SUCCESS;
}
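/* Non-blocking broadcast entry point: launch the collective and hand the
 * full-message request back to the caller. */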
int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype,
int root, struct ompi_communicator_t *comm,
ompi_request_t **req,
mca_coll_base_module_t *module)
{
int ret;
ret = parallel_bcast_start(buf, count, dtype, root, module, req);
if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
ML_VERBOSE(10, ("Failed to launch"));
return ret;
}
ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known"));
return OMPI_SUCCESS;
}
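/* Broadcast over a schedule fixed by the root's position in the hierarchy:
 * the bcol functions are invoked one after another from the calling context,
 * and the operation is appended to the component's sequential queue whenever
 * a bcol function cannot complete immediately. Only contiguous data is
 * supported on this path. */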
int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
/* local variables */
int ret, fn_idx;
size_t pack_len = 0;
size_t dt_size = 0;
mca_coll_ml_collective_operation_progress_t * coll_op = NULL;
mca_coll_ml_compound_functions_t *fixed_schedule;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
mca_bcol_base_coll_fn_desc_t *func;
OPAL_PTRDIFF_TYPE lb, extent;
/* actual starting place of the user buffer (lb added) */
void *actual_buf;
ML_VERBOSE(10, ("Starting static bcast, small messages"));
assert(NULL != dtype);
    /* Calculate the size of the data;
     * at this stage only contiguous data is supported */
ompi_datatype_type_size(dtype, &dt_size);
pack_len = count * dt_size;
ompi_datatype_get_extent (dtype, &lb, &extent);
actual_buf = (void *) ((uintptr_t) buf + lb);
/* Setup data buffer */
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
while (NULL == src_buffer_desc) {
opal_progress();
src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
}
/* Allocate collective schedule and pack message */
if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) {
        /* The length of the message cannot be larger than the ML buffer size */
assert(pack_len <= ml_module->payload_block->size_buffer);
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_bcast_functions[ML_BCAST_SMALL_DATA_SEQUENTIAL],
actual_buf, actual_buf, pack_len,
0 /* offset for first pack */);
if (ompi_comm_rank(comm) == root) {
/* single frag, pack the data */
memcpy((void *)(uintptr_t)src_buffer_desc->data_addr,
buf, pack_len);
/* No unpack for root */
coll_op->process_fn = NULL;
} else {
coll_op->process_fn = mca_coll_ml_bcast_small_unpack_data;
}
coll_op->variable_fn_params.sbuf =
src_buffer_desc->data_addr;
} else {
ML_VERBOSE(10, ("ML_BCAST_LARGE_DATA_KNOWN case."));
coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
ml_module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_SEQUENTIAL],
actual_buf, actual_buf, pack_len,
0 /* offset for first pack */);
/* For large messages (bcast) this points to userbuf */
        /* Pasha: temporary workaround for basesmuma, userbuf should
           be removed */
coll_op->variable_fn_params.userbuf =
coll_op->variable_fn_params.sbuf = actual_buf;
coll_op->process_fn = NULL;
}
/* Fill in the function arguments */
coll_op->variable_fn_params.sequence_num =
OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
coll_op->variable_fn_params.count = count;
coll_op->variable_fn_params.dtype = dtype;
coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
coll_op->variable_fn_params.src_desc = src_buffer_desc;
coll_op->variable_fn_params.sbuf_offset = 0;
coll_op->variable_fn_params.rbuf_offset = 0;
    /* pasha - why do we duplicate it? */
coll_op->fragment_data.buffer_desc = src_buffer_desc;
    /* pack data into payload buffer - NOTE: assume no fragmentation at this stage */
if (ompi_comm_rank(comm) == root) {
coll_op->variable_fn_params.root_flag = true;
coll_op->variable_fn_params.root_route =
&coll_op->coll_schedule->topo_info->route_vector[root];
coll_op->full_message.n_bytes_scheduled = pack_len;
} else {
coll_op->variable_fn_params.root_flag = false;
coll_op->variable_fn_params.root_route =
&coll_op->coll_schedule->topo_info->route_vector[root];
}
/* seems like we should fix a schedule here and now */
fixed_schedule = coll_op->coll_schedule->
comp_fn_arr[coll_op->variable_fn_params.root_route->level];
/* now we set this schedule as the compound function list */
coll_op->coll_schedule->component_functions = fixed_schedule;
coll_op->sequential_routine.current_active_bcol_fn = 0;
while (true) {
/* ready, aim, fire collective(s)!! */
fn_idx = coll_op->sequential_routine.current_active_bcol_fn;
func = fixed_schedule[fn_idx].bcol_function;
ret = func->coll_fn(&coll_op->variable_fn_params,
(struct mca_bcol_base_function_t *) &fixed_schedule[fn_idx].constant_group_data);
/* set the coll_fn_started flag to true */
if (BCOL_FN_COMPLETE == ret) {
/* done with this routine, bump the active counter */
coll_op->sequential_routine.current_active_bcol_fn++;
coll_op->variable_fn_params.root_flag = true;
/* check for collective completion */
if (coll_op->sequential_routine.current_active_bcol_fn ==
coll_op->coll_schedule->n_fns) {
/* handle fragment completion */
ret = coll_ml_fragment_completion_processing(coll_op);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing");
}
/* break out of while loop */
break;
}
} else {
            /* put the entire collective operation onto the sequential queue */
opal_list_append(&mca_coll_ml_component.sequential_collectives,
(opal_list_item_t *) coll_op);
break;
}
}
/* Blocking bcast */
ompi_request_wait_completion(&coll_op->full_message.super);
ompi_request_free((ompi_request_t **) &coll_op);
ML_VERBOSE(10, ("Bcast is done"));
return OMPI_SUCCESS;
}

View file

@ -1,552 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_ML_COLLS_H
#define MCA_COLL_ML_COLLS_H
#include "ompi_config.h"
#include "ompi/mca/bcol/bcol.h"
#define COLL_ML_FN_NAME_LEN 256
/* utility information used to coordinate activities, such as resource
* management between different functions in the hierarchy
*/
struct mca_coll_ml_utility_data_t {
/* RLG - temp fix !!!! - really need to remove this, but right now
do not want to change the signature of the collective primitives to
use coll_ml_utility_data_t rather than mca_bcol_base_function_t */
int dummy;
/* module */
struct mca_bcol_base_module_t *bcol_module;
/* */
int index_in_consecutive_same_bcol_calls;
/* number of times functions from this bcol are called in order */
int n_of_this_type_in_a_row;
/* number of times functions from this module are called
* in the collective operation. */
int n_of_this_type_in_collective;
int index_of_this_type_in_collective;
};
typedef struct mca_coll_ml_utility_data_t mca_coll_ml_utility_data_t;
/* forward declaration */
struct mca_coll_ml_collective_operation_progress_t;
struct mca_coll_ml_task_status_t;
typedef int (* mca_coll_ml_process_op_fn_t)
(struct mca_coll_ml_collective_operation_progress_t *coll_op);
typedef int (* mca_coll_ml_task_comp_fn_t)
(struct mca_coll_ml_task_status_t *coll_op);
typedef int (* mca_coll_ml_fragment_launch_fn_t)
( struct mca_coll_ml_collective_operation_progress_t *coll_op);
typedef int (* mca_coll_ml_sequential_task_setup_fn_t)
( struct mca_coll_ml_collective_operation_progress_t *coll_op);
/* This data structure defines the dependencies for a given
* compound operation. We will use this as a basis for implementing
* collective operations.
*/
struct mca_coll_ml_compound_functions_t {
/* label */
char fn_name[COLL_ML_FN_NAME_LEN];
/* hierarchy level that is used for this bcol */
int h_level;
/* the list of functions that make up this task */
/* coll_bcol_collective_description_t *bcol_function; */
mca_bcol_base_coll_fn_desc_t *bcol_function;
/* task completion function for this compound function */
mca_coll_ml_task_comp_fn_t task_comp_fn;
/* module specific information that is a constant on a per group
* basis
*/
mca_coll_ml_utility_data_t constant_group_data;
    /* number of dependencies to be satisfied before this function can be
     * started */
int num_dependencies;
/*
* number of notifications to perform on completion. The assumption
* is that a counter will be incremented.
*/
int num_dependent_tasks;
/*
     * pointers to counters that need to be updated. This assumes
     * an array of tasks is used to describe the ML level
     * collective operation, with these indices referencing elements
     * in this array.
*/
int *dependent_task_indices;
};
typedef struct mca_coll_ml_compound_functions_t mca_coll_ml_compound_functions_t;
/* Forward declaration for operation_description_t */
struct mca_coll_ml_module_t;
enum {
COLL_ML_GENERAL_TASK_FN,
COLL_ML_ROOT_TASK_FN,
COLL_ML_MAX_TASK_FN
};
enum {
SEQ_TASK_NOT_STARTED,
SEQ_TASK_PENDING,
SEQ_TASK_IN_PROG
};
typedef void (*mca_coll_ml_task_setup_fn_t) (struct mca_coll_ml_task_status_t *task_status, int index, struct mca_coll_ml_compound_functions_t *func);
/*
* Collective operation definition
*/
struct mca_coll_ml_collective_operation_description_t {
/*
     * Type of collective operation - there are two types:
     * 1) sequential progress through the collectives is sufficient
     * 2) general treatment, popping tasks onto execution queues is needed.
*/
int progress_type;
struct mca_coll_ml_topology_t *topo_info;
/*
* number of functions in collective operation
*/
int n_fns;
/*
* list of functions
*/
mca_coll_ml_compound_functions_t *component_functions;
/*
* array of lists of functions
*/
mca_coll_ml_compound_functions_t **comp_fn_arr;
/*
* indices into the list - fixes a sequential schedule
*/
int *sch_idx;
/*
* Task setup functions, so far we have only 3 - root and non-root
*/
mca_coll_ml_task_setup_fn_t task_setup_fn[COLL_ML_MAX_TASK_FN];
/* number of functions are called for bcols need ordering */
int n_fns_need_ordering;
};
typedef struct mca_coll_ml_collective_operation_description_t
mca_coll_ml_collective_operation_description_t;
/* Data structure used to track the state of individual bcol
* functions. This is used to track dependencies and completion
* to progress the ML level function correctly.
*
* mca_coll_ml_task_status_t will be associated with an
* mca_coll_ml_collective_operation_progress_t structure for
* the duration of the lifetime of a communicator.
* An array of task statuses will be stored with
* the mca_coll_ml_collective_operation_progress_t data structure, so
 * that the task status elements do not need to be moved back to
* a free list before they are re-used. When the ML level function
* is complete, all mca_coll_ml_task_status_t are available for
* re-use.
*/
struct mca_coll_ml_task_status_t{
/* need to move this between lists to progress this correctly */
opal_list_item_t item;
/* number of dependencies satisfied */
int n_dep_satisfied;
/* ***************************************************************
* Pasha:
     * I'm adding to the status: num_dependencies, num_dependent_tasks and
     * dependent_task_indices. The information originally resided in mca_coll_ml_compound_functions_t.
     * For collective operations with a static nature this is not a problem,
     * but for the Bcast operation run-time parameters, like root, actually
     * define the dependency. The rt prefix means run-time.
*/
    /* number of dependencies to be satisfied before this function can be
     * started */
int rt_num_dependencies;
/*
* number of notifications to perform on completion. The assumption
* is that a counter will be incremented.
*/
int rt_num_dependent_tasks;
/*
     * pointers to counters that need to be updated. This assumes
     * an array of tasks is used to describe the ML level
     * collective operation, with these indices referencing elements
     * in this array.
*/
int *rt_dependent_task_indices;
/*
*
* ***************************************************************/
/* index in collective schedule */
int my_index_in_coll_schedule;
/* function pointers */
mca_bcol_base_coll_fn_desc_t *bcol_fn;
/* association with a specific collective task - the ML
* mca_coll_ml_collective_operation_progress_t stores the
* specific function parameters */
struct mca_coll_ml_collective_operation_progress_t *ml_coll_operation;
mca_coll_ml_task_comp_fn_t task_comp_fn;
};
typedef struct mca_coll_ml_task_status_t mca_coll_ml_task_status_t;
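/* Reasons a fragment may be parked instead of progressed: it arrived out of
 * order, or no ML payload buffers were available at the time. */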
typedef enum mca_coll_ml_pending_type_t {
REQ_OUT_OF_ORDER = 1,
REQ_OUT_OF_MEMORY = 1 << 1
} mca_coll_ml_pending_type_t;
/* Forward declaration */
struct mca_bcol_base_payload_buffer_desc_t;
/* Data structure used to track ML level collective operation
* progress.
*/
struct mca_coll_ml_collective_operation_progress_t {
/* need this to put on a list properly */
/* Full message information */
struct full_message_t {
/* make this a list item */
ompi_request_t super;
        /* Next expected fragment.
         * It is used for controlling the order of the convertor unpack operation */
        size_t next_expected_index;
        /* Pointer to the last initialized fragment.
         * It is used for controlling the order of the convertor unpack operation */
struct mca_coll_ml_collective_operation_progress_t *last_started_frag;
/* destination data address in user memory */
void *dest_user_addr;
/* source data address in user memory */
const void *src_user_addr;
/* total message size */
size_t n_bytes_total;
/* per-process total message size - relevant for operations
* such as gather and scatter, where each rank has it's
* own unique data
*/
size_t n_bytes_per_proc_total;
size_t max_n_bytes_per_proc_total;
        /* data processed - from a local perspective */
size_t n_bytes_delivered;
/* current offset - where to continue with next fragment */
size_t n_bytes_scheduled;
/* number of fragments needed to process this message */
size_t n_fragments;
/* number of active frags */
int n_active;
/* actual pipeline depth */
int pipeline_depth;
/* am I the real root of the collective ? */
bool root;
/* collective fragment launcher */
mca_coll_ml_fragment_launch_fn_t fragment_launcher;
        /* is the data contiguous */
bool send_data_continguous;
bool recv_data_continguous;
/* data type count */
int64_t send_count;
int64_t recv_count;
/* extent of the data types */
size_t send_extent;
size_t recv_extent;
/* send data type */
struct ompi_datatype_t * send_data_type;
        /* needed for non-contiguous buffers */
size_t offset_into_send_buffer;
/* receive data type */
struct ompi_datatype_t * recv_data_type;
        /* needed for non-contiguous buffers */
size_t offset_into_recv_buffer;
        /* Convertors for non-contiguous data */
opal_convertor_t send_convertor;
opal_convertor_t recv_convertor;
/* Will be used by receiver for #bytes calc in the next frag */
opal_convertor_t dummy_convertor;
size_t dummy_conv_position;
/* Size of packed data */
size_t send_converter_bytes_packed;
size_t recv_converter_bytes_packed;
        /* In case ordering is needed: order number for the next frag */
int next_frag_num;
/* The variable is used by non-blocking memory synchronization code
* for caching bank index */
int bank_index_to_recycle;
/* need a handle for collective progress e.g. alltoall*/
bcol_fragment_descriptor_t frag_info;
} full_message;
/* collective operation being progressed */
mca_coll_ml_collective_operation_description_t *coll_schedule;
/* */
mca_coll_ml_process_op_fn_t process_fn;
mca_coll_base_module_t *coll_module;
/* If not null , we have to release next fragment */
struct mca_coll_ml_collective_operation_progress_t *next_to_process_frag;
/* pointer to previous fragment */
struct mca_coll_ml_collective_operation_progress_t *prev_frag;
    /* This flag marks that the fragment is pending, waiting
     * to be processed prior to recycling
*/
enum mca_coll_ml_pending_type_t pending;
/* Fragment data */
struct fragment_data_t {
/* current buffer pointer - offset (in bytes) into the user data */
size_t offset_into_user_buffer;
size_t offset_into_user_buffer_per_proc;
/* amount of data (in bytes) in this fragment - amount of data
* actually processed */
size_t fragment_size;
size_t per_rank_fragment_size;
size_t data_type_count_per_frag;
/* pointer to full message progress data */
struct full_message_t *message_descriptor;
/* ML buffer descriptor attached to this buffer */
struct mca_bcol_base_payload_buffer_desc_t *buffer_desc;
/* handle for collective progress, e.g. alltoall */
bcol_fragment_descriptor_t bcol_fragment_desc;
/* Which collective algorithm */
int current_coll_op;
} fragment_data;
/* specific function parameters */
/* the assumption is that the variable parameters passed into
* the ML level function will persist until the collective operation
* is complete. For a blocking function this is until the collective
* function is exited, and for nonblocking collective functions this
* is until test or wait completes the collective.
*/
int global_root;
bcol_function_args_t variable_fn_params;
struct{
/* current active function - for sequential algorithms */
int current_active_bcol_fn;
/* current function status - not started, or in progress.
* When the routine has completed, the active bcol index is
* incremented, so no need to keep track of a completed
* status.
*/
int current_bcol_status;
/* use this call back to setup algorithm specific info
after each level necessary
*/
mca_coll_ml_sequential_task_setup_fn_t seq_task_setup;
} sequential_routine;
struct{
/*
* BCOL function status - individual elements will be posted to
* ml level component queues, as appropriate.
*/
mca_coll_ml_task_status_t *status_array;
/* number of completed tasks - need this for collective completion.
* Resource completion is tracked by each BCOL module .
*/
int num_tasks_completed;
} dag_description;
};
typedef struct mca_coll_ml_collective_operation_progress_t
mca_coll_ml_collective_operation_progress_t;
OBJ_CLASS_DECLARATION(mca_coll_ml_collective_operation_progress_t);
#define OP_ML_MODULE(op) ((mca_coll_ml_module_t *)((op)->coll_module))
#define GET_COMM(op) ((OP_ML_MODULE(op))->comm)
#define IS_COLL_SYNCMEM(op) (ML_MEMSYNC == op->fragment_data.current_coll_op)
#define CHECK_AND_RECYCLE(op) \
do { \
if (0 == (op)->pending) { \
            /* Caching 2 values that we can't touch on op after returning it */    \
            /* back to the free list (the free list may release memory on destruct) */ \
struct ompi_communicator_t *comm = GET_COMM(op); \
bool is_coll_sync = IS_COLL_SYNCMEM(op); \
ML_VERBOSE(10, ("Releasing %p", op)); \
OMPI_REQUEST_FINI(&(op)->full_message.super); \
opal_free_list_return (&(((mca_coll_ml_module_t *)(op)->coll_module)-> \
coll_ml_collective_descriptors), \
(opal_free_list_item_t *)op); \
/* Special check for memory synchronization completion */ \
            /* We have to return it to the free list first, since the communicator */ \
            /* release may potentially trigger ML module destruction, and having */   \
            /* the element off the list could cause a memory leak. */                 \
if (OPAL_UNLIKELY(is_coll_sync)) { \
if (OMPI_COMM_IS_INTRINSIC(comm)) { \
opal_show_help("help-mpi-coll-ml.txt", \
"coll-ml-check-fatal-error", true, \
comm->c_name); \
ompi_mpi_abort(comm, 6); \
} else { \
opal_show_help("help-mpi-coll-ml.txt", \
"coll-ml-check-error", true, \
comm->c_name); \
/* After this point it is UNSAFE to touch ml module */ \
/* or communicator */ \
OBJ_RELEASE(comm); \
} \
} \
} \
} while (0)
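/* Ordering support: when one or more bcols in the schedule require ordered
 * delivery, every fragment is stamped with a monotonically increasing order
 * number taken from the topology, and the order number expected for the next
 * fragment is kept in the full-message descriptor. */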
#define MCA_COLL_ML_SET_ORDER_INFO(coll_progress, num_frags) \
do { \
mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \
bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \
variable_params->order_info.bcols_started = 0; \
variable_params->order_info.order_num = \
topo->topo_ordering_info.next_order_num; \
variable_params->order_info.n_fns_need_ordering = \
(coll_progress)->coll_schedule->n_fns_need_ordering; \
topo->topo_ordering_info.next_order_num += num_frags; \
(coll_progress)->fragment_data.message_descriptor->next_frag_num = \
variable_params->order_info.order_num + 1; \
} \
} while (0)
#define MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(coll_progress) \
do { \
mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \
if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \
bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
struct fragment_data_t *frag_data = &(coll_progress)->fragment_data; \
variable_params->order_info.bcols_started = 0; \
variable_params->order_info.order_num = frag_data->message_descriptor->next_frag_num; \
variable_params->order_info.n_fns_need_ordering = \
(coll_progress)->coll_schedule->n_fns_need_ordering; \
frag_data->message_descriptor->next_frag_num++; \
} \
} while (0)
#define MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule) \
do { \
int i; \
(schedule)->n_fns_need_ordering = 0; \
for (i = 0; i < (schedule)->n_fns; ++i) { \
mca_bcol_base_module_t *current_bcol = \
(schedule)->component_functions[i].constant_group_data.bcol_module; \
assert (NULL != current_bcol); \
if (current_bcol->bcol_component->need_ordering) { \
(schedule)->n_fns_need_ordering++; \
} \
} \
} while (0)
enum {
MCA_COLL_ML_NET_STREAM_SEND,
MCA_COLL_ML_NET_STREAM_RECV
};
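/* Thin wrappers around the OPAL convertor: prepare a convertor for the send
 * or receive stream and report the packed size, or pack/unpack one buffer
 * and report the number of bytes actually processed. */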
static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_prepare(ompi_datatype_t *dtype, int count, const void *buff,
opal_convertor_t *convertor, int stream)
{
size_t bytes_packed;
if (MCA_COLL_ML_NET_STREAM_SEND == stream) {
opal_convertor_copy_and_prepare_for_send(
ompi_mpi_local_convertor,
&dtype->super, count, buff, 0,
convertor);
} else {
opal_convertor_copy_and_prepare_for_recv(
ompi_mpi_local_convertor,
&dtype->super, count, buff, 0,
convertor);
}
opal_convertor_get_packed_size(convertor, &bytes_packed);
return bytes_packed;
}
static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_pack(void *data_addr, size_t buff_size,
opal_convertor_t *convertor)
{
struct iovec iov;
size_t max_data = 0;
uint32_t iov_count = 1;
iov.iov_base = (IOVBASE_TYPE*) data_addr;
iov.iov_len = buff_size;
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
return max_data;
}
static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_unpack(void *data_addr, size_t buff_size,
opal_convertor_t *convertor)
{
struct iovec iov;
size_t max_data = 0;
uint32_t iov_count = 1;
iov.iov_base = (void *) (uintptr_t) data_addr;
iov.iov_len = buff_size;
opal_convertor_unpack(convertor, &iov, &iov_count, &max_data);
return max_data;
}
#endif /* MCA_COLL_ML_COLLS_H */

View file

@ -1,449 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Most of the description of the data layout is in the
 * coll_ml_module.c file.
*/
#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include "ompi/constants.h"
#include "ompi/mca/coll/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/sbgp/base/base.h"
#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "ompi/patterns/net/netpatterns.h"
#include "coll_ml_mca.h"
#include "coll_ml_custom_utils.h"
/*
* Public string showing the coll ompi_ml V2 component version number
*/
const char *mca_coll_ml_component_version_string =
"Open MPI ml-V2 collective MCA component version " OMPI_VERSION;
/*
* Local functions
*/
static int ml_open(void);
static int ml_close(void);
static int coll_ml_progress(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
mca_coll_ml_component_t mca_coll_ml_component = {
/* First, fill in the super */
.super = {
/* First, the mca_component_t struct containing meta
information about the component itself */
.collm_version = {
MCA_COLL_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "ml",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component open, close, and register functions */
.mca_open_component = ml_open,
.mca_close_component = ml_close,
.mca_register_component_params = mca_coll_ml_register_params
},
.collm_data = {
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
/* Initialization / querying functions */
.collm_init_query = mca_coll_ml_init_query,
.collm_comm_query = mca_coll_ml_comm_query,
},
};
void mca_coll_ml_abort_ml(char *message)
{
ML_ERROR(("ML Collective FATAL ERROR: %s", message));
/* shutdown the MPI */
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_INTERN);
}
/*
* progress function
*/
#define INDEX(task) ((task)->my_index_in_coll_schedule)
#define ACTIVE_L (&mca_coll_ml_component.active_tasks)
#define PENDING_L (&mca_coll_ml_component.pending_tasks)
#define SEQ_L (&mca_coll_ml_component.sequential_collectives)
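/* Component-level progress: advance sequential collectives first, then the
 * active DAG tasks, and finally try to start pending DAG tasks whose
 * dependencies have been satisfied, moving them to the active list when a
 * bcol function reports that it has started. */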
static int coll_ml_progress()
{
int rc = OMPI_SUCCESS;
int fn_idx;
mca_coll_ml_task_status_t *task_status, *task_status_tmp;
mca_coll_ml_collective_operation_progress_t *seq_coll_op;
mca_coll_ml_collective_operation_progress_t *seq_coll_op_tmp;
mca_bcol_base_module_collective_fn_primitives_t progress_fn,
coll_fn;
mca_coll_ml_utility_data_t *const_args;
mca_coll_ml_component_t *cm = &mca_coll_ml_component;
    /* Pasha: Not sure that this is the correct way to resolve the problem.
       Iprobe calls the progress engine. The progress engine calls our
       progress, and as a result the first element on the list is progressed again,
       so we call Iprobe again... and as a result we get a HUGE stack.
       One way to prevent it - remove the item from the list, and once you finish
       processing it - put it back.
       Another way - put a flag on the component: if progress is already running, exit immediately.
    */
if (cm->progress_is_busy) {
/* We are already working...*/
return OMPI_SUCCESS;
} else {
cm->progress_is_busy = true;
}
/* progress sequential collective operations */
/* RLG - need to do better here for parallel progress */
OPAL_THREAD_LOCK(&(cm->sequential_collectives_mutex));
OPAL_LIST_FOREACH_SAFE(seq_coll_op, seq_coll_op_tmp, SEQ_L, mca_coll_ml_collective_operation_progress_t) {
do {
fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn;
/* initialize the task */
if (SEQ_TASK_IN_PROG == seq_coll_op->sequential_routine.current_bcol_status){
progress_fn = seq_coll_op->coll_schedule->
component_functions[fn_idx].bcol_function->progress_fn;
} else {
                /* PPP Pasha - apparently task setup should be called only here. see line 190 */
progress_fn = seq_coll_op->coll_schedule->
component_functions[fn_idx].bcol_function->coll_fn;
}
const_args = &seq_coll_op->coll_schedule->component_functions[fn_idx].constant_group_data;
            /* RLG - note: need to move to using coll_ml_utility_data_t as
* collective argument, rather than mca_bcol_base_function_t
*/
rc = progress_fn(&(seq_coll_op->variable_fn_params), (mca_bcol_base_function_t *)const_args);
if (BCOL_FN_COMPLETE == rc) {
/* done with this routine */
seq_coll_op->sequential_routine.current_active_bcol_fn++;
/* this is totally hardwired for bcast, need a general call-back */
fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn;
if (fn_idx == seq_coll_op->coll_schedule->n_fns) {
/* done with this collective - recycle descriptor */
/* remove from the progress list */
(void) opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op);
/* handle fragment completion */
rc = coll_ml_fragment_completion_processing(seq_coll_op);
if (OMPI_SUCCESS != rc) {
mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing");
}
} else {
rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op);
if (OMPI_SUCCESS != rc) {
mca_coll_ml_abort_ml("Failed to run sequential task setup");
}
seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
continue;
}
} else if (BCOL_FN_NOT_STARTED == rc) {
seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
} else if (BCOL_FN_STARTED == rc) {
seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG;
}
break;
} while (true);
}
OPAL_THREAD_UNLOCK(&(cm->sequential_collectives_mutex));
/* general dag's */
/* see if active tasks can be progressed */
OPAL_THREAD_LOCK(&(cm->active_tasks_mutex));
OPAL_LIST_FOREACH(task_status, ACTIVE_L, mca_coll_ml_task_status_t) {
/* progress task */
progress_fn = task_status->bcol_fn->progress_fn;
const_args = &task_status->ml_coll_operation->coll_schedule->
component_functions[INDEX(task_status)].constant_group_data;
rc = progress_fn(&(task_status->ml_coll_operation->variable_fn_params),
(mca_bcol_base_function_t *)const_args);
if (BCOL_FN_COMPLETE == rc) {
ML_VERBOSE(3, ("GOT BCOL_COMPLETED!!!!"));
rc = mca_coll_ml_task_completion_processing(&task_status, ACTIVE_L);
if (OMPI_SUCCESS != rc) {
mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
}
} else if (BCOL_FN_STARTED == rc) {
/* nothing to do */
} else {
mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
}
}
OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex));
/* see if new tasks can be initiated */
OPAL_THREAD_LOCK(&(cm->pending_tasks_mutex));
OPAL_LIST_FOREACH_SAFE(task_status, task_status_tmp, PENDING_L, mca_coll_ml_task_status_t) {
/* check to see if dependencies are satisfied */
int n_dependencies = task_status->rt_num_dependencies;
int n_dependencies_satisfied = task_status->n_dep_satisfied;
if (n_dependencies == n_dependencies_satisfied) {
/* initiate the task */
coll_fn = task_status->bcol_fn->coll_fn;
const_args = &task_status->ml_coll_operation->coll_schedule->
component_functions[INDEX(task_status)].constant_group_data;
rc = coll_fn(&(task_status->ml_coll_operation->variable_fn_params),
(mca_bcol_base_function_t *)const_args);
if (BCOL_FN_COMPLETE == rc) {
ML_VERBOSE(3, ("GOT BCOL_COMPLETED!"));
rc = mca_coll_ml_task_completion_processing(&task_status, PENDING_L);
if (OMPI_SUCCESS != rc) {
mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
}
} else if ( BCOL_FN_STARTED == rc ) {
ML_VERBOSE(3, ("GOT BCOL_STARTED!"));
(void) opal_list_remove_item(PENDING_L, (opal_list_item_t *)task_status);
/* RLG - is there potential for deadlock here ? Need to
* look at this closely
*/
OPAL_THREAD_LOCK(&(cm->active_tasks_mutex));
opal_list_append(ACTIVE_L, (opal_list_item_t *)task_status);
OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex));
} else if( BCOL_FN_NOT_STARTED == rc ) {
/* nothing to do */
ML_VERBOSE(10, ("GOT BCOL_FN_NOT_STARTED!"));
} else {
OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex));
/* error will be returned - RLG : need to reconsider return
* types - we have no way to convey error information
* the way the code is implemented now */
ML_VERBOSE(3, ("GOT error !"));
rc = OMPI_ERROR;
OMPI_ERRHANDLER_RETURN(rc,MPI_COMM_WORLD,rc,"Error returned from bcol function: aborting");
break;
}
}
}
OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex));
/* return */
cm->progress_is_busy = false;
return rc;
}
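/* Map the bcast algorithm chosen via MCA parameter onto the cached
 * per-collective configuration (small and large message entries). */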
static void adjust_coll_config_by_mca_param(void)
{
/* setting bcast mca params */
if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) {
mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_KNOWN;
mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_KNOWN;
} else if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) {
mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_SEQUENTIAL;
mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_SEQUENTIAL;
} else { /* Unknown root */
mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_UNKNOWN;
mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_UNKNOWN;
}
}
/*
* Open the component
*/
static int ml_open(void)
{
/* local variables */
int rc, c_idx, m_idx;
mca_coll_ml_component_t *cs = &mca_coll_ml_component;
/* set the starting sequence number */
cs->base_sequence_number = -1;
cs->progress_is_busy = false;
/* If the priority is zero (default) disable the component */
if (mca_coll_ml_component.ml_priority <= 0) {
return OMPI_ERR_NOT_AVAILABLE;
}
/* Init memory structures (no real memory is allocated) */
OBJ_CONSTRUCT(&cs->memory_manager, mca_coll_ml_lmngr_t);
if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_sbgp_base_framework, 0))) {
fprintf(stderr," failure in open mca_sbgp_base_open \n");
return rc;
}
if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_bcol_base_framework, 0))) {
fprintf(stderr," failure in open mca_bcol_base_open \n");
return rc;
}
/* Reset collective tunings cache */
for (c_idx = 0; c_idx < ML_NUM_OF_FUNCTIONS; c_idx++) {
for (m_idx = 0; m_idx < ML_NUM_MSG; m_idx++) {
mca_coll_ml_reset_config(&cs->coll_config[c_idx][m_idx]);
}
}
adjust_coll_config_by_mca_param();
/* Load configuration file and cache the configuration on component */
rc = mca_coll_ml_config_file_init();
if (OMPI_SUCCESS != rc) {
return OMPI_ERROR;
}
    /* register the progress function */
rc = opal_progress_register(coll_ml_progress);
if (OMPI_SUCCESS != rc ) {
fprintf(stderr," failed to register the ml progress function \n");
fflush(stderr);
return rc;
}
OBJ_CONSTRUCT(&(cs->pending_tasks_mutex), opal_mutex_t);
OBJ_CONSTRUCT(&(cs->pending_tasks), opal_list_t);
OBJ_CONSTRUCT(&(cs->active_tasks_mutex), opal_mutex_t);
OBJ_CONSTRUCT(&(cs->active_tasks), opal_list_t);
OBJ_CONSTRUCT(&(cs->sequential_collectives_mutex), opal_mutex_t);
OBJ_CONSTRUCT(&(cs->sequential_collectives), opal_list_t);
rc = netpatterns_init();
if (OMPI_SUCCESS != rc) {
return rc;
}
cs->topo_discovery_fn[COLL_ML_HR_FULL] =
mca_coll_ml_fulltree_hierarchy_discovery;
cs->topo_discovery_fn[COLL_ML_HR_ALLREDUCE] =
mca_coll_ml_allreduce_hierarchy_discovery;
cs->topo_discovery_fn[COLL_ML_HR_NBS] =
mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery;
cs->topo_discovery_fn[COLL_ML_HR_SINGLE_PTP] =
mca_coll_ml_fulltree_ptp_only_hierarchy_discovery;
cs->topo_discovery_fn[COLL_ML_HR_SINGLE_IBOFFLOAD] =
mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery;
cs->need_allreduce_support = false;
return OMPI_SUCCESS;
}
/*
* Close the component
*/
static int ml_close(void)
{
int ret;
mca_coll_ml_component_t *cs = &mca_coll_ml_component;
    /* There is no need to release/close resources if the
     * priority was set to zero */
if (cs->ml_priority <= 0) {
return OMPI_SUCCESS;
}
OBJ_DESTRUCT(&cs->memory_manager);
OBJ_DESTRUCT(&cs->pending_tasks_mutex);
OBJ_DESTRUCT(&cs->pending_tasks);
OBJ_DESTRUCT(&cs->active_tasks_mutex);
OBJ_DESTRUCT(&cs->active_tasks);
OBJ_DESTRUCT(&cs->sequential_collectives_mutex);
OBJ_DESTRUCT(&cs->sequential_collectives);
/* deregister progress function */
ret = opal_progress_unregister(coll_ml_progress);
if (OMPI_SUCCESS != ret ) {
OMPI_ERROR_LOG(ret);
return ret;
}
/* close the sbgp and bcol frameworks */
if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_sbgp_base_framework))) {
OMPI_ERROR_LOG(ret);
return ret;
}
if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bcol_base_framework))) {
OMPI_ERROR_LOG(ret);
return ret;
}
return OMPI_SUCCESS;
}
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_coll_ml_init_query(bool enable_progress_threads,
bool enable_mpi_threads)
{
int ret;
    /* at this stage there is no reason to disqualify this component */
    /* Add here bcol init and sbgp init */
ret = mca_sbgp_base_init(enable_progress_threads, enable_mpi_threads);
if (OMPI_SUCCESS != ret) {
return ret;
}
ret = mca_bcol_base_init(enable_progress_threads, enable_mpi_threads);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* done */
return OMPI_SUCCESS;
}

View file

@ -1,613 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_config.h"
#include "coll_ml_lex.h"
static char *key_buffer = NULL;
static size_t key_buffer_len = 0;
typedef struct section_config_t {
char *section_name;
int section_id;
per_collective_configuration_t config;
} section_config_t;
typedef struct coll_config_t {
char *coll_name;
int coll_id;
section_config_t section;
} coll_config_t;
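/* Helpers that translate the textual names found in the configuration file
 * (algorithm, hierarchy, message-size section and collective names) into
 * internal ids; ML_UNDEFINED is returned for anything unrecognized. */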
static int algorithm_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN"))
return ML_BCAST_SMALL_DATA_KNOWN;
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN"))
return ML_BCAST_SMALL_DATA_UNKNOWN;
if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_SEQUENTIAL"))
return ML_BCAST_SMALL_DATA_SEQUENTIAL;
if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_KNOWN"))
return ML_BCAST_LARGE_DATA_KNOWN;
if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_UNKNOWN"))
return ML_BCAST_LARGE_DATA_UNKNOWN;
if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_SEQUENTIAL"))
return ML_BCAST_LARGE_DATA_SEQUENTIAL;
if (!strcasecmp(name,"ML_N_DATASIZE_BINS"))
return ML_N_DATASIZE_BINS;
if (!strcasecmp(name,"ML_NUM_BCAST_FUNCTIONS"))
return ML_NUM_BCAST_FUNCTIONS;
if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_KNOWN"))
return ML_SCATTER_SMALL_DATA_KNOWN;
if (!strcasecmp(name,"ML_SCATTER_N_DATASIZE_BINS"))
return ML_SCATTER_N_DATASIZE_BINS;
if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_UNKNOWN"))
return ML_SCATTER_SMALL_DATA_UNKNOWN;
if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_SEQUENTIAL"))
return ML_SCATTER_SMALL_DATA_SEQUENTIAL;
if (!strcasecmp(name,"ML_NUM_SCATTER_FUNCTIONS"))
return ML_NUM_SCATTER_FUNCTIONS;
if (!strcasecmp(name,"ML_SMALL_DATA_ALLREDUCE"))
return ML_SMALL_DATA_ALLREDUCE;
if (!strcasecmp(name,"ML_LARGE_DATA_ALLREDUCE"))
return ML_LARGE_DATA_ALLREDUCE;
if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE"))
return ML_SMALL_DATA_ALLREDUCE;
if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE"))
return ML_LARGE_DATA_ALLREDUCE;
if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE"))
return ML_SMALL_DATA_REDUCE;
if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE"))
return ML_LARGE_DATA_REDUCE;
if (!strcasecmp(name,"ML_NUM_ALLREDUCE_FUNCTIONS"))
return ML_NUM_ALLREDUCE_FUNCTIONS;
if (!strcasecmp(name,"ML_SMALL_DATA_ALLTOALL"))
return ML_SMALL_DATA_ALLTOALL;
if (!strcasecmp(name,"ML_LARGE_DATA_ALLTOALL"))
return ML_LARGE_DATA_ALLTOALL;
if (!strcasecmp(name,"ML_NUM_ALLTOALL_FUNCTIONS"))
return ML_NUM_ALLTOALL_FUNCTIONS;
if (!strcasecmp(name,"ML_SMALL_DATA_ALLGATHER"))
return ML_SMALL_DATA_ALLGATHER;
if (!strcasecmp(name,"ML_LARGE_DATA_ALLGATHER"))
return ML_LARGE_DATA_ALLGATHER;
if (!strcasecmp(name,"ML_NUM_ALLGATHER_FUNCTIONS"))
return ML_NUM_ALLGATHER_FUNCTIONS;
if (!strcasecmp(name,"ML_SMALL_DATA_GATHER"))
return ML_SMALL_DATA_GATHER;
if (!strcasecmp(name,"ML_LARGE_DATA_GATHER"))
return ML_LARGE_DATA_GATHER;
if (!strcasecmp(name,"ML_NUM_GATHER_FUNCTIONS"))
return ML_NUM_GATHER_FUNCTIONS;
if (!strcasecmp(name,"ML_BARRIER_DEFAULT"))
return ML_BARRIER_DEFAULT;
/* ERROR */
return ML_UNDEFINED;
}
static int hierarchy_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "FULL_HR")) {
return COLL_ML_HR_FULL;
}
if (!strcasecmp(name, "FULL_HR_NO_BASESOCKET")) {
return COLL_ML_HR_NBS;
}
if (!strcasecmp(name, "PTP_ONLY")) {
return COLL_ML_HR_SINGLE_PTP;
}
if (!strcasecmp(name, "IBOFFLOAD_ONLY")) {
return COLL_ML_HR_SINGLE_IBOFFLOAD;
}
/* Error */
return ML_UNDEFINED;
}
static int section_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "SMALL")) {
return ML_SMALL_MSG;
}
if (!strcasecmp(name, "LARGE")) {
return ML_LARGE_MSG;
}
/* Error */
return ML_UNDEFINED;
}
static int coll_name_to_id(char *name)
{
assert (NULL != name);
if (!strcasecmp(name, "ALLGATHER")) {
return ML_ALLGATHER;
}
if (!strcasecmp(name, "ALLGATHERV")) {
return ML_ALLGATHERV;
}
if (!strcasecmp(name, "ALLREDUCE")) {
return ML_ALLREDUCE;
}
if (!strcasecmp(name, "ALLTOALL")) {
return ML_ALLTOALL;
}
if (!strcasecmp(name, "ALLTOALLV")) {
return ML_ALLTOALLV;
}
if (!strcasecmp(name, "ALLTOALLW")) {
return ML_ALLTOALLW;
}
if (!strcasecmp(name, "ALLTOALLW")) {
return ML_ALLTOALLW;
}
if (!strcasecmp(name, "BARRIER")) {
return ML_BARRIER;
}
if (!strcasecmp(name, "BCAST")) {
return ML_BCAST;
}
if (!strcasecmp(name, "EXSCAN")) {
return ML_EXSCAN;
}
if (!strcasecmp(name, "GATHER")) {
return ML_GATHER;
}
if (!strcasecmp(name, "GATHERV")) {
return ML_GATHERV;
}
if (!strcasecmp(name, "REDUCE")) {
return ML_REDUCE;
}
if (!strcasecmp(name, "REDUCE_SCATTER")) {
return ML_REDUCE_SCATTER;
}
if (!strcasecmp(name, "SCAN")) {
return ML_SCAN;
}
if (!strcasecmp(name, "SCATTER")) {
return ML_SCATTER;
}
if (!strcasecmp(name, "SCATTERV")) {
return ML_SCATTERV;
}
/* nonblocking functions */
if (!strcasecmp(name, "IALLGATHER")) {
return ML_IALLGATHER;
}
if (!strcasecmp(name, "IALLGATHERV")) {
return ML_IALLGATHERV;
}
if (!strcasecmp(name, "IALLREDUCE")) {
return ML_IALLREDUCE;
}
if (!strcasecmp(name, "IALLTOALL")) {
return ML_IALLTOALL;
}
if (!strcasecmp(name, "IALLTOALLV")) {
return ML_IALLTOALLV;
}
if (!strcasecmp(name, "IALLTOALLW")) {
return ML_IALLTOALLW;
}
if (!strcasecmp(name, "IALLTOALLW")) {
return ML_IALLTOALLW;
}
if (!strcasecmp(name, "IBARRIER")) {
return ML_IBARRIER;
}
if (!strcasecmp(name, "IBCAST")) {
return ML_IBCAST;
}
if (!strcasecmp(name, "IEXSCAN")) {
return ML_IEXSCAN;
}
if (!strcasecmp(name, "IGATHER")) {
return ML_IGATHER;
}
if (!strcasecmp(name, "IGATHERV")) {
return ML_IGATHERV;
}
if (!strcasecmp(name, "IREDUCE")) {
return ML_IREDUCE;
}
if (!strcasecmp(name, "IREDUCE_SCATTER")) {
return ML_IREDUCE_SCATTER;
}
if (!strcasecmp(name, "ISCAN")) {
return ML_ISCAN;
}
if (!strcasecmp(name, "ISCATTER")) {
return ML_ISCATTER;
}
if (!strcasecmp(name, "ISCATTERV")) {
return ML_ISCATTERV;
}
    /* Error - collective name was not matched */
return ML_UNDEFINED;
}
static int set_collective_name(coll_config_t *coll_config)
{
int coll_id =
coll_name_to_id(coll_ml_config_yytext);
if (ML_UNDEFINED == coll_id) {
return OMPI_ERROR;
}
coll_config->coll_id = coll_id;
coll_config->coll_name = strdup(coll_ml_config_yytext);
return OMPI_SUCCESS;
}
static int set_section_name(section_config_t *section_config)
{
int section_id;
section_id = section_name_to_id(coll_ml_config_yytext);
if (ML_UNDEFINED == section_id) {
return OMPI_ERROR;
}
section_config->section_id = section_id;
section_config->section_name = strdup(coll_ml_config_yytext);
return OMPI_SUCCESS;
}
void mca_coll_ml_reset_config(per_collective_configuration_t *config)
{
config->topology_id = ML_UNDEFINED;
config->threshold = ML_UNDEFINED;
config->algorithm_id = ML_UNDEFINED;
config->fragmentation_enabled = ML_UNDEFINED;
}
static void reset_section(section_config_t *section_cf)
{
if (section_cf->section_name) {
free (section_cf->section_name);
section_cf->section_name = NULL;
}
section_cf->section_id = ML_UNDEFINED;
mca_coll_ml_reset_config(&section_cf->config);
}
static void reset_collective(coll_config_t *coll_cf)
{
if (coll_cf->coll_name) {
free (coll_cf->coll_name);
coll_cf->coll_name = NULL;
}
coll_cf->coll_id = ML_UNDEFINED;
reset_section(&coll_cf->section);
}
/*
* String to integer;
*/
static int string_to_int(char *str)
{
while (isspace(*str)) {
++str;
}
/* Nope -- just decimal, so use atoi() */
return atoi(str);
}
static int parse_algorithm_key(section_config_t *section, char *value)
{
int ret;
ret = algorithm_name_to_id(value);
if (ML_UNDEFINED == ret) {
return OMPI_ERROR;
} else {
section->config.algorithm_id = ret;
}
return OMPI_SUCCESS;
}
static int parse_threshold_key(section_config_t *section, char *value)
{
assert (NULL != value);
if(!strcasecmp(value, "unlimited")) {
section->config.threshold = -1;
} else {
section->config.threshold = string_to_int(value);
}
return OMPI_SUCCESS;
}
static int parse_hierarchy_key(section_config_t *section, char *value)
{
int ret;
ret = hierarchy_name_to_id(value);
if (ML_UNDEFINED == ret) {
return OMPI_ERROR;
}
section->config.topology_id = ret;
return OMPI_SUCCESS;
}
static int parse_fragmentation_key(section_config_t *section, char *value)
{
assert (NULL != value);
if(!strcasecmp(value, "enable")) {
section->config.fragmentation_enabled = 1;
} else if (!strcasecmp(value, "disable")) {
section->config.fragmentation_enabled = 0;
} else {
ML_ERROR(("Line %d, unexpected fragmentation value %s. Legal values are: enable/disable",
coll_ml_config_yynewlines, value));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/* Save the configuration that has been collected so far */
static int save_settings(coll_config_t *coll_config)
{
per_collective_configuration_t *cf;
if (ML_UNDEFINED == coll_config->coll_id || ML_UNDEFINED == coll_config->section.section_id) {
return OMPI_ERROR;
}
cf = &mca_coll_ml_component.coll_config[coll_config->coll_id][coll_config->section.section_id];
cf->topology_id = coll_config->section.config.topology_id;
cf->threshold = coll_config->section.config.threshold;
cf->algorithm_id = coll_config->section.config.algorithm_id;
cf->fragmentation_enabled = coll_config->section.config.fragmentation_enabled;
return OMPI_SUCCESS;
}
/*
* Parse a single line
*/
static int parse_line(section_config_t *section)
{
int val, ret = OMPI_SUCCESS;
char *value = NULL;
    /* Save the key name */
if (key_buffer_len < strlen(coll_ml_config_yytext) + 1) {
char *tmp;
key_buffer_len = strlen(coll_ml_config_yytext) + 1;
tmp = (char *) realloc(key_buffer, key_buffer_len);
if (NULL == tmp) {
free(key_buffer);
key_buffer_len = 0;
key_buffer = NULL;
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
key_buffer = tmp;
}
strncpy(key_buffer, coll_ml_config_yytext, key_buffer_len);
/* The first thing we have to see is an "=" */
val = coll_ml_config_yylex();
if (coll_ml_config_parse_done || COLL_ML_CONFIG_PARSE_EQUAL != val) {
ML_ERROR(("Line %d, expected = before key: %s",
coll_ml_config_yynewlines,
key_buffer));
return OMPI_ERROR;
}
/* Next we get the value */
val = coll_ml_config_yylex();
if (COLL_ML_CONFIG_PARSE_SINGLE_WORD == val ||
COLL_ML_CONFIG_PARSE_VALUE == val) {
value = strdup(coll_ml_config_yytext);
if (NULL == value) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Now we need to see the newline */
val = coll_ml_config_yylex();
if (COLL_ML_CONFIG_PARSE_NEWLINE != val &&
COLL_ML_CONFIG_PARSE_DONE != val) {
ML_ERROR(("Line %d, expected new line after %s",
coll_ml_config_yynewlines,
key_buffer));
free(value);
return OMPI_ERROR;
}
}
/* If we did not get EOL or EOF, something is wrong */
else if (COLL_ML_CONFIG_PARSE_DONE != val &&
COLL_ML_CONFIG_PARSE_NEWLINE != val) {
ML_ERROR(("Line %d, expected new line or end of line",
coll_ml_config_yynewlines));
return OMPI_ERROR;
} else {
ML_ERROR(("Line %d malformed", coll_ml_config_yynewlines));
return OMPI_ERROR;
}
/* Line parsing is done, read the values */
if (!strcasecmp(key_buffer, "algorithm")) {
ret = parse_algorithm_key(section, value);
} else if (!strcasecmp(key_buffer, "threshold")) {
ret = parse_threshold_key(section, value);
} else if (!strcasecmp(key_buffer, "hierarchy")) {
ret = parse_hierarchy_key(section, value);
} else if (!strcasecmp(key_buffer, "fragmentation")) {
ret = parse_fragmentation_key(section, value);
/* Failed to parse the key */
} else {
ML_ERROR(("Line %d, unknown key %s",
coll_ml_config_yynewlines, key_buffer));
}
/* All done */
free(value);
return ret;
}
/**************************************************************************/
/*
* Parse a single file
*/
static int parse_file(char *filename)
{
int val;
int ret = OMPI_SUCCESS;
bool first_section = true, first_coll = true;
coll_config_t coll_config;
memset (&coll_config, 0, sizeof (coll_config));
reset_collective(&coll_config);
/* Open the file */
coll_ml_config_yyin = fopen(filename, "r");
if (NULL == coll_ml_config_yyin) {
ML_ERROR(("Failed to open config file %s", filename));
ret = OMPI_ERR_NOT_FOUND;
goto cleanup;
}
/* Do the parsing */
coll_ml_config_parse_done = false;
coll_ml_config_yynewlines = 1;
coll_ml_config_init_buffer(coll_ml_config_yyin);
while (!coll_ml_config_parse_done) {
val = coll_ml_config_yylex();
switch (val) {
case COLL_ML_CONFIG_PARSE_DONE:
case COLL_ML_CONFIG_PARSE_NEWLINE:
break;
case COLL_ML_CONFIG_PARSE_COLLECTIVE:
/* dump all the information to last section that was defined */
if (!first_coll) {
ret = save_settings(&coll_config);
if (OMPI_SUCCESS != ret) {
ML_ERROR(("Error in syntax for collective %s", coll_config.coll_name));
goto cleanup;
}
}
/* reset collective config */
reset_collective(&coll_config);
first_coll = false;
first_section = true;
ret = set_collective_name(&coll_config);
if (OMPI_SUCCESS != ret) {
goto cleanup;
}
break;
case COLL_ML_CONFIG_PARSE_SECTION:
if (ML_UNDEFINED == coll_config.coll_id) {
ML_ERROR(("Collective section wasn't defined !"));
ret = OMPI_ERROR;
goto cleanup;
}
if (!first_section) {
/* dump all the information to last section that was defined */
ret = save_settings(&coll_config);
if (OMPI_SUCCESS != ret) {
ML_ERROR(("Error in syntax for collective %s section %s", coll_config.coll_name,
coll_config.section.section_name));
goto cleanup;
}
}
first_section = false;
/* reset all section values */
reset_section(&coll_config.section);
/* set new section name */
ret = set_section_name(&coll_config.section);
if (OMPI_SUCCESS != ret) {
goto cleanup;
}
break;
case COLL_ML_CONFIG_PARSE_SINGLE_WORD:
if (ML_UNDEFINED == coll_config.coll_id ||
ML_UNDEFINED == coll_config.section.section_id) {
ML_ERROR(("Collective section or sub-section was not defined !"));
ret = OMPI_ERROR;
goto cleanup;
} else {
parse_line(&coll_config.section);
}
break;
default:
/* anything else is an error */
ML_ERROR(("Unexpected token!"));
ret = OMPI_ERROR;
goto cleanup;
break;
}
}
save_settings(&coll_config);
fclose(coll_ml_config_yyin);
coll_ml_config_yylex_destroy ();
ret = OMPI_SUCCESS;
cleanup:
reset_collective(&coll_config);
if (NULL != key_buffer) {
free(key_buffer);
key_buffer = NULL;
key_buffer_len = 0;
}
return ret;
}
int mca_coll_ml_config_file_init(void)
{
return parse_file(mca_coll_ml_component.config_file_name);
}
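For reference, the parser above reads a file of collective sections, each containing one or more message-size sub-sections with algorithm/threshold/hierarchy/fragmentation keys. A minimal sketch of such a file follows, assuming bracketed collective names and angle-bracketed sub-section names; the exact delimiters and the legal hierarchy names come from the accompanying flex lexer and hierarchy table, so treat every name below as illustrative only.

[ALLREDUCE]
<SMALL_MSG>
algorithm = ML_SMALL_DATA_ALLREDUCE
hierarchy = full_hr
threshold = 256
fragmentation = disable
<LARGE_MSG>
algorithm = ML_LARGE_DATA_ALLREDUCE
threshold = unlimited
fragmentation = enable

Unknown keys are rejected in parse_line(), and settings are only saved once both a collective name and a sub-section name have been seen (save_settings() returns an error otherwise).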

View file

@@ -1,23 +0,0 @@
#ifndef COLL_ML_CONFIG_H_
#define COLL_ML_CONFIG_H_
#include "opal_config.h"
#include <stdio.h>
BEGIN_C_DECLS
#define ML_UNDEFINED -1
struct per_collective_configuration_t {
int topology_id;
int threshold;
int algorithm_id;
int fragmentation_enabled;
};
typedef struct per_collective_configuration_t per_collective_configuration_t;
void mca_coll_ml_reset_config(per_collective_configuration_t *config);
int mca_coll_ml_config_file_init(void);
END_C_DECLS
#endif

View file

@@ -1,131 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
#include "coll_ml_colls.h"
#include <unistd.h>
#include <sys/uio.h>
/* This routine re-orders and packs user data. The assumption is that
* there is per-process data, the amount of data is the same for all
* ranks, and the user data is contiguous.
*/
int mca_coll_ml_pack_reorder_contiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int i, rank;
void *user_buf, *library_buf;
size_t bytes_per_proc;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)
coll_op->coll_module;
mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;
ptrdiff_t ptr_dif;
/* get the offset into each process's data. The assumption is that
* we are manipulating the same amount of data for each process.
*/
/* figure out how much data per-proc to copy */
bytes_per_proc=coll_op->fragment_data.per_rank_fragment_size;
/* loop over all the ranks in the communicator */
for( i=0 ; i < ompi_comm_size(ml_module->comm) ; i++ ) {
/* look up the rank of the i'th element in the sorted list */
rank = topo_info->sort_list[i];
/* get the pointer to user data */
user_buf=(void *)coll_op->full_message.src_user_addr;
/* compute offset into the user buffer */
/* offset for data already processed */
ptr_dif=rank*coll_op->full_message.n_bytes_per_proc_total+
coll_op->fragment_data.offset_into_user_buffer_per_proc;
user_buf=(void *) ((char *)user_buf+ptr_dif);
/* get the pointer to the ML buffer */
library_buf= (void *)
((char *)coll_op->variable_fn_params.src_desc->data_addr+i*bytes_per_proc);
/* copy the data */
memcpy(library_buf, user_buf, bytes_per_proc);
}
return OMPI_SUCCESS;
}
/* This routine re-orders and packs user data. The assumption is that
* there is per-process data, the amount of data is the same for all
* ranks, and the user data is non-contiguous (described by a send
* count and extent).
*/
int mca_coll_ml_pack_reorder_noncontiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
int i, rank;
void *user_buf, *library_buf;
size_t bytes_per_proc;
ptrdiff_t ptr_dif;
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)
coll_op->coll_module;
mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;
/* get the offset into each process's data. The assumption is that
* we are manipulating the same amount of data for each process.
*/
/* figure out how much data per-proc to copy */
bytes_per_proc = coll_op->fragment_data.per_rank_fragment_size;
/* loop over all the ranks in the communicator */
for(i = 0; i < ompi_comm_size(ml_module->comm); i++ ) {
/* look up the rank of the i'th element in the sorted list */
rank = topo_info->sort_list[i];
/* get the pointer to user data */
user_buf=(void *)coll_op->full_message.src_user_addr;
/* compute offset into the user buffer */
/* offset for data already processed */
ptr_dif=rank*coll_op->full_message.send_count*
coll_op->full_message.send_extent+
coll_op->fragment_data.offset_into_user_buffer_per_proc;
user_buf=(void *) ((char *)user_buf+ptr_dif);
/* get the pointer to the ML buffer */
library_buf= (void *)
((char *)coll_op->variable_fn_params.src_desc->data_addr+i*bytes_per_proc);
/* copy the data */
memcpy(library_buf, user_buf, bytes_per_proc);
}
return OMPI_SUCCESS;
}
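As a hedged illustration of the address arithmetic shared by the two packing routines above: for each slot i the destination offset in the ML buffer is contiguous (i * bytes_per_proc), while the source offset in the user buffer is derived from the sorted rank plus the amount of data already fragmented. The sketch below restates that computation with standalone placeholder names; they are not fields taken verbatim from the structures above.

/* Illustrative only: copy one per-process slice into ML-buffer slot i. */
size_t user_offset    = (size_t) rank * per_proc_stride   /* n_bytes_per_proc_total, or send_count * send_extent */
                      + offset_already_fragmented;        /* offset_into_user_buffer_per_proc */
size_t library_offset = (size_t) i * bytes_per_proc;
memcpy((char *) library_base + library_offset,
       (char *) user_base + user_offset,
       bytes_per_proc);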

View file

@@ -1,139 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#include "ompi_config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "opal/util/output.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_object.h"
#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/threads/mutex.h"
#include "opal/sys/atomic.h"
#include "ompi/op/op.h"
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"
#include "ompi/patterns/comm/coll_ops.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/base/base.h"
#include "coll_ml_custom_utils.h"
/*
* Local types
*/
struct avail_coll_t {
opal_list_item_t super;
int ac_priority;
mca_coll_base_module_2_1_0_t *ac_module;
};
typedef struct avail_coll_t avail_coll_t;
/*
* Stuff for the OBJ interface.
*
* If topo_index == COLL_ML_TOPO_MAX the check loops over all possible
* topologies, otherwise it looks only in the topology that was specified.
*/
int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module,
int topo_index)
{
int i, rc, hier, *ranks_in_comm,
is_used = 0,
comm_size = ompi_comm_size(ml_module->comm);
int n_hier, tp , max_tp;
const mca_coll_ml_topology_t *topo_info;
ranks_in_comm = (int *) malloc(comm_size * sizeof(int));
if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
ML_ERROR(("Memory allocation failed."));
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM);
/* not reached, but returning here silences a clang warning */
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < comm_size; ++i) {
ranks_in_comm[i] = i;
}
if (COLL_ML_TOPO_MAX == topo_index) {
tp = 0;
max_tp = COLL_ML_TOPO_MAX;
} else {
tp = topo_index;
max_tp = topo_index + 1;
}
for (; tp < max_tp; tp++) {
topo_info = &ml_module->topo_list[tp];
n_hier = topo_info->n_levels;
for (hier = 0; hier < n_hier; ++hier) {
hierarchy_pairs *pair = &topo_info->component_pairs[hier];
mca_bcol_base_component_t *b_cm = pair->bcol_component;
if(0 == strcmp(bcol_name,
b_cm->bcol_version.mca_component_name)) {
is_used = 1;
break;
}
}
}
rc = comm_allreduce_pml(&is_used, &is_used, 1, MPI_INT,
ompi_comm_rank(ml_module->comm), MPI_MAX,
comm_size, ranks_in_comm, ml_module->comm);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
ML_ERROR(("comm_allreduce_pml failed."));
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_OP);
}
free(ranks_in_comm);
return is_used;
}
/* Unlike the function above, this one only checks whether the user requested
* the bcol component, not whether it is actually used in any topology. */
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
{
mca_base_component_list_item_t *bcol_comp;
ML_VERBOSE(10, ("Loop over bcol components"));
OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
if(0 == strcmp(component_name,
((mca_bcol_base_component_2_0_0_t *)
bcol_comp->cli_component)->bcol_version.mca_component_name)) {
return true;
}
}
/* the component was not requested */
return false;
}
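A hedged usage sketch for the two checks above: callers typically gate component-specific setup on whether a given bcol is requested by the user and actually instantiated in some topology. The bcol name and the surrounding condition below are illustrative assumptions, not code taken from this file.

/* Hypothetical caller: enable an optimization only when the "basesmuma"
 * bcol is both requested and present in at least one topology. */
if (mca_coll_ml_check_if_bcol_is_requested("basesmuma") &&
    mca_coll_ml_check_if_bcol_is_used("basesmuma", ml_module, COLL_ML_TOPO_MAX)) {
    /* ... shared-memory specific setup ... */
}

Note that mca_coll_ml_check_if_bcol_is_used() performs an allreduce over the communicator, so all ranks must call it collectively.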

View file

@@ -1,28 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#ifndef MCA_COLL_ML_CUSTOM_UTILS_H
#define MCA_COLL_ML_CUSTOM_UTILS_H
#include "ompi_config.h"
#include "coll_ml.h"
/* the function is used to check if the bcol name is used in this ml module */
int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module,
int topo_index);
/* The function is used to check if the bcol component was REQUESTED by user */
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name);
END_C_DECLS
#endif /* MCA_COLL_ML_CUSTOM_UTILS_H */

View file

@@ -1,60 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_ml.h"
#include "coll_ml_inlines.h"
static inline void mca_coll_ml_fragment_constructor(mca_coll_ml_fragment_t *frag)
{
frag->fn_args = NULL;
}
static inline void mca_coll_ml_fragment_destructor(mca_coll_ml_fragment_t *frag)
{
if (frag->fn_args) {
free(frag->fn_args);
frag->fn_args = NULL;
}
}
static inline void mca_coll_ml_descriptor_constructor(mca_coll_ml_descriptor_t *descriptor)
{
OBJ_CONSTRUCT(&(descriptor->fragment),mca_coll_ml_fragment_t);
/* this fragment is always associated with this message descriptor */
descriptor->fragment.full_msg_descriptor = descriptor;
}
static inline void mca_coll_ml_descriptor_destructor(mca_coll_ml_descriptor_t *descriptor)
{
OBJ_DESTRUCT(&(descriptor->fragment));
}
OBJ_CLASS_INSTANCE(
mca_coll_ml_fragment_t,
opal_list_item_t,
mca_coll_ml_fragment_constructor,
mca_coll_ml_fragment_destructor);
OBJ_CLASS_INSTANCE(
mca_coll_ml_descriptor_t,
ompi_request_t,
mca_coll_ml_descriptor_constructor,
mca_coll_ml_descriptor_destructor);
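The OBJ_CLASS_INSTANCE declarations above register these constructor/destructor pairs with OPAL's object system; instances are then managed through the usual reference-counting macros. A minimal lifecycle sketch, assuming standard OPAL semantics:

/* Illustrative lifecycle only. */
mca_coll_ml_descriptor_t *desc = OBJ_NEW(mca_coll_ml_descriptor_t);  /* runs the constructor */
/* ... desc->fragment.full_msg_descriptor already points back at desc ... */
OBJ_RELEASE(desc);  /* runs the destructor when the reference count reaches zero */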

View file

@@ -1,132 +0,0 @@
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file */
#ifndef MCA_COLL_ML_FUNCTIONS_H
#define MCA_COLL_ML_FUNCTIONS_H
#include "ompi_config.h"
BEGIN_C_DECLS
#define ML_MEMSYNC -100
enum {
ML_BARRIER_DEFAULT
};
/* small data algorithm */
/* broadcast functions */
enum {
/* small data algorithm */
ML_BCAST_SMALL_DATA_KNOWN,
/* small data - dynamic decision making supported */
ML_BCAST_SMALL_DATA_UNKNOWN,
/* Sequential algorithm */
ML_BCAST_SMALL_DATA_SEQUENTIAL,
ML_BCAST_LARGE_DATA_KNOWN,
ML_BCAST_LARGE_DATA_UNKNOWN,
ML_BCAST_LARGE_DATA_SEQUENTIAL,
/* marker - all routines above this are expected to be used in
* selection logic that is based on size of the data */
ML_N_DATASIZE_BINS,
/* number of functions - this count also includes the marker entries */
ML_NUM_BCAST_FUNCTIONS
};
/* scatter functions */
enum {
/* small data algorithm */
ML_SCATTER_SMALL_DATA_KNOWN,
/* marker - all routines above this are expected to be used in
* selection logic that is based on size of the data */
ML_SCATTER_N_DATASIZE_BINS,
/* small data - dynamic decision making supported */
ML_SCATTER_SMALL_DATA_UNKNOWN,
/* Sequential algorithm */
ML_SCATTER_SMALL_DATA_SEQUENTIAL,
/* number of functions - this count also includes the marker entries */
ML_NUM_SCATTER_FUNCTIONS
};
/* Allreduce functions */
enum {
/* small data algorithm */
ML_SMALL_DATA_ALLREDUCE,
/* Large data algorithm */
ML_LARGE_DATA_ALLREDUCE,
/* If some of the bcols do not support
all possible types, use these extra algorithms */
/* small data algorithm */
ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,
/* large data algorithm */
ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE,
/* number of functions */
ML_NUM_ALLREDUCE_FUNCTIONS
};
/* Reduce functions */
enum {
/* small data algorithm */
ML_SMALL_DATA_REDUCE,
/* Large data algorithm */
ML_LARGE_DATA_REDUCE,
/* number of functions */
ML_NUM_REDUCE_FUNCTIONS
};
/* Alltoall functions */
enum {
/* small data algorithm */
ML_SMALL_DATA_ALLTOALL,
/* large all to all */
ML_LARGE_DATA_ALLTOALL,
/* number of functions */
ML_NUM_ALLTOALL_FUNCTIONS
};
/* Allgather functions */
enum {
/* small data */
ML_SMALL_DATA_ALLGATHER,
/* large data */
ML_LARGE_DATA_ALLGATHER,
/* number of functions */
ML_NUM_ALLGATHER_FUNCTIONS
};
/* gather functions */
enum {
/* small data */
ML_SMALL_DATA_GATHER,
/* large data */
ML_LARGE_DATA_GATHER,
/* number of functions */
ML_NUM_GATHER_FUNCTIONS
};
END_C_DECLS
#endif /* MCA_COLL_ML_FUNCTIONS_H */
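The *_SMALL_DATA_* / *_LARGE_DATA_* values above serve as indices into per-collective function tables, and selection logic picks an index by comparing the message size against a configured threshold (such as the one parsed from the config file earlier in this change). A hedged sketch of that pattern; the table and threshold names here are assumptions, not declarations from this header:

/* Illustrative selection only. */
int fn_index = (msg_size <= small_msg_threshold)
                   ? ML_BCAST_SMALL_DATA_KNOWN
                   : ML_BCAST_LARGE_DATA_KNOWN;
/* bcast_functions[fn_index] would then be invoked (hypothetical table name). */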

Some files were not shown because too many files changed in this diff.