Merge pull request #2658 from rhc54/topic/removal
Remove the bcol, coll/ml, and sbgp code as stale and lacking a maintainer
This commit is contained in:
Commit
5737a45b35
@@ -1,35 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# main library setup
noinst_LTLIBRARIES = libmca_bcol.la
libmca_bcol_la_SOURCES =

# header setup
nobase_ompi_HEADERS =
nobase_nodist_ompi_HEADERS =

# local files
headers = bcol.h
libmca_bcol_la_SOURCES += $(headers) $(nodist_headers)

# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_ompi_HEADERS += $(headers)
nobase_nodist_ompi_HEADERS += $(nodist_headers)
ompidir = $(ompiincludedir)/ompi/mca/bcol
else
ompidir = $(includedir)
endif

include base/Makefile.am

distclean-local:
	rm -f base/static-components.h
@@ -1,16 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

headers += \
	base/base.h
libmca_bcol_la_SOURCES += \
	base/bcol_base_frame.c \
	base/bcol_base_init.c
@@ -1,49 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_BASE_H
#define MCA_BCOL_BASE_H

#include "ompi_config.h"

#include "ompi/mca/mca.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/bcol/bcol.h"

/*
 * Global functions for BCOL
 */

BEGIN_C_DECLS

OMPI_DECLSPEC extern opal_list_t mca_bcol_base_components_in_use;
OMPI_DECLSPEC extern char *ompi_bcol_bcols_string;

OMPI_DECLSPEC extern mca_base_framework_t ompi_bcol_base_framework;

OMPI_DECLSPEC int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads);

struct mca_bcol_base_module_t;
OMPI_DECLSPEC int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module);

OMPI_DECLSPEC int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module);

OMPI_DECLSPEC int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module);

OMPI_DECLSPEC int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module,
                                               mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs,
                                               mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs,
                                               mca_bcol_base_module_collective_fn_primitives_t bcol_fn,
                                               mca_bcol_base_module_collective_fn_primitives_t progress_fn);

END_C_DECLS

#endif /* MCA_BCOL_BASE_H */
@@ -1,374 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include <stdio.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"

#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/include/ompi/constants.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/class/opal_list.h"

/*
 * The following file was created by configure. It contains extern
 * statements and the definition of an array of pointers to each
 * component's public mca_base_component_t struct.
 */
#include "ompi/mca/bcol/base/static-components.h"

static int mca_bcol_base_open(mca_base_open_flag_t flags);
static int mca_bcol_base_close (void);
static int mca_bcol_base_register(mca_base_register_flag_t flags);

/*
 * Global variables
 */
MCA_BASE_FRAMEWORK_DECLARE(ompi, bcol, NULL, mca_bcol_base_register, mca_bcol_base_open, mca_bcol_base_close,
                           mca_bcol_base_static_components, 0);

OMPI_DECLSPEC opal_list_t mca_bcol_base_components_in_use = {{0}};
OMPI_DECLSPEC char *ompi_bcol_bcols_string = NULL;
OMPI_DECLSPEC int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE] = {{0}};
OMPI_DECLSPEC int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE] = {{0}};

static void bcol_base_module_constructor(mca_bcol_base_module_t *module)
{
    int fnc;

    module->bcol_component = NULL;
    module->network_context = NULL;
    module->context_index = -1;
    module->supported_mode = 0;
    module->init_module = NULL;
    module->sbgp_partner_module = NULL;
    module->squence_number_offset = 0;
    module->n_poll_loops = 0;

    for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
        module->bcol_function_table[fnc] = NULL;
        module->small_message_thresholds[fnc] = BCOL_THRESHOLD_UNLIMITED;
    }

    module->set_small_msg_thresholds = NULL;

    module->header_size = 0;
    module->bcol_memory_init = NULL;

    module->next_inorder = NULL;

    mca_bcol_base_fn_table_construct(module);
}

static void bcol_base_module_destructor(mca_bcol_base_module_t *module)
{
    int fnc;

    module->bcol_component = NULL;

    module->context_index = -1;
    module->init_module = NULL;
    module->sbgp_partner_module = NULL;
    module->squence_number_offset = 0;
    module->n_poll_loops = 0;

    for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
        module->bcol_function_table[fnc] = NULL;
    }

    module->bcol_memory_init = NULL;
}

OBJ_CLASS_INSTANCE(mca_bcol_base_module_t,
                   opal_object_t,
                   bcol_base_module_constructor,
                   bcol_base_module_destructor);

static void bcol_base_network_context_constructor(bcol_base_network_context_t *nc)
{
    nc->context_id = -1;
    nc->context_data = NULL;
}

static void bcol_base_network_context_destructor(bcol_base_network_context_t *nc)
{
    nc->context_id = -1;
    nc->context_data = NULL;
    nc->register_memory_fn = NULL;
    nc->deregister_memory_fn = NULL;
}

OBJ_CLASS_INSTANCE(bcol_base_network_context_t,
                   opal_object_t,
                   bcol_base_network_context_constructor,
                   bcol_base_network_context_destructor);

/* get list of subgrouping components to use */
static int mca_bcol_base_set_components_to_use(opal_list_t *bcol_components_avail,
                                               opal_list_t *bcol_components_in_use)
{
    /* local variables */
    const mca_base_component_t *b_component;

    mca_base_component_list_item_t *b_cli;
    mca_base_component_list_item_t *b_clj;

    char **bcols_requested;
    const char *b_component_name;

    /* split the request for the bcol modules */
    bcols_requested = opal_argv_split(ompi_bcol_bcols_string, ',');
    if (NULL == bcols_requested) {
        return OMPI_ERROR;
    }

    /* Initialize list */
    OBJ_CONSTRUCT(bcol_components_in_use, opal_list_t);

    /* figure out basic collective modules to use */
    /* loop over list of components requested */
    for (int i = 0 ; bcols_requested[i] ; ++i) {
        /* loop over discovered components */
        OPAL_LIST_FOREACH(b_cli, bcol_components_avail, mca_base_component_list_item_t) {
            b_component = b_cli->cli_component;
            b_component_name = b_component->mca_component_name;

            if (0 == strcmp (b_component_name, bcols_requested[i])) {
                /* found selected component */
                b_clj = OBJ_NEW(mca_base_component_list_item_t);
                if (NULL == b_clj) {
                    opal_argv_free (bcols_requested);
                    return OPAL_ERR_OUT_OF_RESOURCE;
                }

                b_clj->cli_component = b_component;
                opal_list_append(bcol_components_in_use,
                                 (opal_list_item_t *) b_clj);
                break;
            } /* end check for bcol component */
        }
    }

    /* Note: Need to add error checking to make sure all requested functions
     * were found */

    /* release resources */
    opal_argv_free (bcols_requested);

    return OMPI_SUCCESS;
}

static int mca_bcol_base_register(mca_base_register_flag_t flags)
{
    /* figure out which bcol and sbgp components will actually be used */
    /* get list of sub-grouping functions to use */
    ompi_bcol_bcols_string = "basesmuma,basesmuma,iboffload,ptpcoll,ugni";
    (void) mca_base_var_register("ompi", "bcol", "base", "string",
                                 "Default set of basic collective components to use",
                                 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_bcol_bcols_string);

    return OMPI_SUCCESS;
}
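
/*
 * Assuming the standard MCA variable naming scheme
 * (framework_component_variable), the registration above resolves to a
 * parameter named "bcol_base_string", so the default list could be
 * overridden at run time with a hypothetical invocation such as:
 *
 *     mpirun --mca bcol_base_string basesmuma,ptpcoll ./app
 *
 * ("./app" is a placeholder application name, not from the original source.)
 */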

/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int mca_bcol_base_open(mca_base_open_flag_t flags)
{
    int ret;

    /* Open up all available components */
    if (OMPI_SUCCESS !=
        (ret = mca_base_framework_components_open(&ompi_bcol_base_framework, flags))) {
        return ret;
    }

    ret = mca_bcol_base_set_components_to_use(&ompi_bcol_base_framework.framework_components,
                                              &mca_bcol_base_components_in_use);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    /* memory registration compatibilities */
    bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_UMA]=1;
    bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_SHARED_MEMORY_SOCKET]=1;
    bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_POINT_TO_POINT]=1;
    bcol_mpool_compatibility[BCOL_SHARED_MEMORY_UMA][BCOL_IB_OFFLOAD]=1;
    bcol_mpool_compatibility[BCOL_SHARED_MEMORY_SOCKET][BCOL_SHARED_MEMORY_UMA]=1;
    bcol_mpool_compatibility[BCOL_POINT_TO_POINT]   [BCOL_SHARED_MEMORY_UMA]=1;
    bcol_mpool_compatibility[BCOL_IB_OFFLOAD]       [BCOL_SHARED_MEMORY_UMA]=1;

    return OMPI_SUCCESS;
}

static int mca_bcol_base_close (void)
{
    opal_list_item_t *item;

    while (NULL != (item = opal_list_remove_first (&mca_bcol_base_components_in_use))) {
        OBJ_RELEASE(item);
    }

    OBJ_DESTRUCT(&mca_bcol_base_components_in_use);

    return mca_base_framework_components_close(&ompi_bcol_base_framework, NULL);
}

/*
 * Prototype implementation of selection logic
 */
int mca_bcol_base_fn_table_construct(struct mca_bcol_base_module_t *bcol_module){

    int bcol_fn;
    /* Call all init functions */

    /* Create a function table */
    for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){
        /* Create a list object for each bcol type list */
        OBJ_CONSTRUCT(&(bcol_module->bcol_fns_table[bcol_fn]), opal_list_t);
    }

    return OMPI_SUCCESS;
}

int mca_bcol_base_fn_table_destroy(struct mca_bcol_base_module_t *bcol_module){

    int bcol_fn;

    for (bcol_fn = 0; bcol_fn < BCOL_NUM_OF_FUNCTIONS; bcol_fn++){
        /* gvm FIX: Go through the list and destroy each item */
        /* Destroy the function table object for each bcol type list */
        OBJ_DESTRUCT(&(bcol_module->bcol_fns_table[bcol_fn]));
    }

    return OMPI_SUCCESS;
}

int mca_bcol_base_set_attributes(struct mca_bcol_base_module_t *bcol_module,
                                 mca_bcol_base_coll_fn_comm_attributes_t *arg_comm_attribs,
                                 mca_bcol_base_coll_fn_invoke_attributes_t *arg_inv_attribs,
                                 mca_bcol_base_module_collective_fn_primitives_t bcol_fn,
                                 mca_bcol_base_module_collective_fn_primitives_t progress_fn
                                 )
{
    mca_bcol_base_coll_fn_comm_attributes_t *comm_attribs = NULL;
    mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL;
    struct mca_bcol_base_coll_fn_desc_t *fn_filtered = NULL;
    int coll_type;

    comm_attribs = malloc(sizeof(mca_bcol_base_coll_fn_comm_attributes_t));
    if (NULL == comm_attribs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    inv_attribs = malloc(sizeof(mca_bcol_base_coll_fn_invoke_attributes_t));

    if (NULL == inv_attribs) {
        free(comm_attribs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    coll_type = comm_attribs->bcoll_type = arg_comm_attribs->bcoll_type;
    comm_attribs->comm_size_min = arg_comm_attribs->comm_size_min;
    comm_attribs->comm_size_max = arg_comm_attribs->comm_size_max;
    comm_attribs->data_src = arg_comm_attribs->data_src;
    comm_attribs->waiting_semantics = arg_comm_attribs->waiting_semantics;

    inv_attribs->bcol_msg_min = arg_inv_attribs->bcol_msg_min;
    inv_attribs->bcol_msg_max = arg_inv_attribs->bcol_msg_max;
    inv_attribs->datatype_bitmap = arg_inv_attribs->datatype_bitmap;
    inv_attribs->op_types_bitmap = arg_inv_attribs->op_types_bitmap;

    fn_filtered = OBJ_NEW(mca_bcol_base_coll_fn_desc_t);

    fn_filtered->coll_fn = bcol_fn;
    fn_filtered->progress_fn = progress_fn;

    fn_filtered->comm_attr = comm_attribs;
    fn_filtered->inv_attr = inv_attribs;

    opal_list_append(&(bcol_module->bcol_fns_table[coll_type]),(opal_list_item_t*)fn_filtered);

    return OMPI_SUCCESS;
}

int mca_bcol_base_bcol_fns_table_init(struct mca_bcol_base_module_t *bcol_module){

    int ret, bcol_init_fn;

    for (bcol_init_fn = 0; bcol_init_fn < BCOL_NUM_OF_FUNCTIONS; bcol_init_fn++) {
        if (NULL != bcol_module->bcol_function_init_table[bcol_init_fn]) {
            ret = (bcol_module->bcol_function_init_table[bcol_init_fn]) (bcol_module);
            if (OMPI_SUCCESS != ret) {
                return OMPI_ERROR;
            }
        }
    }

    return OMPI_SUCCESS;
}

static void mca_bcol_base_coll_fn_desc_constructor(mca_bcol_base_coll_fn_desc_t *fn)
{
    fn->comm_attr = NULL;
    fn->inv_attr = NULL;
}

static void mca_bcol_base_coll_fn_desc_destructor(mca_bcol_base_coll_fn_desc_t *fn)
{
    if (fn->comm_attr) {
        free(fn->comm_attr);
    }

    if (fn->inv_attr) {
        free(fn->inv_attr);
    }
}

OBJ_CLASS_INSTANCE(mca_bcol_base_coll_fn_desc_t,
                   opal_list_item_t,
                   mca_bcol_base_coll_fn_desc_constructor,
                   mca_bcol_base_coll_fn_desc_destructor);

static void lmngr_block_constructor(mca_bcol_base_lmngr_block_t *item)
{
    item->base_addr = NULL;
}

static void lnmgr_block_destructor(mca_bcol_base_lmngr_block_t *item)
{
    /* I have nothing to do here */
}

OBJ_CLASS_INSTANCE(mca_bcol_base_lmngr_block_t,
                   opal_list_item_t,
                   lmngr_block_constructor,
                   lnmgr_block_destructor);
@@ -1,45 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/include/ompi/constants.h"

int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads)
{
    mca_bcol_base_component_t *bcol_component;
    mca_base_component_list_item_t *cli;
    int ret;

    OPAL_LIST_FOREACH(cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
        bcol_component = (mca_bcol_base_component_t *) cli->cli_component;

        if (false == bcol_component->init_done) {
            ret = bcol_component->collm_init_query(true, true);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }

            bcol_component->init_done = true;
        }
    }

    return OMPI_SUCCESS;
}
@@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained
@@ -1,66 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

sources = \
	bcol_basesmuma.h \
	bcol_basesmuma_utils.h \
	bcol_basesmuma_bcast.c \
	bcol_basesmuma_component.c \
	bcol_basesmuma_module.c \
	bcol_basesmuma_buf_mgmt.c \
	bcol_basesmuma_mem_mgmt.c \
	bcol_basesmuma_fanin.c \
	bcol_basesmuma_fanout.c \
	bcol_basesmuma_progress.c \
	bcol_basesmuma_reduce.h \
	bcol_basesmuma_reduce.c \
	bcol_basesmuma_allreduce.c \
	bcol_basesmuma_setup.c \
	bcol_basesmuma_rd_barrier.c \
	bcol_basesmuma_rd_nb_barrier.c \
	bcol_basesmuma_rk_barrier.c \
	bcol_basesmuma_utils.c \
	bcol_basesmuma_bcast_prime.c \
	bcol_basesmuma_lmsg_knomial_bcast.c \
	bcol_basesmuma_lmsg_bcast.c \
	bcol_basesmuma_gather.c \
	bcol_basesmuma_allgather.c \
	bcol_basesmuma_smcm.h \
	bcol_basesmuma_smcm.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_basesmuma_DSO
component_install += mca_bcol_basesmuma.la
else
component_noinst += libmca_bcol_basesmuma.la
endif

# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.

AM_CPPFLAGS = $(btl_portals_CPPFLAGS)

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_basesmuma_la_SOURCES = $(sources)
mca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS)
mca_bcol_basesmuma_la_LIBADD = \
	$(btl_portals_LIBS)

noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_basesmuma_la_SOURCES = $(sources)
libmca_bcol_basesmuma_la_LDFLAGS = -module -avoid-version $(btl_portals_LDFLAGS)
The diff for this file is not shown because it is too large.
@@ -1,352 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
/*
#define IS_AGDATA_READY(peer, my_flag, my_sequence_number)\
    (((peer)->sequence_number == (my_sequence_number) && \
      (peer)->flags[ALLGATHER_FLAG][bcol_id] >= (my_flag) \
     )? true : false )
*/

#define CALC_ACTIVE_REQUESTS(active_requests, peers, tree_order) \
do{                                                              \
    for( j = 0; j < (tree_order - 1); j++){                      \
        if( 0 > peers[j] ) {                                     \
            /* set the bit */                                    \
            *active_requests ^= (1<<j);                          \
        }                                                        \
    }                                                            \
}while(0)
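
/*
 * A reading of the macro above: entries in the peers array that are
 * negative denote peers that do not exist at a given radix level, so their
 * request bits are pre-set here; the progress loop further below then only
 * waits on the bits that correspond to real peers.
 */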

/*
 * Recursive K-ing allgather
 */

/*
 * Recursive k-ing algorithm
 * Example: k=3, n=9
 *
 * Number of exchange steps = log_k(n)
 * Number of transfers within each exchange step = k (the radix)
 */
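/*
 * Worked instance of the counts above, using the same k=3, n=9 example
 * (illustrative, derived from the loop bounds below): each rank performs
 * log_3(9) = 2 exchange steps, and within each step it exchanges with at
 * most k - 1 = 2 peers (the loop over tree_order - 1), so the full
 * allgather costs about 2 * 2 = 4 pairwise copies per rank.
 */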
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
                                           struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int bcol_id = (int) bcol_module->super.bcol_id;
    uint32_t buffer_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;

    int64_t sequence_number = input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;

    volatile int8_t ready_flag;

    /* initialize the iteration counter */
    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* initialize headers and ready flag */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* initialize these */
    *iteration = -1;
    *active_requests = 0;
    *status = ready_flag;

    if (EXTRA_NODE == exchange_node->node_type) {
        /* I am ready at this level */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
    }

    return bcol_basesmuma_k_nomial_allgather_progress (input_args, const_args);
}


/* allgather progress function */

int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
                                               struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int8_t flag_offset;
    uint32_t buffer_index = input_args->buffer_index;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
    int bcol_id = (int) bcol_module->super.bcol_id;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, idx, buff_idx;

    int i, j, probe;
    int knt;
    int src;
    int recv_offset, recv_len;
    int max_requests = 0; /* critical to set this */
    int pow_k, tree_order;

    int64_t sequence_number = input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;

    int pack_len = input_args->count * input_args->dtype->super.size;

    void *data_addr = (void*)(
        (unsigned char *) input_args->sbuf +
        (size_t) input_args->sbuf_offset);
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *peer_data_pointer;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n", my_rank,
            *active_requests, *iteration, *status);
#endif

    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* increment the starting flag by one and return */
    /* flag offset seems unnecessary here */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
    ready_flag = *status;
    my_ctl_pointer->sequence_number = sequence_number;
    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests:
     * at each level each rank communicates with
     * at most (k - 1) peers, so if we set k - 1
     * bit fields in "max_requests", then we have
     * max_requests == 2^(k - 1) - 1
     */
    for (i = 0; i < (tree_order - 1); i++) {
        max_requests ^= (1<<i);
    }
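
    /* For instance, with radix tree_order = 3 the loop above sets bits 0 and
     * 1, giving max_requests = 0x3 = 2^(3-1) - 1; this is the value that
     * *active_requests reaches once every peer at a level has been copied in.
     */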

    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */

    if (OPAL_UNLIKELY(-1 == *iteration)) {
        if (EXTRA_NODE == exchange_node->node_type) {
            /* If I'm in here, then I must be looking for data */
            ready_flag = flag_offset + 1 + pow_k + 2;

            src = exchange_node->rank_extra_sources_array[0];
            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;

            /* calculate the count */
            for (i = 0, knt = 0 ; i < group_size ; ++i) {
                knt += list_connected[i];
            }

            for (i = 0 ; i < cm->num_to_probe ; ++i) {
                if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
                    /* we receive the entire message */
                    opal_atomic_mb ();
                    memcpy (data_addr, (void *) peer_data_pointer, knt * pack_len);

                    goto FINISHED;
                }
            }

            /* haven't found it, state is saved, bail out */
            return BCOL_FN_STARTED;
        } else if (0 < exchange_node->n_extra_sources) {
            /* I am a proxy for someone */
            src = exchange_node->rank_extra_sources_array[0];
            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;

            /* calculate the offset */
            for (i = 0, knt = 0 ; i < src ; ++i) {
                knt += list_connected[i];
            }

            /* probe for extra rank's arrival */
            for (i = 0 ; i < cm->num_to_probe ; ++i) {
                if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
                    opal_atomic_mb ();
                    /* copy it in */
                    memcpy ((void *) ((uintptr_t) data_addr + knt * pack_len),
                            (void *) ((uintptr_t) peer_data_pointer + knt * pack_len),
                            pack_len * list_connected[src]);
                    break;
                }
            }

            if (i == cm->num_to_probe) {
                return BCOL_FN_STARTED;
            }
        }

        /* bump the ready flag to indicate extra node exchange complete */
        ++ready_flag;
        *iteration = 0;
    }

    /* start the recursive k-ing phase */
    for (i = *iteration ; i < pow_k ; ++i) {
        /* I am ready at this level */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;

        if (0 == *active_requests) {
            /* flip some bits, if we don't have active requests from a previous visit */
            CALC_ACTIVE_REQUESTS(active_requests, exchange_node->rank_exchanges[i], tree_order);
        }

        for (j = 0; j < (tree_order - 1); ++j) {

            /* recv phase */
            src = exchange_node->rank_exchanges[i][j];

            if (src < 0) {
                /* then not a valid rank, continue */
                continue;
            }

            if (!(*active_requests&(1<<j))) {
                /* then this peer hasn't been processed at this level */
                peer_data_pointer = data_buffs[src].payload;
                peer_ctl_pointer = data_buffs[src].ctl_struct;

                recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
                recv_len = exchange_node->payload_info[i][j].r_len * pack_len;

                /* I am putting the probe loop as the innermost loop to achieve
                 * better temporal locality
                 */
                for (probe = 0 ; probe < cm->num_to_probe ; ++probe) {
                    if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)) {
                        /* flip the request's bit */
                        *active_requests ^= (1<<j);
                        /* copy the data */
                        memcpy((void *)((unsigned char *) data_addr + recv_offset),
                               (void *)((unsigned char *) peer_data_pointer + recv_offset),
                               recv_len);
                        break;
                    }
                }
            }
        }

        if (max_requests == *active_requests) {
            /* bump the ready flag */
            ready_flag++;
            /* reset the active requests for the next level; it logically
             * makes sense to do it here, since we don't want to
             * inadvertently flip a bit to zero that we set previously
             */
            *active_requests = 0;
        } else {
            /* state is saved, hop out */
            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
            *iteration = i;
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra */
    if (0 < exchange_node->n_extra_sources) {
        /* simply announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
    }

FINISHED:
    /* bump this up for others to see */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}

/* Register allgather functions to the BCOL function table,
 * so they can be selected
 */
int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_ALLGATHER;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_k_nomial_allgather_init,
                                 bcol_basesmuma_k_nomial_allgather_progress);

    return OMPI_SUCCESS;
}
@@ -1,611 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "opal/include/opal_stdint.h"

#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"

static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_ALLREDUCE;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1048576;
    comm_attribs.data_src = DATA_SRC_KNOWN;

    /* selection logic at the ml level specifies a
     * request for a non-blocking algorithm;
     * however, these algorithms are blocking.
     * Following what was done at the p2p level,
     * we will specify non-blocking, but beware:
     * these algorithms are blocking and will not make use
     * of the progress engine.
     */
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000;
    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    /* Set attributes for fanin fanout algorithm */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout_progress);

    inv_attribs.bcol_msg_min = 20000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout,
                                 bcol_basesmuma_allreduce_intra_fanin_fanout_progress);

    /* Differs only in comm size */

    comm_attribs.data_src = DATA_SRC_UNKNOWN;
    comm_attribs.waiting_semantics = BLOCKING;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 8;

    /* Set attributes for recursive doubling algorithm */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_allreduce_intra_recursive_doubling,
                                 NULL);

    return OMPI_SUCCESS;
}

/*
 * Small data fanin reduce
 * ML buffers are used for both payload and control structures
 * This function works with hierarchical allreduce and the
 * progress engine
 */
static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node,
                                   int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype,
                                   volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift)
{
    volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int64_t sequence_number = my_ctl_pointer->sequence_number;
    int8_t ready_flag = my_ctl_pointer->ready_flag;
    int group_size = bcol_module->colls_no_user_data.size_of_group;

    if (LEAF_NODE != my_reduction_node->my_node_type) {
        volatile char *child_data_pointer;
        volatile void *child_rbuf;

        /* for each child */
        /* my_result_data = child_result_data (op) my_source_data */

        for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) {
            int child_rank = my_reduction_node->children_ranks[child] + process_shift;

            if (group_size <= child_rank) {
                child_rank -= group_size;
            }

            child_ctl_pointer = data_buffs[child_rank].ctl_struct;

            if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
                *iteration = child;
                return BCOL_FN_STARTED;
            }

            child_data_pointer = data_buffs[child_rank].payload;
            child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id];

            ompi_op_reduce(op, (void *)child_rbuf, (void *)rbuf, count, dtype);
        } /* end child loop */
    }

    if (ROOT_NODE != my_reduction_node->my_node_type) {
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
    }

    /* done with this step. move on to fan out */
    *iteration = -1;

    return BCOL_FN_COMPLETE;
}

static int allreduce_fanout (mca_bcol_basesmuma_module_t *bcol_module, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer,
                             volatile void *my_data_pointer, int process_shift, volatile mca_bcol_basesmuma_payload_t *data_buffs,
                             int sequence_number, int group_size, int rbuf_offset, size_t pack_len)
{
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int8_t ready_flag = my_ctl_pointer->ready_flag + 1;
    netpatterns_tree_node_t *my_fanout_read_tree;
    volatile void *parent_data_pointer;
    int my_fanout_parent, my_rank;
    void *parent_rbuf, *rbuf;

    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_rank]);

    if (ROOT_NODE != my_fanout_read_tree->my_node_type) {
        my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
        if (group_size <= my_fanout_parent) {
            my_fanout_parent -= group_size;
        }

        rbuf = (void *)((char *) my_data_pointer + rbuf_offset);

        /*
         * Get parent payload data and control data.
         * Get the pointer to the base address of the parent's payload buffer.
         * Get the parent's control buffer.
         */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;

        parent_rbuf = (void *) ((char *) parent_data_pointer + rbuf_offset);

        /* Wait until parent signals that data is ready */
        /* The order of conditions checked in this loop is important, as it can
         * result in a race condition.
         */
        if (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) {
            return BCOL_FN_STARTED;
        }

        assert (parent_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] == ready_flag);

        /* Copy the data to a shared buffer writable by the current rank */
        memcpy ((void *) rbuf, (const void *) parent_rbuf, pack_len);
    }

    if (LEAF_NODE != my_fanout_read_tree->my_node_type) {
        opal_atomic_wmb ();

        /* Signal to children that they may read the data from my shared buffer (bump the ready flag) */
        my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag;
    }

    my_ctl_pointer->starting_flag_value[bcol_id] += 1;

    return BCOL_FN_COMPLETE;
}

static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int buff_idx = input_args->src_desc->buffer_index;
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
    void *data_addr = (void *) input_args->src_desc->data_addr;
    int my_node_index, my_rank, group_size, leading_dim, idx;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    int64_t sequence_number = input_args->sequence_num;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    struct ompi_datatype_t *dtype = input_args->dtype;
    netpatterns_tree_node_t *my_reduction_node;
    struct ompi_op_t *op = input_args->op;
    volatile void *my_data_pointer;
    int count = input_args->count;
    int rc, process_shift;
    ptrdiff_t lb, extent;
    volatile void *rbuf;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    /* Align node index to around sbgp root */
    process_shift = input_args->root;
    my_node_index = my_rank - input_args->root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
    /* Get control structure and payload buffer */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_data_pointer = (volatile char *) data_addr;
    rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);

    /***************************
     * Fan into root phase
     ***************************/

    my_reduction_node = &(bcol_module->reduction_tree[my_node_index]);
    if (-1 != *iteration) {
        rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer,
                              dtype, data_buffs, count, op, process_shift);
        if (BCOL_FN_COMPLETE != rc) {
            return rc;
        }
    }

    /* there might be a non-contiguous dtype - so compute the length with get_extent */
    ompi_datatype_get_extent(dtype, &lb, &extent);

    /***************************
     * Fan out from root
     ***************************/

    /* all nodes will have the result after fanout */
    input_args->result_in_rbuf = true;

    /* Signal that you are ready for fanout phase */
    return allreduce_fanout (bcol_module, my_ctl_pointer, my_data_pointer, process_shift, data_buffs,
                             sequence_number, group_size, input_args->rbuf_offset, count * (size_t) extent);
}

/**
 * Shared memory blocking allreduce.
 */
int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int buff_idx = input_args->src_desc->buffer_index;
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;
    void *data_addr = (void *) input_args->src_desc->data_addr;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int rc, my_rank, leading_dim, idx;
    volatile void *my_data_pointer;
    volatile void *sbuf, *rbuf;
    int8_t ready_flag;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx;
    /* Get control structure */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    my_data_pointer = (volatile char *) data_addr;
    rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset);
    sbuf = (volatile void *)((char *) my_data_pointer + input_args->sbuf_offset);

    /* Setup resource recycling */
    /* Set for multiple instances of bcols */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, input_args->sequence_num, bcol_id);

    if (sbuf != rbuf) {
        rc = ompi_datatype_copy_content_same_ddt (dtype, input_args->count, (char *)rbuf,
                                                  (char *)sbuf);
        if (0 != rc) {
            return OMPI_ERROR;
        }
    }

    *iteration = 0;
    my_ctl_pointer->ready_flag = ready_flag;

    return bcol_basesmuma_allreduce_intra_fanin_fanout_progress (input_args, c_input_args);
}


/* this thing uses the old bcol private control structures */
int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
                                                      mca_bcol_base_function_t *c_input_args)
{
    int my_rank, group_size, my_node_index;
    int pair_rank, exchange, extra_rank, payload_len;
    size_t dt_size;
    int read_offset, write_offset;
    volatile void *my_data_pointer;
    volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL,
        *partner_ctl_pointer = NULL,
        *extra_ctl_pointer = NULL;
    volatile void *my_read_pointer, *my_write_pointer, *partner_read_pointer,
        *extra_rank_readwrite_data_pointer, *extra_rank_read_data_pointer;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;

    int8_t ready_flag;
    int sbuf_offset, rbuf_offset, flag_offset;
    int root, count;
    struct ompi_op_t *op;
    int64_t sequence_number = input_args->sequence_num;
    struct ompi_datatype_t *dtype;
    int first_instance = 0;
    int leading_dim, idx;
    int buff_idx;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    /*volatile void **data_buffs;*/
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    netpatterns_pair_exchange_node_t *my_exchange_node;

    /*
     * Get addressing information
     */
    buff_idx = input_args->src_desc->buffer_index;

    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    /*
     * Get SM control structures and payload buffers
     */
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;
    /*data_buffs = (volatile void **)
      bcol_module->colls_with_user_data.data_buffs+idx;*/

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /*
     * Get control structure and payload buffer
     */
    my_ctl_pointer = ctl_structs[my_rank];
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }
    my_data_pointer = data_buffs[my_rank].payload;

    /*
     * Align node index to around sbgp root
     */
    root = input_args->root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /*
     * Get data from arguments
     */
    sbuf_offset = input_args->sbuf_offset;
    rbuf_offset = input_args->rbuf_offset;
    op = input_args->op;
    count = input_args->count;
    dtype = input_args->dtype;

    /*
     * Get my node for the reduction tree
     */
    my_exchange_node = &(bcol_module->recursive_doubling_tree);

    if (first_instance) {
        my_ctl_pointer->index = 1;
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
        my_ctl_pointer->flag = -1;
        /*
        for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
            my_ctl_pointer->flags[ALLREDUCE_FLAG] = -1;
        }
        */
    } else {
        my_ctl_pointer->index++;
        flag_offset = my_ctl_pointer->starting_flag_value;
    }

    /* signal that I have arrived */
    /* opal_atomic_wmb (); */
    my_ctl_pointer->sequence_number = sequence_number;

    /* If this buffer is used more than once by an sm module in
     * a given collective, we will need to distinguish between instances, so
     * we pick up the right data.
     */
    ready_flag = flag_offset + sequence_number + 1;

    /*
     * Set up pointers for use during the recursive doubling phase
     */
    read_offset = sbuf_offset;
    write_offset = rbuf_offset;
    fprintf(stderr, "read offset %d write offset %d\n", read_offset, write_offset);
    my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
    my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);

    /*
     * When there is a non-power-of-2 number of nodes, the extra nodes' data is
     * copied and reduced by partner exchange nodes.
     * Extra nodes: nodes with rank greater than the nearest power of 2.
     * Exchange nodes: nodes with rank less than the nearest power of 2 that
     * partner with extra nodes during the reduction.
     */
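    /*
     * Concrete instance (illustrative): with group_size = 6 the nearest power
     * of two is 4, so ranks 4 and 5 act as extra nodes whose data is absorbed
     * by partner exchange nodes before the power-of-two recursive doubling
     * below runs among the remaining four ranks.
     */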

    if (0 < my_exchange_node->n_extra_sources) {
        /*
         * Signal extra node that data is ready
         */
        opal_atomic_wmb ();

        my_ctl_pointer->flag = ready_flag;

        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];
            extra_rank_readwrite_data_pointer = (void *) ((char *) data_buffs[extra_rank].payload +
                                                          read_offset);

            /*
             * Wait for data to get ready
             */
            while (!((sequence_number == extra_ctl_pointer->sequence_number) &&
                     (extra_ctl_pointer->flag >= ready_flag))) {
            }

            ompi_op_reduce(op, (void *)extra_rank_readwrite_data_pointer,
                           (void *)my_read_pointer, count, dtype);
        }
    }

    /* --Exchange node that reduces with an extra node--: signal to the extra
     *   node that its data has been read.
     * --Exchange node that doesn't reduce data with an extra node--: this
     *   assignment is used so it can sync with other nodes during the
     *   exchange phase.
     * --Extra node--: it can pass to the next phase.
     */
    ready_flag++;
    /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
    my_ctl_pointer->flag = ready_flag;

    /*
     * Exchange data with all the nodes that are less than max_power_2
     */
    for (exchange = 0 ; exchange < my_exchange_node->n_exchanges ; exchange++) {
        int tmp = 0;

        /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
        my_ctl_pointer->flag = ready_flag;
        pair_rank = my_exchange_node->rank_exchanges[exchange];
        partner_ctl_pointer = ctl_structs[pair_rank];
        partner_read_pointer = (volatile void *) ((char *)data_buffs[pair_rank].payload + read_offset);

        my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset);
        my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset);

        /*
         * Wait for partner to be ready, so we can read
         */
        /*
        JSL ---- FIX ME !!!!! MAKE ME COMPLIANT WITH NEW BUFFERS
        while (!IS_ALLREDUCE_PEER_READY(partner_ctl_pointer,
                                        ready_flag, sequence_number)) {
        }
        */

        /*
         * Perform reduction operation
         */
        ompi_3buff_op_reduce(op, (void *)my_read_pointer, (void *)partner_read_pointer,
                             (void *)my_write_pointer, count, dtype);

        /*
         * Signal that I am done reading my partner's data
         */
        ready_flag++;
        /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
        my_ctl_pointer->flag = ready_flag;

        while (ready_flag > partner_ctl_pointer->flag) {
            opal_progress();
        }

        /*
         * Swap read and write offsets
         */
        tmp = read_offset;
        read_offset = write_offset;
        write_offset = tmp;
    }

    /*
     * Copy data in from the "extra" source, if need be
     */

    if (0 < my_exchange_node->n_extra_sources) {

        if (EXTRA_NODE == my_exchange_node->node_type) {

            int extra_rank_read_offset = -1, my_write_offset = -1;

            /* Offset the ready flag to sync with the
             * exchange node, which might be going through exchange phases,
             * unlike the extra node
             */
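            /* Numerically (a reading of the flag arithmetic): the exchange
             * partner bumped its flag once per exchange round, so the extra
             * node advances its expected value by log_2 rounds here; e.g.
             * with a group of 6 the partner performed log2(4) = 2 exchanges,
             * and 2 is added below.
             */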
            ready_flag = ready_flag + my_exchange_node->log_2;

            if (my_exchange_node->log_2 % 2) {
                extra_rank_read_offset = rbuf_offset;
                my_write_offset = rbuf_offset;
            } else {
                extra_rank_read_offset = sbuf_offset;
                my_write_offset = sbuf_offset;
            }

            my_write_pointer = (volatile void *)((char *)my_data_pointer + my_write_offset);
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];

            extra_rank_read_data_pointer = (volatile void *) ((char *)data_buffs[extra_rank].payload +
                                                              extra_rank_read_offset);

            /*
             * Wait for the exchange node to be ready
             */
            ompi_datatype_type_size(dtype, &dt_size);
            payload_len = count * dt_size;
#if 0
            fix me JSL !!!!!
            while (!IS_DATA_READY(extra_ctl_pointer, ready_flag, sequence_number)) {
            }
#endif
            memcpy((void *)my_write_pointer, (const void *)
                   extra_rank_read_data_pointer, payload_len);

            ready_flag++;
            /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
            my_ctl_pointer->flag = ready_flag;

        } else {

            /*
             * Signal parent that data is ready
             */
            opal_atomic_wmb ();
            /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/
            my_ctl_pointer->flag = ready_flag;

            /* wait until the child is done to move on - this buffer will
             * be reused for the next stripe, so we don't want to move
             * on too quickly.
             */
            extra_rank = my_exchange_node->rank_extra_source;
            extra_ctl_pointer = ctl_structs[extra_rank];
        }
    }

    input_args->result_in_rbuf = my_exchange_node->log_2 & 1;

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;
}
@@ -1,487 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"

#include "bcol_basesmuma.h"

#define __TEST_BLOCKING__ 1
#define __TEST_WAIT__ 0
#define __TEST_TEST__ 0

/* debug
 * #include "opal/sys/timer.h"
 *
 * extern uint64_t timers[7];
 * end debug */

/* debug */
/* end debug */
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_BCAST;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1048576;
    comm_attribs.data_src = DATA_SRC_KNOWN;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */
    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_bcast_k_nomial_knownroot,
                                 bcol_basesmuma_bcast_k_nomial_knownroot);

    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_bcast_k_nomial_knownroot,
                                 bcol_basesmuma_bcast_k_nomial_knownroot);

    comm_attribs.data_src = DATA_SRC_UNKNOWN;
    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_bcast_k_nomial_anyroot,
                                 bcol_basesmuma_bcast_k_nomial_anyroot);

    comm_attribs.data_src = DATA_SRC_UNKNOWN;
    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */

#ifdef __PORTALS_AVAIL__

    comm_attribs.waiting_semantics = BLOCKING;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_bcast,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_bcast);


    comm_attribs.waiting_semantics = NON_BLOCKING;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast);

    comm_attribs.data_src = DATA_SRC_KNOWN;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast,
                                 bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast);

#else
    /*
    if (super->use_hdl) {
        mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                     bcol_basesmuma_hdl_zerocopy_bcast,
                                     bcol_basesmuma_hdl_zerocopy_bcast);
    } else { */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL);
    /*
                                 bcol_basesmuma_binary_scatter_allgather_segment,
                                 bcol_basesmuma_binary_scatter_allgather_segment);
    */
    /* } */
#endif

    return OMPI_SUCCESS;
}
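
/*
 * Editor's note: an illustrative sketch (not part of the removed sources) of
 * what each mca_bcol_base_set_attributes() call above records: a key built
 * from the collective type, message-size range, data-source knowledge, and
 * waiting semantics, mapped to a (collective function, progress function)
 * pair. The struct below is hypothetical; the real bookkeeping lives in
 * ompi/mca/bcol/base.
 */
struct sketch_bcast_registration {
    int bcoll_type;          /* e.g. BCOL_BCAST */
    int msg_min, msg_max;    /* inv_attribs.bcol_msg_min / bcol_msg_max */
    int data_src;            /* DATA_SRC_KNOWN or DATA_SRC_UNKNOWN */
    int waiting_semantics;   /* BLOCKING or NON_BLOCKING */
    mca_bcol_base_module_collective_fn_primitives_t coll_fn;
    mca_bcol_base_module_collective_fn_primitives_t progress_fn;
};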

/* includes shared memory optimization */

/**
 * Shared memory blocking Broadcast - fanin, for small data buffers.
 * This routine assumes that buf (the input buffer) is a single writer
 * multi reader (SWMR) shared memory buffer owned by the calling rank,
 * which is the only rank that can write to this buffer.
 * It is also assumed that the buffers are registered and fragmented
 * at the ML level and that buf is sufficiently large to hold the data.
 *
 *
 * @param buf - SWMR shared buffer within a sbgp that the
 *              executing rank can write to.
 * @param count - the number of elements in the shared buffer.
 * @param dtype - the datatype of a shared buffer element.
 * @param root - the index within the sbgp of the root.
 * @param module - basesmuma module.
 */
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
                         mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int8_t ready_flag;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int root = input_args->root;
    int64_t sequence_number = input_args->sequence_num;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    int bcol_id = (int) bcol_module->super.bcol_id;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *parent_data_pointer;
    mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    netpatterns_tree_node_t *my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr);

#if 0
    fprintf(stderr,"Entering sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
    fflush(stderr);
#endif


    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Align the node index around the sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if (group_size <= my_fanout_parent) {
        my_fanout_parent -= group_size;
    }

    /* Set the pointer to the current proc's ctrl region */
    /*my_ctl_pointer = ctl_structs[my_rank]; */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* setup resource recycling */

    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /*
     * Fan out from the root
     */
    if (ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;
        /* The root only needs to signal that it is ready */
        my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;

    } else if (LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;
        /*
         * Get the parent's payload data and control data.
         * Get the pointer to the base address of the parent's payload buffer.
         * Get the parent's control buffer.
         */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;

        /* Wait until the parent signals that the data is ready */
        /* The order of the conditions checked in this loop is important, as it
         * can otherwise result in a race condition.
         */
        while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)) {
            opal_progress();
        }

        /* Copy the data into a shared buffer writable by the current rank */
        memcpy(data_addr, (void *)parent_data_pointer, pack_len);

        if (0 != rc) {
            return OMPI_ERROR;
        }

    } else {
        input_args->result_in_rbuf = false;
        /* Interior node */

        /* Get the parent's payload data and control data */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;


        /* Wait until the parent signals that the data is ready */
        /* The order of the conditions checked in this loop is important, as it
         * can otherwise result in a race condition.
         */
        while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)) {
            opal_progress();
        }

        /* Copy the data into a shared buffer writable by the current rank */
        memcpy(data_addr, (void *)parent_data_pointer, pack_len);

        /* Signal to the children that they may read the data from my shared buffer */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
    }

    /* if I am the last instance of a basesmuma function in this collective,
     * release the resources */

    my_ctl_pointer->starting_flag_value[bcol_id]++;

    return rc;
}
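
/*
 * Editor's note: an illustrative sketch (not from the removed sources) of the
 * index rotation used above, which re-maps ranks so that the broadcast root
 * sits at node index 0 and one fan-out tree serves any root.
 */
static inline int sketch_rotate_around_root(int my_rank, int root, int group_size)
{
    int node_index = my_rank - root;
    if (node_index < 0) {
        node_index += group_size;  /* same as (my_rank - root + group_size) % group_size */
    }
    return node_index;
}
/* e.g. with group_size = 8 and root = 5, rank 2 maps to node index 5 and
 * rank 5 (the root) maps to node index 0; parent ranks taken from the tree
 * are shifted back by adding root and wrapping, as done above. */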


/* zero-copy large message communication methods */
#if 0
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                      mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank, first_instance = 0, flag_offset;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int64_t ready_flag;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int root = input_args->root;
    int64_t sequence_number = input_args->sequence_num;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t *my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr);

    struct mca_hdl_base_descriptor_t *hdl_desc;
    struct mca_hdl_base_segment_t *hdl_seg;
    int ret, completed, ridx /* remote rank index */;
    bool status;
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer = NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer = NULL;
    struct mca_hdl_base_module_t *hdl = bcol_module->hdl_module[0];


    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;
    my_ctl_pointer = ctl_structs[my_rank];

    /* Align the node index around the sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if (group_size <= my_fanout_parent) {
        my_fanout_parent -= group_size;
    }

    /* setup resource recycling */
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        /* Signal arrival */
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values, so we only need to
         * set the value for subsequent calls that may need it */
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }


    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    hdl_desc = (mca_hdl_base_descriptor_t *)
        malloc (sizeof (mca_hdl_base_descriptor_t) * 1);

    /* prepare a hdl data segment */
    hdl_seg = (mca_hdl_base_segment_t *)
        malloc ( sizeof (mca_hdl_base_segment_t) * 1);
    hdl_seg->seg_addr.pval = input_args->sbuf;
    hdl_seg->seg_len = pack_len;


    hdl->endpoint->ready_flag = ready_flag;
    hdl->endpoint->local_ctrl = my_ctl_pointer;
    hdl->endpoint->sbgp_contextid =
        bcol_module->super.sbgp_partner_module->group_comm->c_contextid;

    /*
     * Fan out from the root
     */
    if (ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        hdl_desc->isroot = true;

        /* As per the general semantics, there might be multiple pairs of
         * send/recv on the topology tree */
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send error on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    } else if (LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;
        /*
         * Get the parent's payload data and control data.
         * Get the pointer to the base address of the parent's payload buffer.
         * Get the parent's control buffer.
         */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;
        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

#if __TEST_BLOCKING__
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
        ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif

#if __TEST_WAIT__
        ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
        BASESMUMA_VERBOSE(1, ("wait on rank %d is done!", my_rank));
#endif
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi error on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        status = false;
#if __TEST_TEST__
        while (!status) {
            hdl->hdl_test(&hdl_desc, &completed, &status);
            opal_progress();
            BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
        }
#endif

        goto Release;

    } else {
        input_args->result_in_rbuf = false;
        /* Interior node */

        /* Get the parent's payload data and control data */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;

        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recv error on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        /* Signal to the children that they may read the data from my shared buffer */
        opal_atomic_wmb ();
        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;

            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send error on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
        goto Release;
    }

Release:
    /* if I am the last instance of a basesmuma function in this collective,
     * release the resources */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        rc = bcol_basesmuma_free_buff(
            &(bcol_module->colls_with_user_data),
            sequence_number);
    }

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;
exit_ERROR:
    return OMPI_ERROR;
}
#endif
@ -1,895 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "bcol_basesmuma_utils.h"
#include "bcol_basesmuma.h"

/* debug
 * #include "opal/sys/timer.h"
 *
 * extern uint64_t timers[7];
 * end debug */

/* debug */
#include <unistd.h>
/* end debug */

/* includes shared memory optimization */

#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src)          \
    do {                                                                          \
        int j;                                                                    \
        for (j = 0; j < n_src; j++) {                                             \
            parent_ctl_pointer = data_buffs[src_list[j]].ctl_struct;              \
            parent_data_pointer = data_buffs[src_list[j]].payload;                \
            if (IS_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { \
                src = src_list[j];                                                \
                matched = 1;                                                      \
                break;                                                            \
            }                                                                     \
        }                                                                         \
    } while(0)

/*
#define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number)  \
    (((peer)->sequence_number == (my_sequence_number) &&        \
      (peer)->flags[BCAST_FLAG] >= (my_flag)                    \
     ) ? true : false)
*/

/*
#define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \
    (((peer)->sequence_number == (my_sequence_number) &&            \
      (peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag)               \
     ) ? true : false)
*/

#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \
    do {                                                                           \
        int j;                                                                     \
        for (j = 0; j < n_src; j++) {                                              \
            /* fprintf(stderr,"my_rank %d and %d\n",my_rank,1); */                 \
            if (src_list[j] != -1) {                                               \
                parent_ctl_pointer = ctl_structs[src_list[j]];                     \
                parent_data_pointer = (void *) data_buffs[src_list[j]].ctl_struct; \
                /*fprintf(stderr,"my_rank %d ready flag %d partner flag %d and %d\n",my_rank,ready_flag,parent_ctl_pointer->flag,2); */ \
                if (IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, flag_index, bcol_id)) { \
                    src = src_list[j];                                             \
                    matched = 1;                                                   \
                    index = j;                                                     \
                    /* fprintf(stderr,"found it from %d!\n",src); */               \
                    break;                                                         \
                }                                                                  \
            }                                                                      \
        }                                                                          \
    } while(0)

#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \
    do {                                                                                       \
        int relative_rank = (my_group_index >= group_root) ? my_group_index - group_root :     \
            my_group_index - group_root + group_size;                                          \
        radix_mask = 1;                                                                        \
        while (radix_mask < group_size) {                                                      \
            if (relative_rank % (radix * radix_mask)) {                                        \
                data_src = relative_rank / (radix * radix_mask) * (radix * radix_mask) + group_root; \
                if (data_src >= group_size) data_src -= group_size;                            \
                break;                                                                         \
            }                                                                                  \
            radix_mask *= radix;                                                               \
        }                                                                                      \
    } while (0)
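
/*
 * Editor's note: a standalone restatement (not part of the removed sources)
 * of the computation K_NOMIAL_DATA_SRC performs, written as a plain function
 * for readability.
 */
static int sketch_k_nomial_data_src(int radix, int my_group_index,
                                    int group_size, int group_root)
{
    int relative_rank = (my_group_index >= group_root)
        ? my_group_index - group_root
        : my_group_index - group_root + group_size;

    for (int radix_mask = 1; radix_mask < group_size; radix_mask *= radix) {
        if (relative_rank % (radix * radix_mask)) {
            /* round down to the root of my subtree at this level */
            int data_src = relative_rank / (radix * radix_mask)
                           * (radix * radix_mask) + group_root;
            return (data_src >= group_size) ? data_src - group_size : data_src;
        }
    }
    return -1; /* relative rank 0 is the root and has no data source */
}
/* e.g. radix 2, group_size 8, root 0: rank 5 reads from 4, rank 4 from 0 */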

int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
                                            mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    int i, matched = 0;
    int group_size;
    int my_rank;
    int leading_dim,
        buff_idx,
        idx;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int64_t sequence_number = input_args->sequence_num;
    int radix =
        mca_bcol_basesmuma_component.k_nomial_radix;
    int radix_mask;
    int16_t data_src = -1;

    volatile int8_t ready_flag;
    int bcol_id = (int) bcol_module->super.bcol_id;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *parent_data_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;

    size_t pack_len = 0;
    void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
                               input_args->sbuf_offset);

#if 0
    fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
    fflush(stderr);
#endif


    /* we will work only on packed data - so compute the length */
    BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot"));

    pack_len = mca_bcol_base_get_buff_length(dtype, count);
    /* Some hierarchical algorithms accumulate data at each step;
     * this factor accounts for that.
     */
    pack_len = pack_len * input_args->hier_factor;
    buff_idx = input_args->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set the pointer to the current proc's ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* setup resource recycling */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
    /* removing the dependence on the sequence number */
    /* I believe this is resolved now with the signaling flags */
    /*
    ready_temp = 1 + (int8_t) flag_offset + (int8_t) bcol_id;
    if( ready_temp >= my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {
        ready_flag = ready_temp;
    } else {
        ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id];
    }
    opal_atomic_wmb ();
    my_ctl_pointer->sequence_number = sequence_number;
    */


    /* non-blocking broadcast algorithm */

    /* If I am the root, then signal the ready flag */
    if (input_args->root_flag) {
        BASESMUMA_VERBOSE(10, ("I am the root of the data"));
        /*
         * signal the ready flag
         */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;

        /* the root is finished */
        goto Release;
    }


    /* Calculate the source of the data */
    K_NOMIAL_DATA_SRC(radix, my_rank, group_size,
                      input_args->root_route->rank, data_src, radix_mask);


    parent_ctl_pointer = data_buffs[data_src].ctl_struct;
    parent_data_pointer = data_buffs[data_src].payload;

    for (i = 0; i < cs->num_to_probe && 0 == matched; i++) {

        if (IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)) {
            matched = 1;
            break;
        }
    }

    /* If not matched, then hop out and put me on the progress list */
    if (0 == matched) {
        BASESMUMA_VERBOSE(10, ("Shared memory probe didn't find a match"));
        return BCOL_FN_NOT_STARTED;
    }

    /* else, we found our root within the group ... */
    BASESMUMA_VERBOSE(10, ("Shared memory probe was matched, the root is %d", data_src));

    /* copy the data */
    memcpy(data_addr, (void *) parent_data_pointer, pack_len);
    /* set the memory barrier to ensure completion */
    opal_atomic_wmb ();
    /* signal that I am done */
    my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;


Release:
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}


/**
 * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers.
 * This routine assumes that buf (the input buffer) is a single writer
 * multi reader (SWMR) shared memory buffer owned by the calling rank,
 * which is the only rank that can write to this buffer.
 * It is also assumed that the buffers are registered and fragmented
 * at the ML level and that buf is sufficiently large to hold the data.
 *
 *
 * @param buf - SWMR shared buffer within a sbgp that the
 *              executing rank can write to.
 * @param count - the number of elements in the shared buffer.
 * @param dtype - the datatype of a shared buffer element.
 * @param root - the index within the sbgp of the root.
 * @param module - basesmuma module.
 */
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
                                          mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    int i;
    int group_size;
    int my_rank;
    int leading_dim, buff_idx, idx;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int64_t sequence_number = input_args->sequence_num;
    int radix = cs->k_nomial_radix;
    int radix_mask;
    int relative_rank;
    int pow_k_group_size;

    volatile int8_t ready_flag;
    int bcol_id = (int) bcol_module->super.bcol_id;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile void *parent_data_pointer;

    volatile mca_bcol_basesmuma_header_t *child_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;

    size_t pack_len = 0;
    void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr +
                               input_args->sbuf_offset);

#if 0
    fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset);
    fflush(stderr);
#endif



    /* we will work only on packed data - so compute the length */
    pack_len = mca_bcol_base_get_buff_length(dtype, count);

    buff_idx = input_args->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    /* get pow_k_levels and pow_k_group_size */
    pow_k_group_size = bcol_module->pow_k;


    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set the pointer to the current proc's ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* non-blocking broadcast algorithm */

    /* If I am the root, then signal the ready flag */
    if (input_args->root_flag) {

        BASESMUMA_VERBOSE(10, ("I am the root of the data"));
        /*
         * set the radix_mask */
        radix_mask = pow_k_group_size;
        /* send to the children */
        opal_atomic_wmb ();
        BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
                                         radix, 0,
                                         my_rank, group_size, ready_flag);
        /* the root is finished */
        goto Release;
    }

    /* If I am not the root, then poll on possible "senders'" control structs */
    for (i = 0; i < cs->num_to_probe; i++) {

        if (ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) {

            /* we found our root within the group ... */
            parent_data_pointer = data_buffs[my_ctl_pointer->src].payload;
            BASESMUMA_VERBOSE(5, ("%d found it from %d \n", my_rank, my_ctl_pointer->src));
            /* memcpy the data */
            memcpy(data_addr, (void *) parent_data_pointer, pack_len);
            /* compute my relative rank */
            relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? my_rank -
                my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src;

            /* compute my radix mask */
            radix_mask = 1;
            while (radix_mask < group_size) {
                if (0 != relative_rank % (radix * radix_mask)) {
                    /* found it */
                    break;
                }
                radix_mask *= radix;
            }
            /* go one step back */
            radix_mask /= radix;

            /* send to the children */
            opal_atomic_wmb ();
            BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,
                                             radix, relative_rank,
                                             my_rank, group_size, ready_flag);
            /* bail */

            goto Release;
        }

    }



    /* If not matched, then hop out and put me on the progress list */
    BASESMUMA_VERBOSE(10, ("Shared memory probe didn't find a match"));
    /*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/
    return BCOL_FN_NOT_STARTED;



Release:

    my_ctl_pointer->starting_flag_value[bcol_id]++;

    return BCOL_FN_COMPLETE;
}
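
/*
 * Editor's note: an illustrative sketch, under the usual k-nomial tree
 * definition, of how a rank can enumerate its children once radix_mask has
 * been computed as in the routine above. BASESMUMA_K_NOMIAL_SEND_CHILDREN is
 * defined elsewhere in the removed sources; this is an assumed model of the
 * traversal, not its verbatim body.
 */
static void sketch_k_nomial_children(int radix, int radix_mask, int relative_rank,
                                     int root, int group_size,
                                     void (*visit)(int child_rank))
{
    /* walk the masks from my level down to 1; each level owns radix-1 children */
    for (int mask = radix_mask; mask > 0; mask /= radix) {
        for (int r = 1; r < radix; r++) {
            int child = relative_rank + r * mask;
            if (child < group_size) {
                child += root;                       /* undo the rotation around the root */
                if (child >= group_size) child -= group_size;
                visit(child);
            }
        }
    }
}
/* e.g. radix 2, group_size 8, root 0: rank 4 (radix_mask 2) visits 6 and 5 */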


/* non-blocking binary scatter allgather anyroot algorithm for large data
 * broadcast
 */


#if 0
/* prototype code for the shared memory scatter/allgather algorithm. The
 * signaling scheme works, and should be used as a reference for other types
 * of shared memory scatter/allgather algorithms.
 */
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
                                                    mca_bcol_base_function_t *c_input_args)
{

    /* local variables */
    int i, j;
    int length;
    int start;
    int my_rank, parent_rank;
    int partner;
    int src = -1;
    int matched = 0;
    int group_size;
    int first_instance = 0;
    int leading_dim, buff_idx, idx;
    int64_t sequence_number = input_args->sequence_num;

    int64_t ready_flag;
    int64_t local_offset;

    int flag_offset;
    int pow_2, pow_2_levels;
    int index = -1;

    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    /* use the old control structs for large messages,
     * otherwise we will destroy the shared memory
     * optimization
     */
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
    mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer;  /* binomial fanout */
    mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive double */

    /* for now, we use the payload buffer for a single fragment */
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile void *parent_data_pointer;  /* binomial scatter */
    volatile void *partner_data_pointer; /* recursive double */

    uint32_t fragment_size; /* ml buffer size for now */

    /* we will transfer the entire buffer,
     * so start at the base address of the ml buffer
     */
    void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr);
#if 0
    fprintf(stderr,"AAA Entering nb-sm large msg broadcast input_args->frag_size %d \n",input_args->frag_size);
    fflush(stderr);
#endif

    buff_idx = input_args->src_desc->buffer_index;

    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;

    /* get the largest power of two that is smaller than
     * or equal to the group size
     */
    pow_2_levels = bcol_module->pow_2_levels;
    pow_2 = bcol_module->pow_2;

    /* get the fragment size
     */

    /* still just the size of the entire buffer */
    fragment_size = input_args->buffer_size;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;


    /* grab the control structs */
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;

    /* grab the data buffs */
    data_buffs = (mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    my_ctl_pointer = ctl_structs[my_rank];

    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;

        my_ctl_pointer->starting_flag_value = 0;

        flag_offset = 0;

    } else {

        my_ctl_pointer->index++;
    }

    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;

    my_ctl_pointer->sequence_number = sequence_number;

    /* am I the root */
    if (input_args->root_flag) {
        /* if I've already been here, then
         * hop down to the allgather
         */
        if (ALLGATHER == my_ctl_pointer->status) {
            goto Allgather;
        }
        BASESMUMA_VERBOSE(10, ("I am the root of the data"));
        /* debug print */
        /*fprintf(stderr,"I am the root %d\n",my_rank);*/
        /*
         * signal the ready flag
         */
        /* set the offset into the buffer */
        my_ctl_pointer->offset = 0;
        /* how many children do I have */
        my_ctl_pointer->n_sends = pow_2_levels;
        /* my data length */
        my_ctl_pointer->length = fragment_size;

        /* it is important that these be set before my children
         * see the ready flag raised
         */
        opal_atomic_wmb ();
        my_ctl_pointer->flag = ready_flag;

        /* the root is finished */
        if (my_rank < pow_2) {
            /* if I'm in the power of two group,
             * then go to the allgather
             */
            my_ctl_pointer->status = ALLGATHER;
            goto Allgather;

        } else {

            /* if I'm not, then I'm done and release */
            goto Release;
        }

    }

    /* what phase am I participating in
     */
    switch (my_ctl_pointer->status) {

    case SCATTER:
        goto Scatter;
        break;

    case ALLGATHER:
        goto Allgather;
        break;

    case EXTRA_RANK:
        goto Extra;
        break;

    default:
        break;
    }


Extra:
    /* am I part of the non-power-of-2 group */
    if (my_rank >= pow_2) {
        /* find the parent to copy from */
        parent_rank = my_rank & (pow_2 - 1);
        parent_ctl_pointer = ctl_structs[parent_rank];
        /* start at the base */
        parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct;

        /* now, I need to do some arithmetic to
         * arrive at the value everyone else does
         * when they have completed the algorithm
         */

        /* compute the ready flag value to poll on */
        ready_flag = ready_flag + pow_2_levels;

        /* start to poll */
        for (i = 0; i < cs->num_to_probe; i++) {
            if (IS_LARGE_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) {
                /* copy the data and bail */
                memcpy(data_addr, (void *)parent_data_pointer, fragment_size);
                goto Release;
            }
            /*
            else {
                opal_progress();
            }
            */
        }
        my_ctl_pointer->status = EXTRA_RANK;

        /* hop out and put me onto a progress queue */
        return BCOL_FN_NOT_STARTED;
    }

Scatter:

    /* on first entry, compute the list of possible sources */
    if (NULL == my_ctl_pointer->src_ptr) {
        my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int) * (pow_2_levels + 1));

        for (i = 0; i < pow_2_levels; i++) {
            my_ctl_pointer->src_ptr[i] = my_rank ^ (1 << i);
        }
        /* am I participating in the non-power of two */
        if ((my_rank + pow_2) < group_size) {
            /* extra rank that I'm paired with */
            my_ctl_pointer->src_ptr[i] = my_rank + pow_2;
        } else {
            /* no extra rank to worry about */
            my_ctl_pointer->src_ptr[i] = -1;
        }
    }

    /* If I am not the root, then poll on possible "senders'" control structs */
    for (i = 0; i < cs->num_to_probe && 0 == matched; i++) {

        /* Shared memory iprobe */
        BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1,
                                          my_rank, matched, src);
    }

    /* If not matched, then hop out and put me on the progress list */
    if (0 == matched) {

        BASESMUMA_VERBOSE(10, ("Shared memory probe didn't find a match"));

        my_ctl_pointer->status = SCATTER;
        return BCOL_FN_NOT_STARTED;

    } else if (src >= pow_2) {

        /* If matched from an extra rank, then get the whole message from the partner */
        memcpy((void *) data_addr, (void *) parent_data_pointer,
               parent_ctl_pointer->length);

        /* now I am the pseudo-root in the power-of-two group */
        my_ctl_pointer->offset = 0;
        my_ctl_pointer->length = parent_ctl_pointer->length;
        my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends;

        /* set the memory barrier */
        opal_atomic_wmb ();

        /* fire the ready flag */
        my_ctl_pointer->flag = ready_flag;
        my_ctl_pointer->status = ALLGATHER;
        /* go to the allgather */
        goto Allgather;
    }


    /* we need to see whether this is really
     * who we are looking for
     */
    for (i = 0; i < parent_ctl_pointer->n_sends; i++) {
        /* debug print */
        /*
        fprintf(stderr,"I am %d checking on a hit from %d with n_sends %d\n",my_rank,src,parent_ctl_pointer->n_sends);
        fflush(stderr);
        */
        /* end debug */
        if (my_rank == (src ^ (1 << i))) {

            /* we found our root within the group ... */
            BASESMUMA_VERBOSE(10, ("Shared memory probe was matched, the root is %d", src));
            /* this is who I've been looking for */
            my_ctl_pointer->n_sends = i;

            if (i > 0) {
                /* compute the size of the chunk to copy */
                length = (parent_ctl_pointer->length) /
                    (1 << (parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends));
                my_ctl_pointer->length = length;
                my_ctl_pointer->offset =
                    parent_ctl_pointer->offset + length;

                /*fprintf(stderr,"%d's offset %d and length %d \n",my_rank,my_ctl_pointer->offset,length);*/

                /* now we can copy the data */
                memcpy((void *) ((uint64_t) data_addr + my_ctl_pointer->offset),
                       (void *) ((uint64_t) parent_data_pointer + (uint64_t) parent_ctl_pointer->offset +
                                 (uint64_t) length),
                       (size_t)length);
            } else {
                /* this "trick" takes care of the first level
                 * of recursive doubling
                 */
                length = parent_ctl_pointer->length /
                    (1 << (parent_ctl_pointer->n_sends - 1));
                my_ctl_pointer->length = length;
                my_ctl_pointer->offset = parent_ctl_pointer->offset;

                /*fprintf(stderr,"%d's offset %d and length %d\n",my_rank,my_ctl_pointer->offset,length);*/
                /* now we can copy the data */
                memcpy((void *) ((uint64_t) data_addr + my_ctl_pointer->offset),
                       (void *) ((uint64_t) parent_data_pointer + (uint64_t) my_ctl_pointer->offset),
                       (size_t)length);
            }
            /* set the memory barrier to ensure completion */
            opal_atomic_wmb ();
            /* signal that I am done */
            my_ctl_pointer->flag = ready_flag;
            /* set my status */
            my_ctl_pointer->status = ALLGATHER;
            /* time for the allgather phase */
            goto Allgather;
        }

    }

    /* this is not who we are looking for,
     * mark it as a false positive so we don't
     * poll here again
     */
    my_ctl_pointer->src_ptr[index] = -1;
    /* probably we should jump out and put this onto the progress list */
    my_ctl_pointer->status = SCATTER;
    return BCOL_FN_NOT_STARTED;

Allgather:

    /* zip it back up - we have already taken care of the first level */
    /* needed for the non-blocking conditional */
    matched = 0;

    /* get my local_offset */
    local_offset = my_ctl_pointer->offset;

    /* bump the ready flag */
    ready_flag++;

    /* first level of zip up */
    length = 2 * fragment_size / pow_2;

    /* the first level of zip-up
     * already includes the first level of
     * recursive doubling
     */
    start = 1;

    /* for non-blocking, check to see if I need to reset the state */
    if (my_ctl_pointer->flag >= ready_flag) {
        /* then reset the state */
        ready_flag = my_ctl_pointer->flag;
        start = my_ctl_pointer->start;
        /* get the local offset */
        local_offset = my_ctl_pointer->offset_zip;
        /* compute the correct length */
        length = length * (1 << (start - 1));
        /* careful! skip over the opal_atomic_wmb () to avoid the
         * cost on every re-entry
         */
        goto Loop;
    }


    opal_atomic_wmb ();
    /* I am ready, set the flag */
    my_ctl_pointer->flag = ready_flag;

Loop:

    for (i = start; i < pow_2_levels; i++) {
        /* get my partner for this level */
        partner = my_rank ^ (1 << i);
        partner_ctl_pointer = ctl_structs[partner];
        partner_data_pointer = (void *) data_buffs[partner].ctl_struct;

        /* is the data ready */
        for (j = 0; j < cs->num_to_probe && matched == 0; j++) {
            if (IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) {

                /* debug prints
                fprintf(stderr,"666 I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d partner_offset %d\n",
                        my_rank,sequence_number,partner, ready_flag,partner_ctl_pointer->flag,buff_idx,partner_ctl_pointer->offset);
                */
                /* debug print */
#if 0
                fprintf(stderr,"I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d \n",
                        my_rank,sequence_number,partner, ready_flag,parent_ctl_pointer->flag,buff_idx);
#endif
                /* end debug prints */

                assert(partner_ctl_pointer->flag >= ready_flag);
                /* found it */
                matched = 1;
                /* only copy it if you sit at a lower level in the tree */
                if (my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends) {

                    /* calculate the local offset based on the partner's remote offset */
                    if (partner_ctl_pointer->offset < my_ctl_pointer->offset) {
                        /* then I'm looking "up" the tree */
                        local_offset -= length;
                        /* debug print */
                        /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/
                        /* end debug */
                        memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
                               (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
                               length);
                    } else {
                        /* I'm looking "down" the tree */
                        local_offset += length;
                        /* debug print */
                        /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/
                        /* end debug */
                        memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset),
                               (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset),
                               length);
                        /* reset my local offset */
                        local_offset -= length;
                    }

                }
                /* bump the ready flag */
                ready_flag++;
                /* ensure completion */
                opal_atomic_wmb ();

                /* fire the flag for the next level */
                my_ctl_pointer->flag = ready_flag;

                /* double the length */
                length *= 2;
            }
        }
        /* check to see what kind of progress I've made */
        if (0 == matched) {
            /* save state, hop out, and try again later */
            my_ctl_pointer->start = i;
            /* save the local offset */
            my_ctl_pointer->offset_zip = local_offset;
            /* put it in the progress queue */
            return BCOL_FN_STARTED;
        }
        /* else, start the next level of recursive doubling */
        matched = 0;

    }


    /* cleanup */
    if (NULL != my_ctl_pointer->src_ptr) {
        free(my_ctl_pointer->src_ptr);
        my_ctl_pointer->src_ptr = NULL;
    }

Release:

    /* If I am the last instance, release the resource */
    /*
    if( IS_LAST_BCOL_FUNC(c_input_args)) {
        rc = bcol_basesmuma_free_buff(
                 &(bcol_module->colls_with_user_data),
                 sequence_number);
    }
    */

    my_ctl_pointer->starting_flag_value++;
    my_ctl_pointer->status = FINISHED;
    return BCOL_FN_COMPLETE;

}
#endif
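
/*
 * Editor's note: a worked example (not from the removed sources) of the
 * scatter/allgather bookkeeping above, assuming pow_2 = 8 ranks and a
 * 1024-byte fragment. During the scatter, a rank matched at level i keeps
 * length = parent_length / 2^(parent_n_sends - n_sends); during the zip-up,
 * the first exchange uses length = 2 * 1024 / 8 = 256 bytes and the length
 * doubles at each remaining level (256, 512), reassembling the full
 * kilobyte after log2(8) = 3 exchanges.
 */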

#if 0
int mca_bcol_basesmuma_bcast_binomial_scatter_allgather(void *desc)
{
    /* local variables */
    int rc, n_frags_sent;
    uint32_t stripe_number;
    int count, count_processed;
    size_t dt_size;
    uint32_t n_data_segments_to_schedule;
    ompi_datatype_t *dtype;
    message_descriptor_t *message_descriptor;
    mca_bcol_basesmuma_module_t *bcol_module;
    int pipe_depth;


    /* get the full message descriptor */


    /* compute the number of fragments to send */


    /* start to fill the pipeline */


    return OMPI_SUCCESS;

}
#endif
@ -1,486 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014      Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/comm/coll_ops.h"

#include "opal/dss/dss.h"

#include "bcol_basesmuma.h"
/*
 * With support for nonblocking collectives, we don't have an upper
 * limit on the number of outstanding collectives per communicator.
 * Also, since we want to avoid communication to figure out which
 * buffers other ranks in the group will use, we rely on the
 * fact that collective operations are called in the same order
 * in each process to assign a unique ID to each collective operation.
 * We use this to create a static mapping from the index to the buffer
 * that will be used. Also, because there is no limit to the number of
 * outstanding collective operations, we use a generation index for each
 * memory bank, so the collective will use the buffer only when the
 * correct generation of the bank is ready for use.
 */
int bcol_basesmuma_get_buff_index( sm_buffer_mgmt *buff_block,
                                   uint64_t buff_id )
{
    /* local variables */
    int memory_bank;
    uint64_t generation;
    int index = -1;


    /* get the bank index that will be used */
    memory_bank = buff_id & buff_block->mask;
    memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank;

    /* get the generation of the bank this maps to */
    generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs);

    /* check to see if the bank is available */
    if( generation == buff_block->ctl_buffs_mgmt[memory_bank].
        bank_gen_counter ) {

        /* get the buffer index that will be returned */
        index = buff_id & buff_block->mask;

        /* no in-use counter increment, as the mapping is static, and
         * all we need to know is the number of collectives that complete */

    } else {
        /* progress communications so that resources can be freed up */
        opal_progress();
    }

    /* return */
    return index;
}
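
/*
 * Editor's note: a worked example (not from the removed sources) of the index
 * arithmetic above, assuming 4 banks of 32 buffers each, i.e.
 * log2_number_of_buffs = 7, log2_num_buffs_per_mem_bank = 5 and
 * mask = (1 << 7) - 1. Plain shifts stand in for the SHIFT_DOWN macro.
 */
#include <stdint.h>
#include <stdio.h>

int sketch_buff_index_example(void)
{
    const uint64_t mask = (1u << 7) - 1;   /* 128 buffers in total */
    const int log2_per_bank = 5;           /* 32 buffers per bank */
    const int log2_num_buffs = 7;

    uint64_t buff_id    = 300;                          /* third pass over the buffers */
    uint64_t index      = buff_id & mask;               /* 300 & 127 = 44 */
    uint64_t bank       = index >> log2_per_bank;       /*  44 >> 5  = 1  */
    uint64_t generation = buff_id >> log2_num_buffs;    /* 300 >> 7  = 2  */

    printf("buffer %llu in bank %llu, generation %llu\n",
           (unsigned long long) index, (unsigned long long) bank,
           (unsigned long long) generation);
    return 0;
}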

/* release the shared memory buffers
 * buf_id is the unique ID assigned to the particular buffer
 */
int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block,
                              uint64_t buff_id )
{
    /* local variables */
    int ret = OMPI_SUCCESS;
    int memory_bank;
    uint64_t generation;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /* get the bank index that will be used */
    memory_bank = buff_id & buff_block->mask;
    memory_bank = memory_bank SHIFT_DOWN buff_block->log2_num_buffs_per_mem_bank;

    /* get the generation of the bank this maps to */
    generation = buff_id SHIFT_DOWN (buff_block->log2_number_of_buffs);

    /* the generation counter should not change until all resources
     * associated with this bank have been freed.
     */
    assert(generation == buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter);
    (void)generation; // silence compiler warning

    /*
     * increment the counter of completed buffers
     */
    OPAL_THREAD_ADD32(&(buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed),
                      1);

    /*
     * If I am the last to check in - initiate resource recycling
     */
    if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed ==
        buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) {

        /* Lock to ensure atomic recycling of resources */
        OPAL_THREAD_LOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));

        /* make sure someone else did not already get to this */
        if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed !=
            buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) {
            /* release the lock and exit */
            OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
        } else {
            sm_nbbar_desc_t *p_sm_nb_desc = NULL;
            /* initiate the freeing of resources. Need to make sure the other
             * ranks in the group are also done with their resources before this
             * block is made available for use again.
             * No one else will try to allocate from this block or free back to
             * this block until the next generation counter has been incremented,
             * so we will just reset the number of freed buffers to 0, so no one
             * else will try to also initialize the recycling of these resources
             */
            buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed = 0;

            /* Start the nonblocking barrier */
            p_sm_nb_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
            p_sm_nb_desc->coll_buff = buff_block;
            bcol_basesmuma_rd_nb_barrier_init_admin(p_sm_nb_desc);

            if( NB_BARRIER_DONE !=
                buff_block->ctl_buffs_mgmt[memory_bank].
                nb_barrier_desc.collective_phase) {

                opal_list_t *list = &(cs->nb_admin_barriers);
                opal_list_item_t *append_item;

                /* put this onto the progression list */
                OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex));
                append_item = (opal_list_item_t *)
                    &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
                opal_list_append(list, append_item);
                OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex));
                /* progress communications so that resources can be freed up */
                opal_progress();
            } else {
                /* mark the block as available */
                (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++;
            }

            /* get out of here */
            OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
        }

    }

    /* return */
    return ret;
}

/*
 * Allocate buffers for storing non-blocking collective descriptions, required
 * for making the code re-entrant
 *
 */
static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
                                  void *base_addr, uint32_t num_banks,
                                  uint32_t num_buffers_per_bank,
                                  uint32_t size_buffer,
                                  uint32_t header_size,
                                  int group_size,
                                  int pow_k)
{
    uint32_t i, j, ci;
    mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL;
    int k_nomial_radix = mca_bcol_basesmuma_component.k_nomial_radix;
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    int num_to_alloc = (k_nomial_radix - 1) * pow_k_val * 2 + 1;


    *desc = (mca_bcol_basesmuma_nb_coll_buff_desc_t *)calloc(num_banks * num_buffers_per_bank, sizeof(mca_bcol_basesmuma_nb_coll_buff_desc_t));
    if (NULL == *desc) {
        return OMPI_ERROR;
    }

    tmp_desc = *desc;

    for (i = 0; i < num_banks; i++) {
        for (j = 0; j < num_buffers_per_bank; j++) {
            ci = i * num_buffers_per_bank + j;
            tmp_desc[ci].bank_index = i;
            tmp_desc[ci].buffer_index = j;
            /* *2 is for the gather session, +1 for the extra peer */
            tmp_desc[ci].requests = (ompi_request_t **)
                calloc(num_to_alloc, sizeof(ompi_request_t *));
            tmp_desc[ci].data_addr = (void *)
                ((unsigned char*)base_addr + ci * size_buffer + header_size);
            BASESMUMA_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr));
        }
    }

    return OMPI_SUCCESS;
}


/*
 * Free buffers for storing non-blocking collective descriptions.
 *
 */
void cleanup_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
                               uint32_t num_banks,
                               uint32_t num_buffers_per_bank)
{
    uint32_t ci;
    if (NULL != *desc) {
        for (ci = 0; ci < num_banks * num_buffers_per_bank; ci++) {
            if (NULL != ((*desc)[ci]).requests) {
                free(((*desc)[ci]).requests);
                ((*desc))[ci].requests = NULL;
            }
        }
        free(*desc);
        *desc = NULL;
    }
}
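
/*
 * Editor's note: a hypothetical usage sketch (not from the removed sources)
 * pairing the allocation and cleanup helpers above; all argument values are
 * illustrative, and it compiles only inside this translation unit, where
 * init_nb_coll_buff_desc() is visible.
 */
static int sketch_setup_and_teardown(void *base_addr, int group_size)
{
    mca_bcol_basesmuma_nb_coll_buff_desc_t *descs = NULL;
    const uint32_t num_banks = 2, num_buffers_per_bank = 16;
    const uint32_t size_buffer = 8192, header_size = 64;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&descs, base_addr, num_banks,
                                               num_buffers_per_bank, size_buffer,
                                               header_size, group_size,
                                               /* pow_k (assumed) */ 3)) {
        return OMPI_ERROR;
    }

    /* descriptors are indexed as bank_index * num_buffers_per_bank + buffer_index */

    cleanup_nb_coll_buff_desc(&descs, num_banks, num_buffers_per_bank);
    return OMPI_SUCCESS;
}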


#if 1
/* New init function used for the new control scheme where we put the control
 * struct at the top of the payload buffer
 */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
                                  uint32_t data_offset,
                                  mca_bcol_base_module_t *bcol_module,
                                  void *reg_data)
{
    /* the assumption here is that the block has been registered with
     * the sm bcol and hence has been mapped by each process; we need to
     * be sure that the memory is mapped amongst the sm peers
     */

    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    sm_buffer_mgmt *pload_mgmt;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;
    int leading_dim, loop_limit, buf_id;
    unsigned char *base_ptr;
    mca_bcol_basesmuma_module_t *sm_bcol_module =
        (mca_bcol_basesmuma_module_t *)bcol_module;
    int my_idx, array_id;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    void **results_array = NULL, *mem_offset;

    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block
     */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers;
     * the difference here is that now we use our new data struct
     */
    malloc_size = ml_block->num_banks * ml_block->num_buffers_per_bank *
        pload_mgmt->size_of_group * sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if( !pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
    if (NULL == results_array) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* setup the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' file
     */
    ret = bcol_basesmuma_smcm_allgather_connection(
        sm_bcol,
        sm_bcol->super.sbgp_partner_module,
        &(cs->sm_connections_list),
        &(sm_bcol->payload_backing_files_info),
        sm_bcol->super.sbgp_partner_module->group_comm,
        input_file, cs->payload_base_fname,
        false);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }


    /* now we exchange the offset info - don't assume symmetric virtual memory
     */

    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
|
||||
sm_bcol_module->super.sbgp_partner_module->my_index,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_size,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_list,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_comm);
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
goto exit_ERROR;
|
||||
}
|
||||
|
||||
/* convert memory offset to virtual address in current rank */
|
||||
leading_dim = pload_mgmt->size_of_group;
|
||||
loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank;
|
||||
for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {
|
||||
|
||||
/* get the base pointer */
|
||||
int array_id=SM_ARRAY_INDEX(leading_dim,0,i);
|
||||
if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
|
||||
/* me */
|
||||
base_ptr=cs->sm_payload_structs->map_addr;
|
||||
} else {
|
||||
base_ptr=sm_bcol_module->payload_backing_files_info[i]->
|
||||
sm_mmap->map_addr;
|
||||
}
|
||||
|
||||
/* first, set the pointer to the control struct */
|
||||
pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
|
||||
(uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
|
||||
/* second, calculate where to set the data pointer */
|
||||
pload_mgmt->data_buffs[array_id].payload=(void *)
|
||||
(uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
|
||||
(uint64_t)(uintptr_t) data_offset);
|
||||
|
||||
for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
|
||||
int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
|
||||
array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
|
||||
/* now, play the same game as above
|
||||
*
|
||||
* first, set the control struct's position */
|
||||
pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
|
||||
(uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
|
||||
(uint64_t)(uintptr_t)ml_block->size_buffer));
|
||||
|
||||
/* second, set the payload pointer */
|
||||
pload_mgmt->data_buffs[array_id].payload =(void *)
|
||||
(uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
|
||||
(uint64_t)(uintptr_t) data_offset);
|
||||
}
|
||||
|
||||
}
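    /* Illustrative layout sketch (not from the original source): for each
     * peer i, buffer buf_id resolves to
     *     ctl_struct(i, buf_id) = map_addr(i) + peer_offset(i) + buf_id * size_buffer
     *     payload(i, buf_id)    = ctl_struct(i, buf_id) + data_offset
     * i.e. the control header sits at the top of each payload buffer and the
     * user-visible payload starts data_offset bytes into it. */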

    /* done with the index array */
    free(results_array);
    results_array = NULL;

    /* initialize my control structures */
    my_idx = sm_bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size;
    for (buf_id = 0; buf_id < loop_limit; buf_id++) {
        array_id = SM_ARRAY_INDEX(leading_dim, buf_id, my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for (j = 0; j < SM_BCOLS_MAX; j++) {
            for (i = 0; i < NUM_SIGNAL_FLAGS; i++) {
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }

    /* set up the data structures needed for releasing the payload
     * buffers back to the ml level
     */
    for (i = 0; i < (int) ml_block->num_banks; i++) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor =
            ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to the ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol_module->super.sbgp_partner_module->group_size,
                                               sm_bcol_module->pow_k)) {

        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing the state of non-blocking collectives\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    if (NULL != results_array) {
        free(results_array);
    }
    return ret;
}

#endif


/* Basesmuma interface function used for buffer release */
#if 0
/* gvm
 * A collective operation calls this routine to release the payload buffer.
 * All processes in the shared memory sub-group of a bcol should call the non-blocking
 * barrier on the last payload buffer of a memory bank. On the completion
 * of the non-blocking barrier, the ML callback is called, which is responsible
 * for recycling the memory bank.
 */
mca_bcol_basesmuma_module_t *sm_bcol_module;

int bcol_basesmuma_free_payload_buff(
    struct mca_bcol_base_memory_block_desc_t *block,
    sm_buffer_mgmt *ctl_mgmt,
    uint64_t buff_id)
{
    /* local variables */
    int ret = OMPI_SUCCESS;

    memory_bank = BANK_FROM_BUFFER_IDX(buff_id);
    ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed++;

    OPAL_THREAD_ADD32(&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed), 1);

    if (ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed == block->size_buffers_bank) {

        /* start the non-blocking barrier */
        bcol_basesmuma_rd_nb_barrier_init_admin(
            &(ctl_mgmt->ctl_buffs_mgmt[memory_bank].nb_barrier_desc));

        if (NB_BARRIER_DONE !=
            ctl_mgmt->ctl_buffs_mgmt[memory_bank].
            nb_barrier_desc.collective_phase) {

            /* progress the barrier */
            opal_progress();
        } else {
            /* free the buffer - i.e. initiate the callback to the ml level */
            block->ml_release_cb(block, memory_bank);
        }
    }
    return ret;
}
#endif
@ -1,380 +0,0 @@

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/align.h"
#include "bcol_basesmuma.h"

/*
 * Public string showing the coll ompi_sm V2 component version number
 */
const char *mca_bcol_basesmuma_component_version_string =
    "Open MPI bcol - basesmuma collective MCA component version " OMPI_VERSION;

/*
 * Local functions
 */

static int basesmuma_register(void);
static int basesmuma_open(void);
static int basesmuma_close(void);
static int mca_bcol_basesmuma_deregister_ctl_sm(
    mca_bcol_basesmuma_component_t *bcol_component);


static inline int mca_bcol_basesmuma_param_register_int(
    const char *param_name, int default_value, int *storage)
{
    *storage = default_value;
    return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name,
                                           NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, storage);
}

static inline int mca_bcol_basesmuma_param_register_bool(
    const char *param_name, bool default_value, bool *storage)
{
    *storage = default_value;
    return mca_base_component_var_register(&mca_bcol_basesmuma_component.super.bcol_version, param_name,
                                           NULL, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, storage);
}

/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */

mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component = {

    /* First, fill in the super */

    {
        /* First, the mca_component_t struct containing meta
           information about the component itself */

        .bcol_version = {
            MCA_BCOL_BASE_VERSION_2_0_0,

            /* Component name and version */

            .mca_component_name = "basesmuma",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component open and close functions */

            .mca_open_component = basesmuma_open,
            .mca_close_component = basesmuma_close,
            .mca_register_component_params = basesmuma_register,
        },

        /* Initialization / querying functions */

        .collm_init_query = mca_bcol_basesmuma_init_query,
        .collm_comm_query = mca_bcol_basesmuma_comm_query,
        .init_done = false,
        .need_ordering = false,
        .priority = 0, /* (default) priority */
    },
};

/*
 * Register the component
 */
static int basesmuma_register(void)
{
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /* set component priority */
    mca_bcol_basesmuma_param_register_int("priority", 90, &cs->super.priority);

    /* Number of memory banks */
    mca_bcol_basesmuma_param_register_int("basesmuma_num_ctl_banks", 2,
                                          &cs->basesmuma_num_mem_banks);

    /* Number of regions per memory bank */
    mca_bcol_basesmuma_param_register_int("basesmuma_num_buffs_per_bank", 16,
                                          &cs->basesmuma_num_regions_per_bank);

    /* number of polling loops to allow pending resources to
     * complete their work
     */
    mca_bcol_basesmuma_param_register_int("n_poll_loops", 4, &cs->n_poll_loops);

    /* Number of groups supported */
    mca_bcol_basesmuma_param_register_int("n_groups_supported", 100,
                                          &cs->n_groups_supported);

    /* order of fanin tree */
    mca_bcol_basesmuma_param_register_int("radix_fanin", 2, &cs->radix_fanin);

    /* order of fanout tree */
    mca_bcol_basesmuma_param_register_int("radix_fanout", 2, &cs->radix_fanout);

    /* order of read tree */
    mca_bcol_basesmuma_param_register_int("radix_read_tree", 3,
                                          &cs->radix_read_tree);

    /* order of reduction fanout tree */
    mca_bcol_basesmuma_param_register_int("order_reduction_tree", 2,
                                          &cs->order_reduction_tree);

    /* k-nomial radix */
    mca_bcol_basesmuma_param_register_int("k_nomial_radix", 3, &cs->k_nomial_radix);

    /* number of polling loops for non-blocking algorithms */
    mca_bcol_basesmuma_param_register_int("num_to_probe", 10, &cs->num_to_probe);

    /* radix of the k-ary scatter tree */
    mca_bcol_basesmuma_param_register_int("scatter_kary_radix", 4,
                                          &cs->scatter_kary_radix);

    /* register parameters controlling message fragmentation */
    mca_bcol_basesmuma_param_register_int("min_frag_size", getpagesize(),
                                          &cs->super.min_frag_size);
    mca_bcol_basesmuma_param_register_int("max_frag_size", FRAG_SIZE_NO_LIMIT,
                                          &cs->super.max_frag_size);

    /* by default use pre-registered shared memory segments */
    /* RLG NOTE: When we have a systematic way to handle single memory
     * copy semantics, we need to update this logic
     */
    mca_bcol_basesmuma_param_register_bool("can_use_user_buffers", false,
                                           &cs->super.can_use_user_buffers);

    mca_bcol_basesmuma_param_register_int("verbose", 0, &cs->verbose);

    return OMPI_SUCCESS;
}
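/* Illustrative usage note (not from the original source): parameters
 * registered this way normally surface as MCA variables following the usual
 * framework_component_param naming convention, so a run could, for example,
 * tune the component with something like
 *     mpirun --mca bcol_basesmuma_priority 90 \
 *            --mca bcol_basesmuma_num_to_probe 20 ./a.out
 * The exact variable names follow from the registrations above. */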

/*
 * Open the component
 */
static int basesmuma_open(void)
{
    /* local variables */
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    int ret = OMPI_SUCCESS;
    opal_mutex_t *mutex_ptr;
    int dummy;

    /*
     * Make sure that the number of banks is a power of 2
     */
    cs->basesmuma_num_mem_banks =
        ompi_roundup_to_power_radix(2, cs->basesmuma_num_mem_banks, &dummy);
    if (0 == cs->basesmuma_num_mem_banks) {
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    /*
     * Make sure that the number of buffers is a power of 2
     */
    cs->basesmuma_num_regions_per_bank =
        ompi_roundup_to_power_radix(2, cs->basesmuma_num_regions_per_bank, &dummy);
    if (0 == cs->basesmuma_num_regions_per_bank) {
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    /* Portals initialization */
    cs->portals_init = false;
    cs->portals_info = NULL;

    /*
     * initialization
     */
    cs->sm_ctl_structs = NULL;
    OBJ_CONSTRUCT(&(cs->sm_connections_list), opal_list_t);
    OBJ_CONSTRUCT(&(cs->nb_admin_barriers), opal_list_t);
    mutex_ptr = &(cs->nb_admin_barriers_mutex);
    OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);

    /* Control structures object construct */
    OBJ_CONSTRUCT(&(cs->ctl_structures), opal_list_t);

    /* shared memory has not been registered yet */
    cs->mpool_inited = false;

    /* initialize base file names */
    cs->clt_base_fname = "sm_ctl_mem_";
    cs->payload_base_fname = "sm_payload_mem_";

    /* initialize the size of the shared memory scratch region */
    cs->my_scratch_shared_memory_size = getpagesize();
    cs->my_scratch_shared_memory = NULL;
    cs->scratch_offset_from_base_ctl_file = 0;

    /*
     * register the progress function
     */
    ret = opal_progress_register(bcol_basesmuma_progress);
    if (MPI_SUCCESS != ret) {
        opal_output(ompi_bcol_base_framework.framework_output, "failed to register the progress function");
    }

    return ret;

exit_ERROR:
    return ret;
}

/*
 * release the control structure backing file
 */
static int mca_bcol_basesmuma_deregister_ctl_sm(mca_bcol_basesmuma_component_t *bcol_component)
{
    if (NULL != bcol_component->sm_ctl_structs) {
        OBJ_RELEASE(bcol_component->sm_ctl_structs);
    }

    return OMPI_SUCCESS;
}


/*
 * Close the component
 */
static int basesmuma_close(void)
{
    int ret;
    bcol_basesmuma_registration_data_t *net_ctx;
    bcol_base_network_context_t *net_reg;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /* gvm Leak FIX */
    OPAL_LIST_DESTRUCT(&cs->ctl_structures);

    /* deregister the progress function */
    ret = opal_progress_unregister(bcol_basesmuma_progress);
    if (MPI_SUCCESS != ret) {
        opal_output(ompi_bcol_base_framework.framework_output, "failed to unregister the progress function");
    }

    /* remove the control structure backing file */
    ret = mca_bcol_basesmuma_deregister_ctl_sm(&mca_bcol_basesmuma_component);
    if (MPI_SUCCESS != ret) {
        opal_output(ompi_bcol_base_framework.framework_output, "failed to remove the control structure backing file");
    }

    /* remove the network contexts - only one network context is defined for
     * this component.
     */
    /* file_name is returned by asprintf, so we need to free the resource */
    if (mca_bcol_basesmuma_component.super.network_contexts) {
        net_reg = (bcol_base_network_context_t *)
            mca_bcol_basesmuma_component.super.network_contexts[0];
        if (net_reg) {
            net_ctx = (bcol_basesmuma_registration_data_t *)net_reg->context_data;
            if (net_ctx) {
                if (net_ctx->file_name) {
                    free(net_ctx->file_name);
                }
                free(net_ctx);
            }
            free(net_reg);
        }
        free(mca_bcol_basesmuma_component.super.network_contexts);
        mca_bcol_basesmuma_component.super.network_contexts = NULL;
    }

    /* normal return */
    return OMPI_SUCCESS;
}

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
                                  bool enable_mpi_threads)
{
    /* done */
    return OMPI_SUCCESS;
}

/* This routine is used to allocate shared memory for the shared
 * memory control regions.
 */
int mca_bcol_basesmuma_allocate_sm_ctl_memory(mca_bcol_basesmuma_component_t *cs)
{
    /* local variables */
    int name_length, ret = OMPI_SUCCESS;
    size_t ctl_length;
    char *name;
    size_t page_size = getpagesize();

    /* set the file name */
    name_length = asprintf(&name,
                           "%s"OPAL_PATH_SEP"%s""%0d",
                           ompi_process_info.job_session_dir,
                           cs->clt_base_fname,
                           (int)getpid());
    if (0 > name_length) {
        return OMPI_ERROR;
    }
    /* make sure the name is not too long */
    if (OPAL_PATH_MAX < (name_length - 1)) {
        free(name);
        return OMPI_ERROR;
    }

    /* compute the segment length */
    ctl_length = (cs->basesmuma_num_mem_banks *
                  cs->basesmuma_num_regions_per_bank + cs->basesmuma_num_mem_banks)
        * sizeof(mca_bcol_basesmuma_ctl_struct_t) * cs->n_groups_supported;
    /* we need two banks of memory per group - for algorithms that have
     * user payload, and those that don't
     */
    ctl_length *= 2;

    /* add space for internal library management purposes */
    ctl_length += cs->my_scratch_shared_memory_size;

    /* round up to a multiple of the page size */
    ctl_length = OPAL_ALIGN(ctl_length, page_size, size_t);
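    /* Illustrative arithmetic (not from the original source): with the
     * registered defaults (2 banks, 16 regions per bank, 100 groups) the
     * sizing above works out to
     *     (2 * 16 + 2) * sizeof(mca_bcol_basesmuma_ctl_struct_t) * 100 * 2
     * plus one page of scratch space, rounded up to a page-size multiple
     * by OPAL_ALIGN. */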

    /* allocate the shared file */
    cs->sm_ctl_structs = bcol_basesmuma_smcm_mem_reg(NULL, ctl_length, getpagesize(), name);
    if (!cs->sm_ctl_structs) {
        opal_output(ompi_bcol_base_framework.framework_output,
                    "In mca_bcol_basesmuma_allocate_sm_ctl_memory failed to allocate the backing file %s\n", name);
        ret = OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* free the memory allocated by asprintf for the file name -
     * in mca_base_smcm_mem_reg this name is copied into a new
     * memory location */
    free(name);

    /* successful return */
    return ret;
}
@ -1,218 +0,0 @@

/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/* Recursive doubling blocking barrier */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/sys/atomic.h"

#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"

/********************************************************************************/
/********************************** New Fan-In **********************************/
/********************************************************************************/

static int bcol_basesmuma_fanin_new(bcol_function_args_t *input_args,
                                    mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int64_t sequence_number;

    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;

    int i, child_rank, idx, n_children, probe,
        my_rank = bcol_module->super.sbgp_partner_module->my_index,
        leading_dim = bcol_module->colls_no_user_data.size_of_group;
    int8_t ready_flag;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int buff_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests);
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    int matched = 0;

    volatile mca_bcol_basesmuma_payload_t *ctl_structs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl;
    volatile mca_bcol_basesmuma_header_t *child_ctl;

    netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);

    /* Figure out what instance of the basesmuma bcol I am */
    sequence_number = input_args->sequence_num;

    idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
    ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;
    my_ctl = ctl_structs[my_rank].ctl_struct;

    /* Init the header */
    BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);

    /* Cache the number-of-children value in a local variable */
    n_children = my_tree_node->n_children;

    /* initialize the active requests */
    *active_requests = 0;
    /* create a bit map for the children */
    for (i = 0; i < n_children; i++) {
        *active_requests ^= (1 << i);
    }
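    /* Illustration (not from the original source): the bitmap holds one bit
     * per child, and bit i is cleared once child i has been seen. For
     * n_children = 3 the loop above yields
     *     *active_requests = 0b111 (7)
     * and this rank's fan-in is complete when the bitmap reaches 0. */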

    /* Wait until my children arrive */
    for (i = 0; i < n_children; ++i) {
        matched = 0;
        /* Get the child's ctl struct */
        child_rank = my_tree_node->children_ranks[i];
        child_ctl = ctl_structs[child_rank].ctl_struct;
        /* I'm sacrificing cache for concurrency */
        for (probe = 0; probe < cm->num_to_probe && (0 == matched); probe++) {
            if (IS_PEER_READY(child_ctl, ready_flag, sequence_number, BARRIER_FANIN_FLAG, bcol_id)) {
                matched = 1;
                /* flip the bit */
                *active_requests ^= (1 << i);
            }
        }
    }

    if (0 == *active_requests) {
        if (ROOT_NODE != my_tree_node->my_node_type) {
            /* I have no more active requests, so signal my parent */
            my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag;
        }
    } else {
        return BCOL_FN_STARTED;
    }

    my_ctl->starting_flag_value[bcol_id]++;

    return BCOL_FN_COMPLETE;
}
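/* Illustrative note (not from the original source): the return protocol
 * appears to follow the usual bcol convention - BCOL_FN_COMPLETE means the
 * fan-in finished within the bounded number of probes, while BCOL_FN_STARTED
 * hands the remaining children off to the progress function below, which
 * re-polls only the bits still set in the bitmap. */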

static int bcol_basesmuma_fanin_new_progress(bcol_function_args_t *input_args,
                                             mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int64_t sequence_number;

    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;

    int i, child_rank, flag_offset, idx, n_children, probe,
        my_rank = bcol_module->super.sbgp_partner_module->my_index,
        leading_dim = bcol_module->colls_no_user_data.size_of_group;
    int8_t ready_flag;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int buff_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buff_index].active_requests);
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    int matched = 0;

    volatile mca_bcol_basesmuma_payload_t *ctl_structs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl;
    volatile mca_bcol_basesmuma_header_t *child_ctl;

    netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);

    sequence_number = input_args->sequence_num;

    idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
    ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;
    my_ctl = ctl_structs[my_rank].ctl_struct;

    flag_offset = my_ctl->starting_flag_value[bcol_id];
    ready_flag = flag_offset + 1;
    my_ctl->sequence_number = sequence_number;

    /* Cache the number-of-children value in a local variable */
    n_children = my_tree_node->n_children;

    /* Wait until my children arrive */
    for (i = 0; i < n_children; ++i) {
        matched = 0;
        /* Get the child's ctl struct, but only for children still pending */
        if (1 == ((*active_requests >> i) & 1)) {
            child_rank = my_tree_node->children_ranks[i];
            child_ctl = ctl_structs[child_rank].ctl_struct;
            /* I'm sacrificing cache for concurrency */
            for (probe = 0; probe < cm->num_to_probe && (0 == matched); probe++) {
                if (IS_PEER_READY(child_ctl, ready_flag, sequence_number, BARRIER_FANIN_FLAG, bcol_id)) {
                    matched = 1;
                    /* flip the bit */
                    *active_requests ^= (1 << i);
                }
            }
        }
    }
    if (0 == *active_requests) {
        if (ROOT_NODE != my_tree_node->my_node_type) {
            /* If I am not the root of the fanin tree,
               then signal my parent */
            my_ctl->flags[BARRIER_FANIN_FLAG][bcol_id] = ready_flag;
        }
    } else {
        return BCOL_FN_STARTED;
    }

    my_ctl->starting_flag_value[bcol_id]++;

    return BCOL_FN_COMPLETE;
}


int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    BASESMUMA_VERBOSE(10, ("Basesmuma Fan-In register.\n"));

    comm_attribs.bcoll_type = BCOL_FANIN;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
                                 &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_fanin_new,
                                 bcol_basesmuma_fanin_new_progress);

    return OMPI_SUCCESS;
}
@ -1,123 +0,0 @@

/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/* Recursive doubling blocking barrier */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/sys/atomic.h"

#include "ompi/mca/bcol/base/base.h"
#include "bcol_basesmuma.h"

/***********************************************************************************/
/*********************************** New Fan-Out ***********************************/
/***********************************************************************************/

static int bcol_basesmuma_fanout_new(
    bcol_function_args_t *input_args,
    mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int64_t sequence_number;

    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;

    int idx, probe,
        my_rank = bcol_module->super.sbgp_partner_module->my_index,
        leading_dim = bcol_module->colls_no_user_data.size_of_group;
    int8_t ready_flag;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int buff_index = input_args->buffer_index;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;

    volatile mca_bcol_basesmuma_payload_t *ctl_structs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl;
    volatile mca_bcol_basesmuma_header_t *parent_ctl;

    netpatterns_tree_node_t *my_tree_node = &(bcol_module->fanin_node);

    /* Figure out what instance of the basesmuma bcol I am */
    sequence_number = input_args->sequence_num;

    idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
    ctl_structs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;
    my_ctl = ctl_structs[my_rank].ctl_struct;

    /* init the header */
    BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);

    /* Wait for my parent to arrive */
    if (my_tree_node->n_parents) {
        parent_ctl = ctl_structs[my_tree_node->parent_rank].ctl_struct;
        for (probe = 0; probe < cm->num_to_probe; probe++) {
            if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number, BARRIER_FANOUT_FLAG, bcol_id)) {
                /* signal my children */
                my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
                /* bump the starting flag */
                my_ctl->starting_flag_value[bcol_id]++;
                return BCOL_FN_COMPLETE;
            }
        }
    } else {
        /* I am the root of the fanout */
        my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
        /* bump the starting flag */
        my_ctl->starting_flag_value[bcol_id]++;
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}

int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    BASESMUMA_VERBOSE(10, ("Basesmuma Fan-Out register.\n"));

    comm_attribs.bcoll_type = BCOL_FANOUT;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
                                 &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_fanout_new,
                                 bcol_basesmuma_fanout_new);

    return OMPI_SUCCESS;
}
(The diffs for the next two files are not shown because of their size.)
@ -1,626 +0,0 @@

#ifdef __PORTALS_AVAIL__
#define __PORTALS_ENABLE__

#include <unistd.h>

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "bcol_basesmuma_utils.h"
#include "bcol_basesmuma_portals.h"
#include "bcol_basesmuma.h"

#if 0
struct scatter_allgather_nb_bcast_state_t
{
    /* local variables */
    uint64_t length;
    int my_rank, src, matched;
    int *src_list;
    int group_size;
    int64_t ready_flag;
    int pow_2, pow_2_levels;
    int src_list_index;
    uint64_t fragment_size; /* user buffer size */

    /* Input argument variables */
    void *my_userbuf;
    int64_t sequence_number;

    /* Extra source variables */
    bool secondary_root;
    int partner, extra_partner;

    /* Scatter Allgather offsets */
    uint64_t local_sg_offset, global_sg_offset, partner_offset;

    /* Portals messaging relevant variables */
    ptl_handle_eq_t allgather_eq_h;
    ptl_handle_eq_t read_eq;
    ptl_event_t allgather_event;
    bool msg_posted;

    /* OMPI module and component variables */
    mca_bcol_basesmuma_component_t *cs;
    mca_bcol_basesmuma_module_t *bcol_module;

    /* Control structure and payload variables */
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer;        /* scatter source */
    volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */

    int phase;
};

typedef struct scatter_allgather_nb_bcast_state_t sg_state_t;
#endif

bool blocked_post = false;

#define IS_SG_DATA_READY(peer, my_flag, my_sequence_number)     \
    (((peer)->sequence_number == (my_sequence_number) &&        \
      (peer)->flags[BCAST_FLAG] >= (my_flag)                    \
      ) ? true : false)
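/* Illustration (not from the original source): a peer counts as "ready"
 * only when both conditions hold - it has entered this collective
 * (matching sequence number) and its broadcast flag has advanced at least
 * to the level the caller is waiting on, e.g.
 *     peer->sequence_number == 42 && peer->flags[BCAST_FLAG] >= 3 */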

#define SG_LARGE_MSG_PROBE(src_list, n_src, src_list_index, matched,            \
                           src, data_buffs, data_src_ctl_pointer,               \
                           data_src_lmsg_ctl_pointer, ready_flag,               \
                           sequence_number)                                     \
do {                                                                            \
    int j;                                                                      \
    for (j = 0; j < n_src; j++) {                                               \
        if (src_list[j] != -1) {                                                \
            data_src_ctl_pointer = data_buffs[src_list[j]].ctl_struct;          \
            data_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) \
                data_buffs[src_list[j]].payload;                                \
            if (IS_SG_DATA_READY(data_src_ctl_pointer, ready_flag, sequence_number)) { \
                src = src_list[j];                                              \
                matched = 1;                                                    \
                src_list_index = j;                                             \
                break;                                                          \
            }                                                                   \
        }                                                                       \
    }                                                                           \
} while (0)

#define SG_LARGE_MSG_NB_PROBE(src_list, n_src, src_list_index, matched,         \
                              src, ctl_structs, data_src_ctl_pointer,           \
                              ready_flag, sequence_number)                      \
do {                                                                            \
    int j;                                                                      \
    for (j = 0; j < n_src; j++) {                                               \
        if (src_list[j] != -1) {                                                \
            data_src_ctl_pointer = ctl_structs[src_list[j]];                    \
            if (IS_SG_DATA_READY(data_src_ctl_pointer, ready_flag, sequence_number)) { \
                src = src_list[j];                                              \
                matched = 1;                                                    \
                src_list_index = j;                                             \
                break;                                                          \
            }                                                                   \
        }                                                                       \
    }                                                                           \
} while (0)


static inline __opal_attribute_always_inline__
int wait_for_peers(int my_rank, int npeers, volatile mca_bcol_basesmuma_payload_t *data_buffs,
                   int flag_value, int sn)
{
    int *peers_list = NULL;
    int counter = 0, diter = 0;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer = NULL;

    peers_list = (int *)malloc(sizeof(int) * npeers);

    for (diter = 0; diter < npeers; diter++) {
        peers_list[diter] = my_rank ^ (1 << diter);
        assert(peers_list[diter] != -1);
    }

    counter = 0;
    while (counter < npeers) {
        for (diter = 0; diter < npeers; diter++) {
            if (-1 != peers_list[diter]) {
                peer_ctl_pointer = data_buffs[peers_list[diter]].ctl_struct;

                if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
                    counter++;
                    peers_list[diter] = -1;
                }
            }
        }
        opal_progress();
    }

    /* done with the peer list */
    free(peers_list);

    return 0;
}
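/* Illustration (not from the original source): the peer list built above is
 * the recursive-doubling schedule - peer k is my_rank with bit k flipped.
 * For my_rank = 5 and npeers = 3 the peers are
 *     5 ^ 1 = 4,   5 ^ 2 = 7,   5 ^ 4 = 1 */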

static inline __opal_attribute_always_inline__
int wait_for_peers_nb(int my_rank, int npeers,
                      volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs,
                      volatile int flag_value, int sn)
{
    int *peers_list = NULL;
    int counter = 0, diter = 0;
    volatile mca_bcol_basesmuma_ctl_struct_t *peer_ctl_pointer = NULL;

    peers_list = (int *)malloc(sizeof(int) * npeers);

    for (diter = 0; diter < npeers; diter++) {
        peers_list[diter] = my_rank ^ (1 << diter);
        assert(peers_list[diter] != -1);
    }

    counter = 0;
    while (counter < npeers) {
        for (diter = 0; diter < npeers; diter++) {
            if (-1 != peers_list[diter]) {
                peer_ctl_pointer = ctl_structs[peers_list[diter]];

                if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
                    counter++;
                    peers_list[diter] = -1;
                }
            }
        }
        opal_progress();
    }

    /* done with the peer list */
    free(peers_list);

    return 0;
}

static inline __opal_attribute_always_inline__
int wait_for_post_complete_nb(int my_rank, int npeers,
                              volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs,
                              int flag_value, int sn)
{
    int peers_list[MAX_SM_GROUP_SIZE];
    int counter = 0, diter = 0;
    volatile mca_bcol_basesmuma_ctl_struct_t *peer_ctl_pointer = NULL;

    assert(npeers < MAX_SM_GROUP_SIZE);

    for (diter = 0; diter < npeers; diter++) {
        peers_list[diter] = my_rank ^ (1 << diter);
        assert(peers_list[diter] != -1);
    }

    counter = 0;
    for (diter = 0; diter < npeers; diter++) {
        peer_ctl_pointer = ctl_structs[peers_list[diter]];

        if (IS_SG_DATA_READY(peer_ctl_pointer, flag_value, sn)) {
            counter++;
        }
    }

    return counter;
}

static inline __opal_attribute_always_inline__
int sg_large_msg_probe(sg_state_t *sg_state)
{
    int j, n_src = sg_state->pow_2_levels + 1;

    for (j = 0; j < n_src; j++) {
        if (sg_state->src_list[j] != -1) {
            sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src_list[j]];

            BASESMUMA_VERBOSE(5, ("Parent %d ctl pointer (parent=%x, my ctl=%x) flag %d",
                                  sg_state->src_list[j], sg_state->parent_ctl_pointer,
                                  sg_state->my_ctl_pointer,
                                  sg_state->parent_ctl_pointer->flag));

            if (IS_SG_DATA_READY(sg_state->parent_ctl_pointer,
                                 sg_state->ready_flag, sg_state->sequence_number)) {
                sg_state->src = sg_state->src_list[j];
                sg_state->matched = 1;
                sg_state->src_list_index = j;
                break;
            }
        }
    }

    return 0;
}

/*
 * I will post the message for all of my children
 */
static inline __opal_attribute_always_inline__
int sm_portals_root_scatter(sg_state_t *sg_state)
{
    int extra_src_posts = -1, scatter_posts = -1, allgather_posts = -1,
        total_msg_posts = -1;

    BASESMUMA_VERBOSE(10, ("I am the root of the data"));
    sg_state->my_ctl_pointer->offset = 0;
    sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels;
    sg_state->my_ctl_pointer->length = sg_state->fragment_size;

    extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size) ? 1 : 0;
    scatter_posts = sg_state->my_ctl_pointer->n_sends;
    allgather_posts = sg_state->pow_2_levels - 1;

    total_msg_posts = scatter_posts + allgather_posts + extra_src_posts;

    if (total_msg_posts <= 0) {
        BASESMUMA_VERBOSE(10, ("No need to post the data "));
        return OMPI_SUCCESS;
    }

    mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
                                        &sg_state->my_ctl_pointer->portals_buf_addr,
                                        sg_state->my_userbuf, sg_state->fragment_size,
                                        PTL_EQ_NONE,
                                        total_msg_posts,
                                        blocked_post,
                                        PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE |
                                        PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);

    /*
    mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
                                        &sg_state->my_ctl_pointer->portals_buf_addr,
                                        sg_state->my_userbuf, sg_state->fragment_size,
                                        sg_state->allgather_eq_h,
                                        total_msg_posts,
                                        blocked_post,
                                        PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE |
                                        PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
    */

    sg_state->msg_posted = true;

    /*
    opal_atomic_wmb();
    */
    sg_state->my_ctl_pointer->flag = sg_state->ready_flag;

    return OMPI_SUCCESS;
}
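/* Illustrative arithmetic (not from the original source): for a root with
 * my_rank = 0, pow_2 = 4 (so pow_2_levels = 2) in a group of 6:
 *     extra_src_posts = (0 + 4 < 6) ? 1 : 0 = 1
 *     scatter_posts   = 2,  allgather_posts = 2 - 1 = 1
 *     total_msg_posts = 2 + 1 + 1 = 4
 * i.e. the buffer is posted once per reader that may issue a remote get. */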

/*
 * I'm the root, but my rank is >= the power-of-two group size, so I will
 * copy to the partner who will act as the (secondary) root
 */
static inline __opal_attribute_always_inline__
int sm_portals_extra_root_scatter(sg_state_t *sg_state)
{
    int scatter_partner = -1;
    volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL;

    int total_msg_posts = 1;

    if (total_msg_posts <= 0) {
        BASESMUMA_VERBOSE(10, ("No need to post the data "));
    } else {
        mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
                                            &sg_state->my_ctl_pointer->portals_buf_addr,
                                            sg_state->my_userbuf, sg_state->fragment_size,
                                            PTL_EQ_NONE,
                                            total_msg_posts,
                                            blocked_post,
                                            PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET
                                            | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
        sg_state->msg_posted = true;
    }

    opal_atomic_wmb();
    sg_state->my_ctl_pointer->flag = sg_state->ready_flag;

    scatter_partner = sg_state->my_rank - sg_state->pow_2;
    scatter_partner_ctl_pointer =
        sg_state->ctl_structs[scatter_partner];

    while (!IS_SG_DATA_READY(scatter_partner_ctl_pointer, sg_state->ready_flag,
                             sg_state->sequence_number)) {
        opal_progress();
    }

    return OMPI_SUCCESS;
}

/*
 * Gets the message from the partner (>= the power-of-two group size) and
 * posts the message, acting as the root
 */
static inline __opal_attribute_always_inline__
int sm_portals_secondary_root_scatter(sg_state_t *sg_state)
{
    volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL;
    int scatter_posts, allgather_posts, extra_src_posts, total_msg_posts;

    sg_state->secondary_root = true;
    BASESMUMA_VERBOSE(10, ("I am the secondary root for the data"));
    sg_state->my_ctl_pointer->offset = 0;
    sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels;
    sg_state->my_ctl_pointer->length = sg_state->fragment_size;

    extra_src_ctl_pointer = sg_state->ctl_structs[sg_state->src];

    mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
                                                sg_state->read_eq,
                                                &sg_state->my_ctl_pointer->portals_buf_addr,
                                                &extra_src_ctl_pointer->portals_buf_addr, 0,
                                                0, sg_state->fragment_size);

    extra_src_posts = 0;
    scatter_posts = sg_state->my_ctl_pointer->n_sends;
    allgather_posts = sg_state->pow_2_levels - 1;

    total_msg_posts = scatter_posts + allgather_posts + extra_src_posts;

    if (total_msg_posts > 0) {
        mca_bcol_basesmuma_portals_post_msg(sg_state->cs,
                                            &sg_state->my_ctl_pointer->portals_buf_addr,
                                            sg_state->my_userbuf, sg_state->fragment_size,
                                            PTL_EQ_NONE,
                                            total_msg_posts,
                                            blocked_post,
                                            PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET
                                            | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);
        sg_state->msg_posted = true;
    }
    opal_atomic_wmb();
    sg_state->my_ctl_pointer->flag = sg_state->ready_flag;

    return OMPI_SUCCESS;
}

/*
 * Internode Scatter: get data from my parent and post it for my children
 */
static inline __opal_attribute_always_inline__
int sm_portals_internode_scatter(sg_state_t *sg_state)
{
    int scatter_posts, allgather_posts, extra_src_posts,
        total_msg_posts;
    uint64_t local_offset, remote_offset;

    /* compute the size of the chunk to copy */
    sg_state->length = (sg_state->parent_ctl_pointer->length) /
        (1 << (sg_state->parent_ctl_pointer->n_sends - sg_state->my_ctl_pointer->n_sends));
    sg_state->my_ctl_pointer->length = sg_state->length;
    sg_state->my_ctl_pointer->offset =
        sg_state->parent_ctl_pointer->offset + sg_state->length;

    local_offset = sg_state->my_ctl_pointer->offset;
    remote_offset = sg_state->parent_ctl_pointer->offset +
        sg_state->length;

    mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
                                                sg_state->read_eq,
                                                &sg_state->my_ctl_pointer->portals_buf_addr,
                                                &sg_state->parent_ctl_pointer->portals_buf_addr, local_offset,
                                                remote_offset, sg_state->length);

    /* Now post the message for the other children to read */
    extra_src_posts = (sg_state->my_rank + sg_state->pow_2 <
                       sg_state->group_size) ? 1 : 0;
    scatter_posts = sg_state->my_ctl_pointer->n_sends;
    allgather_posts = sg_state->pow_2_levels - 1;

    total_msg_posts = scatter_posts + allgather_posts + extra_src_posts;

    if (total_msg_posts > 0) {
        mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr,
                                            sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length,
                                            PTL_EQ_NONE,
                                            total_msg_posts,
                                            blocked_post,
                                            PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE
                                            | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE);

        sg_state->msg_posted = true;
    }
    /*
    opal_atomic_wmb();
    */
    sg_state->my_ctl_pointer->flag = sg_state->ready_flag;

    return OMPI_SUCCESS;
}

/*
 * Bcast's Allgather Phase:
 * Combines data from all processes using a recursive doubling algorithm
 */
static inline __opal_attribute_always_inline__
int sm_portals_bcasts_allgather_phase(sg_state_t *sg_state)
{
    int ag_loop, partner;
    volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive doubling */

    for (ag_loop = 1; ag_loop < sg_state->pow_2_levels; ag_loop++) {
        /* get my partner for this level */
        partner = sg_state->my_rank ^ (1 << ag_loop);
        partner_ctl_pointer = sg_state->ctl_structs[partner];

        /* Block until the partner is at this level of the recursive-doubling stage */
        while (!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag,
                                 sg_state->sequence_number)) {
            opal_progress();
        }
        assert(partner_ctl_pointer->flag >= sg_state->ready_flag);

        if (partner_ctl_pointer->offset < sg_state->my_ctl_pointer->offset) {
            sg_state->global_sg_offset -= sg_state->length;
            sg_state->local_sg_offset = sg_state->global_sg_offset;
        } else {
            sg_state->local_sg_offset = sg_state->global_sg_offset + sg_state->length;
        }

        BASESMUMA_VERBOSE(10, ("Allgather Phase: Get message from process %d, length %d",
                               partner, sg_state->length));
        mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs,
                                                    sg_state->read_eq,
                                                    &sg_state->my_ctl_pointer->portals_buf_addr,
                                                    &partner_ctl_pointer->portals_buf_addr, sg_state->local_sg_offset,
                                                    sg_state->local_sg_offset, sg_state->length);

        sg_state->ready_flag++;
        opal_atomic_wmb();
        sg_state->my_ctl_pointer->flag = sg_state->ready_flag;

        /* Block until the partner is at this level of the recursive-doubling stage */
        while (!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag,
                                 sg_state->sequence_number)) {
            opal_progress();
        }

        /* double the length */
        sg_state->length *= 2;
    }

    return OMPI_SUCCESS;
}


static inline __opal_attribute_always_inline__
int init_sm_group_info(sg_state_t *sg_state, int buff_idx)
{
    int idx, leading_dim;
    int first_instance = 0;
    int flag_offset;

    /* Get addressing information */
    sg_state->group_size = sg_state->bcol_module->colls_no_user_data.size_of_group;
    leading_dim = sg_state->bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    BASESMUMA_VERBOSE(1, ("My buffer idx %d group size %d, leading dim %d, idx %d",
                          buff_idx, sg_state->group_size, leading_dim, idx));
    /* grab the ctl buffs */
    sg_state->ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        sg_state->bcol_module->colls_with_user_data.ctl_buffs + idx;

    sg_state->my_rank = sg_state->bcol_module->super.sbgp_partner_module->my_index;
    sg_state->my_ctl_pointer = sg_state->ctl_structs[sg_state->my_rank];

    if (sg_state->my_ctl_pointer->sequence_number < sg_state->sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        sg_state->my_ctl_pointer->flag = -1;
        sg_state->my_ctl_pointer->index = 1;

        sg_state->my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        sg_state->my_ctl_pointer->index++;
    }

    /* For bcast we should have only one entry into this bcol:
       assert(sg_state->my_ctl_pointer->flag == -1);
    */

    /* increment the starting flag by one and return */
    flag_offset = sg_state->my_ctl_pointer->starting_flag_value;
    sg_state->ready_flag = flag_offset + sg_state->sequence_number + 1;

    sg_state->my_ctl_pointer->sequence_number = sg_state->sequence_number;

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__
int init_sm_portals_sg_info(sg_state_t *sg_state)
{
    /* Get portals info */
    mca_bcol_basesmuma_portal_proc_info_t *portals_info;
    int rc = OMPI_SUCCESS;
    int sg_matchbits;

    portals_info = (mca_bcol_basesmuma_portal_proc_info_t *)sg_state->cs->portals_info;

    sg_matchbits = sg_state->sequence_number;

    /* Construct my portal buffer address and copy it to the payload buffer */
    mca_bcol_basesmuma_construct_portal_address(&sg_state->my_ctl_pointer->portals_buf_addr,
                                                portals_info->portal_id.nid,
                                                portals_info->portal_id.pid,
                                                sg_matchbits,
                                                sg_state->bcol_module->super.sbgp_partner_module->group_comm->c_contextid);

    sg_state->my_ctl_pointer->portals_buf_addr.userbuf = sg_state->my_userbuf;
    sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length = sg_state->fragment_size;

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__
int compute_src_from_root(int group_root, int my_group_rank, int pow2, int
                          group_size)
{
    int root, relative_rank, src, i;

    if (group_root < pow2) {
        root = group_root;
    } else {
        /* the source of the data is an extra node;
           the real root is represented by some rank from
           the pow2 group */
        root = group_root - pow2;
        /* shortcut for the case when my rank is the root of the group */
        if (my_group_rank == root) {
            return group_root;
        }
    }

    relative_rank = (my_group_rank - root) < 0 ? my_group_rank - root + pow2 :
        my_group_rank - root;

    for (i = 1; i < pow2; i <<= 1) {
        if (relative_rank & i) {
            src = my_group_rank ^ i;
            if (src >= pow2) {
                src -= pow2;
            }

            return src;
        }
    }

    return -1;
}
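/* Illustrative worked example (not from the original source): with pow2 = 4
 * and group_root = 5, the real root is 5 - 4 = 1. For my_group_rank = 3:
 *     relative_rank = 3 - 1 = 2, the first set bit is i = 2,
 *     src = 3 ^ 2 = 1
 * so rank 3 reads its scatter fragment from rank 1. */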

int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args,
                                                        mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args,
                                                           mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args,
                                                                     mca_bcol_base_function_t *c_input_args);

#endif
@ -1,452 +0,0 @@

/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

/* #define __PORTALS_AVAIL__ */
#ifdef __PORTALS_AVAIL__

#define __PORTALS_ENABLE__
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"

#include "bcol_basesmuma_portals.h"

/* debug */
#include <unistd.h>
/* end debug */

/**
 * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers.
 * This routine assumes that buf (the input buffer) is a single-writer
 * multi-reader (SWMR) shared memory buffer owned by the calling rank,
 * which is the only rank that can write to this buffer.
 * It is also assumed that the buffers are registered and fragmented
 * at the ML level and that buf is sufficiently large to hold the data.
 *
 * @param buf - SWMR shared buffer within a sbgp that the
 *              executing rank can write to.
 * @param count - the number of elements in the shared buffer.
 * @param dtype - the datatype of a shared buffer element.
 * @param root - the index within the sbgp of the root.
 * @param module - basesmuma module.
 */
int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
                                               mca_bcol_base_function_t *c_input_args)
{
#if 0
    /* local variables */
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    int i, matched = 0;
    int src = -1;
    int group_size;
    int my_rank, first_instance = 0, flag_offset;
    int rc = OMPI_SUCCESS;
    int leading_dim, buff_idx, idx;
    int count = input_args->count;
    struct ompi_datatype_t* dtype = input_args->dtype;
    int64_t sequence_number = input_args->sequence_num;

    volatile int64_t ready_flag;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char* parent_data_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    void *userbuf = (void *)((unsigned char *)input_args->userbuf);

    size_t pack_len = 0, dt_size;

    struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL;
    struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL;
    mca_bcol_basesmuma_portal_proc_info_t *portals_info;
    portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info;

    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;
    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload;

    /* setup resource recycling */
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        /* Signal arrival */
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values, so only need to
         * set the value for subsequent operations that may need this */
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }

    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    /* Construct my portal buffer address and copy to payload buffer */
    mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer,
                                                portals_info->portal_id.nid,
                                                portals_info->portal_id.pid,
                                                sequence_number,
                                                bcol_module->super.sbgp_partner_module->group_comm->c_contextid);

    /* non-blocking broadcast algorithm */

    /* If I am the root, then signal ready flag */
    if (input_args->root_flag) {
        ptl_handle_eq_t eq_h;
        ptl_event_t event;
        int ret;

        BASESMUMA_VERBOSE(10,("I am the root of the data"));

        /* create an event queue for the incoming buffer */
        ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
                          cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h);

        if (ret != PTL_OK) {
            fprintf(stderr, "PtlEQAlloc() failed: %d\n", ret);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* Post the message using portal copy */
        mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
                                                      pack_len, eq_h, my_lmsg_ctl_pointer->nsends);

        /*
         * signal ready flag
         */
        my_ctl_pointer->flag = ready_flag;

        /* wait for a response from the client */
        mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
                                                     &event, my_lmsg_ctl_pointer->nsends);

        /* free the event queue */
        ret = PtlEQFree(eq_h);
        if (ret != PTL_OK) {
            fprintf(stderr, "PtlEQFree() failed: %d\n", ret);
        }

        /* root is finished */
        goto Release;
    }

    /* If I am not the root, then poll on possible "senders'" control structs */
    for (i = 0; i < cs->num_to_probe && 0 == matched; i++) {
        /* Shared memory iprobe */
        /*
          BCOL_BASESMUMA_SM_PROBE(bcol_module->src, bcol_module->src_size,
          my_rank, matched, src);
        */
        do {
            int j, n_src, my_index;
            n_src = bcol_module->src_size;

            for (j = 0; j < n_src; j++) {
                parent_ctl_pointer = data_buffs[bcol_module->src[j]].ctl_struct;
                parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *)
                    data_buffs[bcol_module->src[j]].payload;
                if (IS_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) {
                    src = bcol_module->src[j];
                    matched = 1;
                    break;
                }
            }
        } while (0);
    }

    /* If not matched, then hop out and put me on progress list */
    if (0 == matched) {
        BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match"));
        return BCOL_FN_NOT_STARTED;
    }

    /* else, we found our root within the group ... */
    BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src));

    /* receive the data from sender */
    /* get the data buff */
    /* taken care of in the macro */
    /*parent_data_pointer = data_buffs[src].payload;*/
    /* copy the data */
    mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len);

    /* set the memory barrier to ensure completion */
    opal_atomic_wmb ();
    /* signal that I am done */
    my_ctl_pointer->flag = ready_flag;

    /* am I the last one? If so, release buffer */

 Release:
    my_ctl_pointer->starting_flag_value++;

    return BCOL_FN_COMPLETE;
#endif
}

#if 0

#define BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index,         \
        my_group_index, group_size, sm_data_buffs, sender_ready_flag,                \
        num_pending_sends)                                                           \
{                                                                                    \
    int k, rc;                                                                       \
    int dst;                                                                         \
    int comm_dst;                                                                    \
    volatile mca_bcol_basesmuma_header_t *recv_ctl_pointer = NULL;                   \
    volatile mca_bcol_basesmuma_portal_buf_addr_t *recv_lmsg_ctl_pointer = NULL;     \
                                                                                     \
    num_pending_sends = 0;                                                           \
    while (radix_mask > 0) {                                                         \
        /* For each level of tree, do sends */                                       \
        for (k = 1;                                                                  \
             k < radix && my_relative_index + radix_mask * k < group_size;           \
             ++k) {                                                                  \
                                                                                     \
            dst = my_group_index + radix_mask * k;                                   \
            if (dst >= group_size) {                                                 \
                dst -= group_size;                                                   \
            }                                                                        \
            /* Signal the children to get data */                                    \
            recv_ctl_pointer = sm_data_buffs[dst].ctl_struct;                        \
            recv_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *)         \
                sm_data_buffs[dst].payload;                                          \
            recv_lmsg_ctl_pointer->src_index = my_group_index;                       \
            recv_lmsg_ctl_pointer->flag = sender_ready_flag;                         \
            ++num_pending_sends;                                                     \
        }                                                                            \
        radix_mask /= radix;                                                         \
    }                                                                                \
}
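The macro above walks the k-nomial tree from the widest stride down to stride 1, signalling up to radix-1 children per level. A stand-alone sketch of the same traversal (hypothetical names, none of the shared-memory plumbing) makes the send pattern easier to follow:

#include <stdio.h>

/* Enumerate the ranks a node signals in a k-nomial fan-out, mirroring
 * the radix_mask loop in BASESMUMA_K_NOMIAL_SEND_SIGNAL. */
static int knomial_children(int my_relative_index, int my_group_index,
                            int radix, int radix_mask, int group_size)
{
    int num_pending_sends = 0;

    while (radix_mask > 0) {
        /* at each tree level, signal up to radix-1 children */
        for (int k = 1;
             k < radix && my_relative_index + radix_mask * k < group_size;
             ++k) {
            int dst = (my_group_index + radix_mask * k) % group_size;
            printf("signal rank %d (stride %d)\n", dst, radix_mask);
            ++num_pending_sends;
        }
        radix_mask /= radix;
    }
    return num_pending_sends;
}

int main(void)
{
    /* the root of an 8-rank group, radix 2: signals strides 4, 2, 1 */
    knomial_children(0, 0, 2, 4, 8);
    return 0;
}

With radix 2 this degenerates to the familiar binomial tree; larger radices trade tree depth for more signals per level.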

int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
                                               mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    int i, matched = 0;
    int src = -1;
    int group_size;
    int my_rank, first_instance = 0, flag_offset;
    int rc = OMPI_SUCCESS;
    int leading_dim, buff_idx, idx;
    int count = input_args->count;
    struct ompi_datatype_t* dtype = input_args->dtype;
    int64_t sequence_number = input_args->sequence_num;
    /* locals used below; this unfinished draft relied on them without
     * declaring them */
    int ret, nsends = 0, my_data_source_index, my_relative_index, radix_mask;
    int radix = cs->k_nomial_radix;
    ptl_handle_eq_t eq_h;
    ptl_event_t event;

    volatile int64_t ready_flag;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char* parent_data_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    void *userbuf = (void *)((unsigned char *)input_args->userbuf);

    size_t pack_len = 0, dt_size;

    struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL;
    struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL;
    mca_bcol_basesmuma_portal_proc_info_t *portals_info;
    portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info;

    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;
    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload;

    /* setup resource recycling */
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        /* Signal arrival */
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values, so only need to
         * set the value for subsequent operations that may need this */
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }

    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    /* Construct my portal buffer address and copy to payload buffer */
    mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer,
                                                portals_info->portal_id.nid,
                                                portals_info->portal_id.pid,
                                                sequence_number,
                                                bcol_module->super.sbgp_partner_module->group_comm->c_contextid);

    my_lmsg_ctl_pointer->userbuf = userbuf;
    my_lmsg_ctl_pointer->userbuf_length = pack_len;
    /* create an event queue */
    ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
                      cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h);
    if (ret != PTL_OK) {
        fprintf(stderr, "PtlEQAlloc() failed: %d\n", ret);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* non-blocking broadcast algorithm */

    /* If I am the root, then signal ready flag */
    if (input_args->root_flag) {
        /* largest power of the radix in the group */
        int root_radix_mask = bcol_module->pow_k;

        BASESMUMA_VERBOSE(10,("I am the root of the data"));

        BASESMUMA_K_NOMIAL_SEND_SIGNAL(root_radix_mask, radix, 0,
                                       my_rank, group_size, data_buffs, ready_flag, nsends);

        mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
                                                      pack_len, eq_h, nsends);

        /* wait for a response from the client */
        mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
                                                     &event, nsends);

        /* root is finished */
        goto Release;
    }

    /* I am not the root, so wait until someone puts data and
     * compute where to get data from */
    while (my_ctl_pointer->flag != ready_flag) ;

    my_data_source_index = my_lmsg_ctl_pointer->src_index;

    parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *)
        data_buffs[my_data_source_index].payload;

    mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len);

    /* I am done getting data; should I send the data to someone? */
    my_relative_index = (my_rank - my_data_source_index) < 0 ? my_rank -
        my_data_source_index + group_size : my_rank - my_data_source_index;

    /*
     * 2. Locate myself in the tree:
     * calculate the number of radix steps that we should take
     */
    radix_mask = 1;
    while (radix_mask < group_size) {
        if (0 != my_relative_index % (radix * radix_mask)) {
            /* I found my level in the tree */
            break;
        }
        radix_mask *= radix;
    }

    /* go one step back */
    radix_mask /= radix;

    BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index,
                                   my_rank, group_size, data_buffs, ready_flag, nsends)

    mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf,
                                                  pack_len, eq_h, nsends);

    /* wait for children to read */
    mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT,
                                                 &event, nsends);

 Release:
    /* free the event queue */
    ret = PtlEQFree(eq_h);
    if (ret != PTL_OK) {
        fprintf(stderr, "PtlEQFree() failed: %d\n", ret);
    }

    my_ctl_pointer->starting_flag_value++;

    return BCOL_FN_COMPLETE;
}

#endif
#endif
@ -1,101 +0,0 @@

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "bcol_basesmuma.h"

/* Shared memory registration function: calls into the "shared memory
   connection manager" (aka smcm) and registers a chunk of memory by
   opening and mmaping a file.

   @input:

   void *context_data - shared memory specific data needed by the
   registration function.

   void *base - pointer to memory address.

   size_t size - size of memory chunk to be registered with sm.

   void **reg_desc - registration data is cached here.

   @output:

   returns OMPI_SUCCESS on successful registration.

   returns OMPI_ERROR on failure.
*/

int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size,
                                   void **reg_desc)
{
    /* local variables */
    int ret = OMPI_SUCCESS;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg =
        (bcol_basesmuma_registration_data_t*) context_data;

    /* cache some info on sm_reg aka "context_data"; you'll need it later */
    sm_reg->base_addr = base;
    sm_reg->size = size;

    /* call into the shared memory registration function in smcm;
     * we need to be sure that the memory is page aligned in order
     * to "map_fixed"
     */
    sm_reg->sm_mmap = bcol_basesmuma_smcm_mem_reg(base, size,
                                                  sm_reg->data_seg_alignment,
                                                  sm_reg->file_name);
    if (NULL == sm_reg->sm_mmap) {
        opal_output (ompi_bcol_base_framework.framework_output, "Bcol_basesmuma memory registration error");
        return OMPI_ERROR;
    }

    /* don't let other communicators re-register me! */
    cs->mpool_inited = true;
    /* alias back to component */
    cs->sm_payload_structs = sm_reg->sm_mmap;

    return ret;
}

/* Shared memory deregistration function - deregisters memory by munmapping it
   and removing the shared memory file.

   Basic steps (please let me know if this is incompatible with your notion of
   deregistration or if it causes problems on cleanup):

   1. munmap the shared memory file.
   2. set the base pointer to the mmaped memory to NULL.
   3. permanently remove the shared memory file from the directory.
*/

int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg)
{
    /* local variables */
    bcol_basesmuma_registration_data_t *sm_reg =
        (bcol_basesmuma_registration_data_t*) context_data;

    if (sm_reg->sm_mmap) {
        OBJ_RELEASE(sm_reg->sm_mmap);
    }

    /* set the pointer to NULL */
    sm_reg->base_addr = NULL;

    return OMPI_SUCCESS;
}
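For context, the register/deregister pair above follows the usual map-a-backing-file pattern. A stand-alone POSIX sketch of the same idea (hypothetical file name and size, none of the smcm bookkeeping):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    const char *fname = "/tmp/sm_backing_demo";   /* hypothetical backing file */
    size_t size = (size_t) sysconf(_SC_PAGESIZE); /* page aligned, as required above */

    /* "register": create the backing file and map it shared */
    int fd = open(fname, O_CREAT | O_RDWR, 0600);
    if (fd < 0 || ftruncate(fd, (off_t) size) < 0) {
        perror("backing file");
        return 1;
    }
    void *base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);
    if (MAP_FAILED == base) {
        perror("mmap");
        return 1;
    }
    strcpy((char *) base, "visible to every process that maps this file");

    /* "deregister": unmap and permanently remove the file (steps 1-3 above) */
    munmap(base, size);
    base = NULL;
    unlink(fname);
    return 0;
}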
@ -1,687 +0,0 @@

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/util/show_help.h"
#include "opal/align.h"

#include "ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h"
#include "bcol_basesmuma.h"
#include "bcol_basesmuma_utils.h"

#ifdef __PORTALS_AVAIL__
#include "bcol_basesmuma_portals.h"
#endif

/*
 * Local functions
 */
static int alloc_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module)
{
    int rc = OMPI_SUCCESS, i = 0;
    netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree;
    int n_exchanges = k_node->n_exchanges;

    /* Precalculate the allreduce offsets */
    if (0 < k_node->n_exchanges) {
        sm_module->reduce_offsets = (int **)malloc(n_exchanges * sizeof(int*));

        if (!sm_module->reduce_offsets) {
            rc = OMPI_ERROR;
            return rc;
        }

        for (i = 0; i < n_exchanges; i++) {
            sm_module->reduce_offsets[i] = (int *)malloc(sizeof(int) * NOFFSETS);

            if (!sm_module->reduce_offsets[i]) {
                rc = OMPI_ERROR;
                return rc;
            }
        }
    }
    return rc;
}

static int free_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module)
{
    int rc = OMPI_SUCCESS, i = 0;
    netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree;
    int n_exchanges = k_node->n_exchanges;

    if (sm_module->reduce_offsets) {
        for (i = 0; i < n_exchanges; i++) {
            free (sm_module->reduce_offsets[i]);
        }

        free(sm_module->reduce_offsets);
    }
    return rc;
}

static void
mca_bcol_basesmuma_module_construct(mca_bcol_basesmuma_module_t *module)
{
    /* initialize all values to 0 */
    memset((void*)((uintptr_t) module + sizeof (module->super)), 0, sizeof (*module) - sizeof (module->super));
    module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_basesmuma_component;
    module->super.list_n_connected = NULL;
    module->super.hier_scather_offset = 0;
}

static void
mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
{
    /* local variables */
    mca_sbgp_base_module_t *sbgp_module = sm_module->super.sbgp_partner_module;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /*
     * release allocated resources
     */

    /* ...but not until you're sure you have no outstanding collectives */
    while (0 != opal_list_get_size(&(cs->nb_admin_barriers))) {
        opal_progress();
    }

#ifdef __PORTALS_AVAIL__
    /* Remove portals bcast specific resources */
    if (PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) {
        BASESMUMA_VERBOSE(10,("PtlEQFree() failed"));
    }
#endif

    /* Remove Lmsg Reduce Offsets Array */
    free_lmsg_reduce_offsets_array(sm_module);

    /* collective topology data */
    if (sm_module->fanout_read_tree) {
        for (int i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->fanout_read_tree[i].n_children) {
                free(sm_module->fanout_read_tree[i].children_ranks);
                sm_module->fanout_read_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->fanout_read_tree);
        sm_module->fanout_read_tree = NULL;
    }

    /* gvm Leak FIX: reduction_tree[].children_ranks has to be removed.
     * I don't know how to get the size (which is the size of the
     * subgroup) of the reduction_tree array.
     */
    if (sm_module->reduction_tree) {
        for (int i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->reduction_tree[i].n_children) {
                free(sm_module->reduction_tree[i].children_ranks);
                sm_module->reduction_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->reduction_tree);
        sm_module->reduction_tree = NULL;
    }

    /* gvm Leak FIX */
    if (sm_module->fanout_node.children_ranks) {
        free(sm_module->fanout_node.children_ranks);
        sm_module->fanout_node.children_ranks = NULL;
    }

    if (sm_module->fanin_node.children_ranks) {
        free(sm_module->fanin_node.children_ranks);
        sm_module->fanin_node.children_ranks = NULL;
    }

    /* colls_no_user_data resources */
    if (sm_module->colls_no_user_data.ctl_buffs_mgmt) {
        free(sm_module->colls_no_user_data.ctl_buffs_mgmt);
        sm_module->colls_no_user_data.ctl_buffs_mgmt = NULL;
    }
    if (sm_module->colls_no_user_data.ctl_buffs) {
        free(sm_module->colls_no_user_data.ctl_buffs);
        sm_module->colls_no_user_data.ctl_buffs = NULL;
    }

    /* return control */
    opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->no_userdata_ctl);

    /* colls_with_user_data resources */
    /*
     * debug print
     *
     * fprintf(stderr,"AAA colls_with_user_data.ctl_buffs %p \n",
     *     sm_module->colls_with_user_data.ctl_buffs_mgmt);
     *
     * end debug
     */

    if (sm_module->colls_with_user_data.ctl_buffs_mgmt) {
        free(sm_module->colls_with_user_data.ctl_buffs_mgmt);
        sm_module->colls_with_user_data.ctl_buffs_mgmt = NULL;
    }
    if (sm_module->colls_with_user_data.ctl_buffs) {
        free(sm_module->colls_with_user_data.ctl_buffs);
        sm_module->colls_with_user_data.ctl_buffs = NULL;
    }

    if (sm_module->shared_memory_scratch_space) {
        free(sm_module->shared_memory_scratch_space);
        sm_module->shared_memory_scratch_space = NULL;
    }

    /* return control */
    opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->userdata_ctl);

#if 1
    if (sm_module->scatter_kary_tree) {
        for (int i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->scatter_kary_tree[i].n_children) {
                free(sm_module->scatter_kary_tree[i].children_ranks);
                sm_module->scatter_kary_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->scatter_kary_tree);
    }
#endif

    if (NULL != sm_module->super.list_n_connected) {
        free(sm_module->super.list_n_connected);
        sm_module->super.list_n_connected = NULL;
    }

    cleanup_nb_coll_buff_desc(&sm_module->ml_mem.nb_coll_desc,
                              sm_module->ml_mem.num_banks,
                              sm_module->ml_mem.num_buffers_per_bank);

    for (int i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++) {
        /* gvm FIX: Go through the list and destroy each item */
        /* Destroy the function table object for each bcol type list */
        OPAL_LIST_DESTRUCT((&sm_module->super.bcol_fns_table[i]));
    }

    if (NULL != sm_module->payload_backing_files_info) {
        bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list,
                                                 &sm_module->payload_backing_files_info);
    }

    if (NULL != sm_module->ctl_backing_files_info) {
        bcol_basesmuma_smcm_release_connections (sm_module, sbgp_module, &cs->sm_connections_list,
                                                 &sm_module->ctl_backing_files_info);
    }

    if (NULL != sm_module->ml_mem.bank_release_counter) {
        free(sm_module->ml_mem.bank_release_counter);
        sm_module->ml_mem.bank_release_counter = NULL;
    }

    if (NULL != sm_module->colls_with_user_data.data_buffs) {
        free((void *)sm_module->colls_with_user_data.data_buffs);
        sm_module->colls_with_user_data.data_buffs = NULL;
    }

    /* free the k-nomial allgather tree here */
    netpatterns_cleanup_recursive_knomial_allgather_tree_node(&sm_module->knomial_allgather_tree);
    netpatterns_cleanup_recursive_doubling_tree_node(&sm_module->recursive_doubling_tree);
    netpatterns_cleanup_recursive_knomial_tree_node(&sm_module->knomial_exchange_tree);

    /* done */
}

static void bcol_basesmuma_set_small_msg_thresholds(struct mca_bcol_base_module_t *super)
{
    mca_bcol_basesmuma_module_t *basesmuma_module =
        (mca_bcol_basesmuma_module_t *) super;

    size_t basesmuma_offset = bcol_basesmuma_data_offset_calc(basesmuma_module);

    /* Set the Allreduce threshold; for basesmuma it equals the ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_ALLREDUCE] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;

    /* Set the Bcast threshold; for basesmuma it equals the ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_BCAST] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;

    /* Set the Gather threshold; for basesmuma it equals the per-rank share of the ML buffer */
    super->small_message_thresholds[BCOL_GATHER] =
        (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
        ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);

    /* Set the Allgather threshold; for basesmuma it equals the per-rank share of the ML buffer */
    super->small_message_thresholds[BCOL_ALLGATHER] =
        (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) /
        ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm);

    /* Set the Reduce threshold; for basesmuma it equals the ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_REDUCE] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;

    /* Set the Scatter threshold; for basesmuma it equals the ML buffer size minus the data offset */
    super->small_message_thresholds[BCOL_SCATTER] =
        basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset;
}
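As a worked example with illustrative numbers only (not taken from the source): a 64 KiB ML buffer with a 128-byte data offset on a 16-rank communicator yields a 65408-byte threshold for the whole-buffer collectives and 65408 / 16 = 4088 bytes per rank for gather/allgather:

#include <stdio.h>

int main(void)
{
    /* illustrative values, not taken from the source */
    size_t size_buffer = 64 * 1024;  /* ML payload buffer */
    size_t data_offset = 128;        /* header/offset consumed per buffer */
    int comm_size = 16;

    size_t whole = size_buffer - data_offset;       /* bcast, reduce, ... */
    size_t per_rank = whole / (size_t) comm_size;   /* gather, allgather  */

    printf("whole-buffer threshold: %zu bytes\n", whole);     /* 65408 */
    printf("per-rank threshold:     %zu bytes\n", per_rank);  /* 4088  */
    return 0;
}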

/* setup memory management and collective routines */

static void load_func(mca_bcol_base_module_t *super)
{
    int fnc;

    /* Loading memory management and collective functions */

    for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
        super->bcol_function_table[fnc] = NULL;
    }

    /*super->bcol_function_table[BCOL_BARRIER] = bcol_basesmuma_recursive_double_barrier;*/

#ifdef __PORTALS_AVAIL__
    super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_lmsg_scatter_allgather_portals_bcast;
    /* super->bcol_function_table[BCOL_BCAST] =
       bcol_basesmuma_lmsg_bcast_k_nomial_anyroot; */
#endif

    /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;*/
    /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_binary_scatter_allgather_segment;*/
    /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast_k_nomial_anyroot;*/
    super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;
#ifdef __PORTALS_AVAIL__
    super->bcol_function_table[BCOL_BCAST] =
        bcol_basesmuma_lmsg_scatter_allgather_portals_bcast;
#endif
    /* super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_fanin_fanout; */
    super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_recursive_doubling;
    super->bcol_function_table[BCOL_REDUCE] = bcol_basesmuma_reduce_intra_fanin_old;
    /* memory management */
    super->bcol_memory_init = bcol_basesmuma_bank_init_opti;

    super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree;

    /* Set thresholds */
    super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds;
}

static void load_func_with_choices(mca_bcol_base_module_t *super)
{
    int fnc;

    /* Loading memory management and collective functions */

    for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
        super->bcol_function_init_table[fnc] = NULL;
    }

    super->bcol_function_init_table[BCOL_FANIN] = bcol_basesmuma_fanin_init;
    super->bcol_function_init_table[BCOL_FANOUT] = bcol_basesmuma_fanout_init;
    super->bcol_function_init_table[BCOL_BARRIER] = bcol_basesmuma_barrier_init;

    super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init;
    super->bcol_function_init_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_init;
    super->bcol_function_init_table[BCOL_REDUCE] = bcol_basesmuma_reduce_init;
    super->bcol_function_init_table[BCOL_GATHER] = bcol_basesmuma_gather_init;
    super->bcol_function_init_table[BCOL_ALLGATHER] = bcol_basesmuma_allgather_init;
    super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init;
    /* memory management */
    super->bcol_memory_init = bcol_basesmuma_bank_init_opti;

    super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree;
}

static int load_recursive_knomial_info(mca_bcol_basesmuma_module_t *sm_module)
{
    int rc = OMPI_SUCCESS;
    rc = netpatterns_setup_recursive_knomial_tree_node(sm_module->super.sbgp_partner_module->group_size,
                                                       sm_module->super.sbgp_partner_module->my_index,
                                                       mca_bcol_basesmuma_component.k_nomial_radix,
                                                       &sm_module->knomial_exchange_tree);
    return rc;
}

int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super)
{
    mca_bcol_basesmuma_module_t *sm_module = (mca_bcol_basesmuma_module_t *) super;

    return netpatterns_setup_recursive_knomial_allgather_tree_node(sm_module->super.sbgp_partner_module->group_size,
                                                                   sm_module->super.sbgp_partner_module->my_index,
                                                                   mca_bcol_basesmuma_component.k_nomial_radix,
                                                                   super->list_n_connected,
                                                                   &sm_module->knomial_allgather_tree);
}

/* query to see if the module is available for use on the given
 * communicator, and if so, what its priority is.  This is where
 * the backing shared-memory file is created.
 */
mca_bcol_base_module_t **
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules)
{
    /* local variables */
    mca_bcol_base_module_t **sm_modules = NULL;
    mca_bcol_basesmuma_module_t *sm_module;
    bcol_basesmuma_registration_data_t *sm_reg_data;
    int ret, my_rank, name_length;
    char *name;
    int i;

    int bcast_radix;

    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    /*mca_base_component_list_item_t *hdl_cli = NULL;*/
    /*int hdl_num;*/

    /* at this point I think there is only a single shared
       memory bcol that we need to be concerned with */

    /* No group, no modules */
    if (OPAL_UNLIKELY(NULL == module)) {
        return NULL;
    }

    /* allocate and initialize an sm_bcol module */
    sm_module = OBJ_NEW(mca_bcol_basesmuma_module_t);

    /* set the subgroup */
    sm_module->super.sbgp_partner_module = module;

    (*num_modules) = 1;
    cs->super.n_net_contexts = *num_modules;
    sm_module->reduction_tree = NULL;
    sm_module->fanout_read_tree = NULL;

    ret = netpatterns_setup_recursive_doubling_tree_node(
        module->group_size, module->my_index,
        &(sm_module->recursive_doubling_tree));
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error setting up recursive_doubling_tree\n");
        return NULL;
    }

    /* setup the fanin tree - this is used only as part of a hierarchical
     * barrier, so will set this up with rank 0 as the root */
    my_rank = module->my_index;
    ret = netpatterns_setup_narray_tree(cs->radix_fanin,
                                        my_rank, module->group_size, &(sm_module->fanin_node));
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanin tree\n");
        return NULL;
    }

    /* setup the fanout tree - this is used only as part of a hierarchical
     * barrier, so will set this up with rank 0 as the root */
    ret = netpatterns_setup_narray_tree(cs->radix_fanout,
                                        my_rank, module->group_size, &(sm_module->fanout_node));
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error setting up fanout tree\n");
        return NULL;
    }

    /*
     * Setup the broadcast tree - this is used only as part of a hierarchical
     * bcast, so will set this up with rank 0 as the root.
     */

    /* set the radix of the bcast tree */
    bcast_radix = cs->radix_read_tree;

    /* initialize fan-out read tree */
    sm_module->fanout_read_tree = (netpatterns_tree_node_t*) malloc(
        sizeof(netpatterns_tree_node_t) * module->group_size);
    if (NULL == sm_module->fanout_read_tree) {
        goto Error;
    }

    for (i = 0; i < module->group_size; i++) {
        ret = netpatterns_setup_narray_tree(bcast_radix,
                                            i, module->group_size, &(sm_module->fanout_read_tree[i]));
        if (OMPI_SUCCESS != ret) {
            goto Error;
        }
    }

    ret = load_recursive_knomial_info(sm_module);
    if (OMPI_SUCCESS != ret) {
        BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree"));
        goto Error;
    }

    /* Allocate offsets array for lmsg reduce */
    ret = alloc_lmsg_reduce_offsets_array(sm_module);
    if (OMPI_SUCCESS != ret) {
        BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array"));
        goto Error;
    }

    /* initialize reduction tree */
    sm_module->reduction_tree = (netpatterns_tree_node_t *) malloc(
        sizeof(netpatterns_tree_node_t) * module->group_size);
    if (NULL == sm_module->reduction_tree) {
        goto Error;
    }

    ret = netpatterns_setup_multinomial_tree(
        cs->order_reduction_tree, module->group_size,
        sm_module->reduction_tree);
    if (MPI_SUCCESS != ret) {
        goto Error;
    }

    /* get largest power of k for given group size */
    sm_module->pow_k_levels = pow_sm_k(cs->k_nomial_radix,
                                       sm_module->super.sbgp_partner_module->group_size,
                                       &(sm_module->pow_k));

    /* get largest power of 2 for a given group size;
     * used in scatter allgather
     */
    sm_module->pow_2_levels = pow_sm_k(2,
                                       sm_module->super.sbgp_partner_module->group_size,
                                       &(sm_module->pow_2));

    /*
     * setup scatter data
     */
    sm_module->scatter_kary_radix = cs->scatter_kary_radix;
    sm_module->scatter_kary_tree = NULL;
    ret = netpatterns_setup_narray_tree_contigous_ranks(
        sm_module->scatter_kary_radix,
        sm_module->super.sbgp_partner_module->group_size,
        &(sm_module->scatter_kary_tree));
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_basesmuma_setup_library_buffers, scatter k-ary tree setup failed\n");
        return NULL;
    }

    /* setup the module shared memory management */
    ret = base_bcol_basesmuma_setup_library_buffers(sm_module, cs);

    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "In base_bcol_basesmuma_setup_library_buffers, the mpool was not successfully set up!\n");
        return NULL;
    }

    /* setup the collectives and memory management */

    /* check to see whether or not the mpool has been inited */
    /* allocate some space for the network contexts */
    if (!cs->mpool_inited) {
        /* if it's empty, then fill it for the first time */
        cs->super.network_contexts = (bcol_base_network_context_t **)
            malloc((cs->super.n_net_contexts) *
                   sizeof(bcol_base_network_context_t *));
        /* you need to do some basic setup - define the file name,
         * set data seg alignment and size of cntl structure in sm
         * file.
         */
        /* give the payload sm file a name */
        name_length = asprintf(&name,
                               "%s"OPAL_PATH_SEP"0%s%0d",
                               ompi_process_info.job_session_dir,
                               cs->payload_base_fname,
                               (int)getpid());
        if (0 > name_length) {
            opal_output (ompi_bcol_base_framework.framework_output, "Failed to assign the shared memory payload file a name\n");
            return NULL;
        }
        /* make sure the name is not too long */
        if (OPAL_PATH_MAX < (name_length - 1)) {
            opal_output (ompi_bcol_base_framework.framework_output, "Shared memory file name is too long!\n");
            return NULL;
        }
        /* set the name and alignment characteristics */
        sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc(
            sizeof(bcol_basesmuma_registration_data_t));
        sm_reg_data->file_name = name;

        sm_reg_data->data_seg_alignment = getpagesize();
        sm_reg_data->size_ctl_structure = 0;
        cs->super.network_contexts[0] = (bcol_base_network_context_t *)
            malloc(sizeof(bcol_base_network_context_t));
        cs->super.network_contexts[0]->context_data =
            (void *) sm_reg_data;
        cs->super.network_contexts[0]->
            register_memory_fn = mca_bcol_basesmuma_register_sm;
        cs->super.network_contexts[0]->
            deregister_memory_fn = mca_bcol_basesmuma_deregister_sm;
        sm_module->super.network_context = cs->super.network_contexts[0];
    } else {
        sm_module->super.network_context = cs->super.network_contexts[0];
    }

    /* Set the header size */
    sm_module->super.header_size = sizeof(mca_bcol_basesmuma_header_t);

    /* initialize the hdl module if it's to be enabled */
#if 0
    if (module->use_hdl) {
        sm_module->super.use_hdl = module->use_hdl;
        hdl_cli = (mca_base_component_list_item_t *)
            opal_list_get_first(&mca_hdl_base_components_in_use);
        sm_module->hdl_module = ((mca_hdl_base_component_t*)
                                 hdl_cli->cli_component)->hdl_comm_query(sm_module, &hdl_num);
        if (1 != hdl_num || sm_module->hdl_module == NULL) {
            ML_ERROR(("hdl modules are not successfully initialized!\n"));
            goto Error;
        }
    } else {
        sm_module->hdl_module = NULL;
    }
#else
    sm_module->hdl_module = NULL;
#endif

    /* collective setup */
    load_func(&(sm_module->super));
    load_func_with_choices(&(sm_module->super));

    /*
     * This initializes all collective algorithms
     */
    ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super));

    if (OMPI_SUCCESS != ret) {
        goto Error;
    }

    sm_module->super.supported_mode = 0;

    /* NTH: this is not set anywhere on the trunk as of 08/13/13 */
#if 0
    if (module->use_hdl) {
        sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;
    }
#endif

    /* Initialize the portals library, required for basesmuma large message */
#ifdef __PORTALS_AVAIL__
    /* Enable zero copy mode */
    sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;

    ret = mca_bcol_basesmuma_portals_init(cs);
    if (OMPI_SUCCESS != ret) {
        return NULL;
    }

    sm_module->sg_state.phase = INIT;

    ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*)
                      cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q,
                     PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq);

    if (ret != PTL_OK) {
        BASESMUMA_VERBOSE(10,("PtlEQAlloc() failed: %d", ret));
        return NULL;
    }

#endif
    /* blocking recursive double barrier test */
    /*
    {
        opal_output (ompi_bcol_base_framework.framework_output, "BBB About to hit the barrier test\n");
        int rc;
        bcol_function_args_t bogus;
        rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super));
        rc = bcol_basesmuma_recursive_double_barrier(
            &bogus, &(sm_module->super));
    }
    */

    /* in this case we only expect a single network context;
       in the future we should loop around this */
    sm_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *));
    if (!sm_modules) {
        opal_output (ompi_bcol_base_framework.framework_output, "In mca_bcol_basesmuma_comm_query, failed to allocate memory for sm_modules\n");
        return NULL;
    }

    sm_modules[0] = &(sm_module->super);

    return sm_modules;

Error:

    /* cleanup */
    if (sm_module->reduction_tree) {
        free(sm_module->reduction_tree);
        sm_module->reduction_tree = NULL;
    }

    return NULL;
}

OBJ_CLASS_INSTANCE(mca_bcol_basesmuma_module_t,
                   mca_bcol_base_module_t,
                   mca_bcol_basesmuma_module_construct,
                   mca_bcol_basesmuma_module_destruct);
@ -1,74 +0,0 @@

/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include "ompi/constants.h"

#include "bcol_basesmuma.h"

/* the progress function to be called from the opal progress function
 */
int bcol_basesmuma_progress(void)
{
    /* local variables */
    volatile int32_t *cntr;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /* check to see if release of memory blocks needs to be done */
    if (opal_list_get_size(&(cs->nb_admin_barriers))) {
        sm_nbbar_desc_t *item_ptr;
        opal_list_t *list = &(cs->nb_admin_barriers);
        /* process only if the list is non-empty */
        if (!OPAL_THREAD_TRYLOCK(&cs->nb_admin_barriers_mutex)) {

            for (item_ptr = (sm_nbbar_desc_t*) opal_list_get_first(list);
                 item_ptr != (sm_nbbar_desc_t*) opal_list_get_end(list);
                 item_ptr = (sm_nbbar_desc_t*) opal_list_get_next(item_ptr))
            {
                bcol_basesmuma_rd_nb_barrier_progress_admin(item_ptr);
                /* check to see if the barrier is complete */
                if (NB_BARRIER_DONE == item_ptr->collective_phase) {
                    /* barrier is complete - remove from the list.  No need
                     * to put it on another list, as it is part of the memory
                     * bank control structure, and will be picked up
                     * again when needed.
                     */
                    int index = item_ptr->pool_index;
                    /* old way - ctl_struct specific */
                    /*
                      volatile uint64_t *cntr = (volatile uint64_t *)
                      &(item_ptr->sm_module->colls_no_user_data.
                      ctl_buffs_mgmt[index].bank_gen_counter);
                    */

                    cntr = (volatile int32_t *) &(item_ptr->coll_buff->
                                                  ctl_buffs_mgmt[index].bank_gen_counter);
                    item_ptr = (sm_nbbar_desc_t*) opal_list_remove_item((opal_list_t *)list,
                                                                        (opal_list_item_t *)item_ptr);
                    /* increment the generation number */
                    OPAL_THREAD_ADD32(cntr, 1);
                }
            }

            OPAL_THREAD_UNLOCK(&cs->nb_admin_barriers_mutex);
        }
    }
    return OMPI_SUCCESS;
}
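The pattern here - poll each pending descriptor, retire the finished ones in place, and bump a generation counter so the memory bank can be reused - can be sketched without any OPAL machinery (hypothetical types, single-threaded for brevity):

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-ins for the nonblocking-barrier descriptors */
struct desc {
    struct desc *next;
    bool done;             /* set by the per-descriptor progress step */
    int *bank_gen_counter; /* generation counter to bump on completion */
};

/* walk the pending list; unlink and retire every completed descriptor */
static void progress_pending(struct desc **head)
{
    struct desc **link = head;
    while (*link) {
        struct desc *d = *link;
        /* ... the per-descriptor progress step would run here ... */
        if (d->done) {
            *link = d->next;          /* remove from the list in place */
            (*d->bank_gen_counter)++; /* make the bank reusable */
        } else {
            link = &d->next;
        }
    }
}

int main(void)
{
    int gen = 0;
    struct desc b = { NULL, true, &gen }, a = { &b, false, &gen };
    struct desc *head = &a;
    progress_pending(&head);
    printf("pending head done=%d, generation=%d\n", head->done, gen); /* 0, 1 */
    return 0;
}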
@ -1,218 +0,0 @@

/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/* Recursive doubling blocking barrier */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/sys/atomic.h"

#include "bcol_basesmuma.h"

#if 0
int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args,
                                            mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int ret = OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange, flag_to_set;
    int pair_rank, flag_offset;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank, pow_2;
    volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl;
    volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl;
    int64_t sequence_number;
    bool found;
    int buff_index, first_instance = 0;
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
#if 0
    fprintf(stderr,"Entering the sm rd barrier\n");
    fflush(stderr);
#endif

    /* get the pointer to the segment of control structures */
    my_exchange_node = &(bcol_module->recursive_doubling_tree);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    pow_2 = bcol_module->super.sbgp_partner_module->pow_2;

    /* figure out what instance of the basesmuma bcol I am */
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    sequence_number = input_args->sequence_num - c_input_args->bcol_module->squence_number_offset;

    buff_index = sequence_number & (bcol_module->colls_no_user_data.mask);

    idx = SM_ARRAY_INDEX(leading_dim, buff_index, 0);
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_no_user_data.ctl_buffs + idx;
    my_ctl = ctl_structs[my_rank];
    if (my_ctl->sequence_number < sequence_number) {
        first_instance = 1;
    }

    /* get the pool index */
    if (first_instance) {
        idx = -1;
        while (idx == -1) {
            idx = bcol_basesmuma_get_buff_index(
                &(bcol_module->colls_no_user_data), sequence_number);
        }
        if (-1 == idx) {
            return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
        }
        my_ctl->index = 1;
        /* this does not need to use any flag values, so only need to
         * set the value for subsequent operations that may need this */
        my_ctl->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl->index++;
        flag_offset = my_ctl->starting_flag_value;
    }

    /* signal that I have arrived */
    my_ctl->flag = -1;
    /* don't need to set this flag anymore */
    my_ctl->sequence_number = sequence_number;
    /* opal_atomic_wmb ();*/

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            int cnt = 0;

            /* I will participate in the exchange - wait for signal from extra
            ** process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl = (volatile mca_bcol_basesmuma_ctl_struct_t *)ctl_structs[extra_rank];

            /*partner_ctl=ctl_structs[extra_rank];*/
            partner_sn = (volatile int64_t *)&(partner_ctl->sequence_number);

            /* spin n iterations until partner registers */
            loop_cnt = 0;
            found = false;
            while (!found)
            {
                if (*partner_sn >= sequence_number) {
                    found = true;
                }
                cnt++;
                if (cnt == 1000) {
                    opal_progress();
                    cnt = 0;
                }
            }
        } else {
            /* Nothing to do, already registered that I am here */
        }
    }

    for (exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) {

        volatile int64_t *partner_sn;
        volatile int *partner_flag;
        int cnt = 0;

        /* rank of exchange partner */
        pair_rank = my_rank ^ (1 SHIFT_UP exchange);
        partner_ctl = ctl_structs[pair_rank];
        partner_sn = (volatile int64_t *)&(partner_ctl->sequence_number);
        partner_flag = (volatile int *)&(partner_ctl->flag);

        /* signal that I am at iteration exchange of the algorithm */
        flag_to_set = flag_offset + exchange;
        my_ctl->flag = flag_to_set;

        /* check to see if the partner has arrived */

        /* spin n iterations until partner registers */
        found = false;
        while (!found)
        {
            if ((*partner_sn > sequence_number) ||
                (*partner_sn == sequence_number &&
                 *partner_flag >= flag_to_set)) {
                found = true;
            } else {
                cnt++;
                if (cnt == 1000) {
                    opal_progress();
                    cnt = 0;
                }
            }
        }
    }

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXTRA_NODE == my_exchange_node->node_type) {
            int cnt = 0;

            /* I will not participate in the exchange -
             * wait for signal from extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl = ctl_structs[extra_rank];
            flag_to_set = flag_offset + my_exchange_node->log_2;

            /* spin n iterations until partner registers */
            found = false;
            while (!found)
            {
                if (IS_PEER_READY(partner_ctl, flag_to_set, sequence_number)) {
                    found = true;
                } else {
                    cnt++;
                    if (cnt == 1000) {
                        opal_progress();
                        cnt = 0;
                    }
                }
            }
        } else {
            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            flag_to_set = flag_offset + my_exchange_node->log_2;
            my_ctl->flag = flag_to_set;
        }
    }

    /* if I am the last instance of a basesmuma function in this collective,
     * release the resources */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        idx = bcol_basesmuma_free_buff(
            &(bcol_module->colls_no_user_data),
            sequence_number);
    } else {
        /* increment the flag value - so the next sm collective in the
         * hierarchy will not collide with the current one, as they share
         * the control structure */
        my_ctl->starting_flag_value += (my_exchange_node->log_2 + 1);
    }

    /* return */
    return ret;
}
#endif
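Each exchange pairs rank r with r XOR 2^exchange, so after log2(pow_2) rounds every rank has, transitively, heard from every other rank. A tiny stand-alone sketch of the pairing schedule (illustrative only):

#include <stdio.h>

int main(void)
{
    int pow_2 = 8; /* illustrative power-of-two group size */

    /* print each pair once per recursive-doubling round */
    for (int exchange = 0; (1 << exchange) < pow_2; exchange++) {
        printf("round %d:", exchange);
        for (int my_rank = 0; my_rank < pow_2; my_rank++) {
            int pair_rank = my_rank ^ (1 << exchange); /* the "1 SHIFT_UP exchange" above */
            if (my_rank < pair_rank) {
                printf(" %d<->%d", my_rank, pair_rank);
            }
        }
        printf("\n");
    }
    return 0;
}

For pow_2 = 8 this prints three rounds (stride 1, 2, 4); ranks outside the power-of-two set are handled by the extra-source pre/post phases seen in the code.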
@ -1,462 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2012 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
/* we need make cleanup with all these includes START */
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/bcol/bcol.h"
|
||||
#include "bcol_basesmuma.h"
|
||||
#include "opal/sys/atomic.h"
|
||||
#include "ompi/patterns/net/netpatterns.h"
|
||||
#include "ompi/mca/bcol/base/base.h"
|
||||
|
||||
/*
|
||||
* Initialize nonblocking barrier. This is code specific for handling
|
||||
* the recycling of data, and uses only a single set of control buffers.
|
||||
* It also assumes that for a given process, only a single outstanding
|
||||
* barrier operation will occur for a given control structure,
|
||||
* with the sequence number being used for potential overlap in time
|
||||
* between succesive barrier calls on different processes.
|
||||
*/
|
||||
int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc)
{
    /* local variables */
    int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
    int pair_rank;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank;
    mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    int64_t bank_genaration;
    bool found;
    int pool_index=sm_desc->pool_index;
    mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;

    /* get the pointer to the segment of control structures */
    idx=sm_desc->coll_buff->number_of_buffs+pool_index;
    leading_dim=sm_desc->coll_buff->size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,idx,0);
    ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **)
        sm_desc->coll_buff->ctl_buffs+idx;
    bank_genaration= sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node=&(bcol_module->recursive_doubling_tree);
    my_rank=bcol_module->super.sbgp_partner_module->my_index;
    my_ctl=ctl_structs[my_rank];
    /* debug print */
    /*
    {
        int ii;
        for(ii = 0; ii < 6; ii++) {
            fprintf(stderr,"UUU ctl_struct[%d] := %p\n",ii,
                    bcol_module->colls_no_user_data.ctl_buffs[ii]);
            fflush(stderr);
        }
    }
    */
    /* end debug */

    /* signal that I have arrived */
    my_ctl->flag = -1;

    opal_atomic_wmb ();

    /* don't need to set this flag anymore */
    my_ctl->sequence_number = bank_genaration;

    if(0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            /* I will participate in the exchange - wait for the signal from the
            ** extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);

            /* spin n iterations until the partner registers */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops )
            {
                if( *partner_sn >= bank_genaration ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_PRE_PHASE;
                return OMPI_SUCCESS;
            }

        } else {

            /* Nothing to do; I have already registered that I am here */
        }
    }

    for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) {

        volatile int64_t *partner_sn;
        volatile int *partner_flag;

        /* rank of the exchange partner */
        pair_rank = my_rank ^ ( 1 << exchange );
        partner_ctl=ctl_structs[pair_rank];
        partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
        partner_flag=(volatile int *)&(partner_ctl->flag);

        /* signal that I am at iteration "exchange" of the algorithm */
        my_ctl->flag = exchange;

        /* check to see if the partner has arrived */

        /* spin n iterations until the partner registers */
        loop_cnt=0;
        found=false;
        while( loop_cnt < bcol_module->super.n_poll_loops )
        {
            if( (*partner_sn > bank_genaration) ||
                ( *partner_sn == bank_genaration &&
                  *partner_flag >= exchange ) ) {
                found=true;
                break;
            }

            loop_cnt++;

        }
        if( !found ) {
            /* set restart parameters */
            sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration=exchange;
            return OMPI_SUCCESS;
        }

    }

    if(0 < my_exchange_node->n_extra_sources) {
        if ( EXTRA_NODE == my_exchange_node->node_type ) {
            volatile int64_t *partner_sn;
            volatile int *partner_flag;

            /* I will not participate in the exchange -
             * wait for the signal from the extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
            partner_flag=(volatile int *)&(partner_ctl->flag);

            /* spin n iterations until the partner registers */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops )
            {
                if( (*partner_sn > bank_genaration) ||
                    ( (*partner_sn == bank_genaration) &&
                      (*partner_flag == (my_exchange_node->log_2)) ) ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_POST_PHASE;
                return OMPI_SUCCESS;
            }

        } else {

            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            my_ctl->flag = my_exchange_node->n_exchanges;

        }
    }

    /* set the barrier as complete */
    sm_desc->collective_phase=NB_BARRIER_DONE;
    /* return */
    return ret;
}

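/* [Editor's sketch - not part of the removed file.]  A minimal, self-contained
 * illustration of the pairing rule used by the barrier functions in this
 * file: in round k of recursive doubling, rank r synchronizes with rank
 * r ^ (1 << k), and a power-of-two group of size n needs log2(n) rounds.
 * All names below are hypothetical and chosen for the example only. */
#include <stdio.h>

static void print_rd_schedule(int group_size)
{
    int n_rounds = 0;

    /* log2(group_size) exchanges, assuming group_size is a power of two */
    while ((1 << n_rounds) < group_size) {
        n_rounds++;
    }
    for (int rank = 0; rank < group_size; rank++) {
        for (int k = 0; k < n_rounds; k++) {
            /* the same XOR used to compute pair_rank above */
            printf("round %d: rank %d <-> rank %d\n", k, rank, rank ^ (1 << k));
        }
    }
}

int main(void)
{
    print_rd_schedule(8);
    return 0;
}
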
/* admin nonblocking barrier - progress function */
int bcol_basesmuma_rd_nb_barrier_progress_admin(sm_nbbar_desc_t *sm_desc)
{
    /* local variables */
    int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
    int pair_rank, start_index, restart_phase;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank;
    mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    int64_t bank_genaration;
    int pool_index=sm_desc->pool_index;
    bool found;
    mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;

    /* get the pointer to the segment of control structures */
    idx = sm_desc->coll_buff->number_of_buffs+pool_index;
    leading_dim = sm_desc->coll_buff->size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim,idx,0);
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        sm_desc->coll_buff->ctl_buffs+idx;
    bank_genaration = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node=&(bcol_module->recursive_doubling_tree);
    my_rank=bcol_module->super.sbgp_partner_module->my_index;
    my_ctl=ctl_structs[my_rank];

    /* check to make sure that this should be progressed */
    if( ( sm_desc->collective_phase == NB_BARRIER_INACTIVE ) ||
        ( sm_desc->collective_phase == NB_BARRIER_DONE ) )
    {
        return OMPI_SUCCESS;
    }

    /* set the restart up - and jump to the correct place in the algorithm */
    restart_phase=sm_desc->collective_phase;
    if ( NB_PRE_PHASE == restart_phase ) {
        start_index=0;
    } else if ( NB_RECURSIVE_DOUBLING == restart_phase ) {
        start_index=sm_desc->recursive_dbl_iteration;
        goto Exchange_phase;
    } else {
        goto Post_phase;
    }

    if(0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            /* I will participate in the exchange - wait for the signal from the
            ** extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);

            /* spin n iterations until the partner registers */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops )
            {
                if( *partner_sn >= bank_genaration ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_PRE_PHASE;
                return OMPI_SUCCESS;
            }

        } else {

            /* Nothing to do; I have already registered that I am here */
        }
    }

Exchange_phase:

    for(exchange = start_index;
        exchange < my_exchange_node->n_exchanges; exchange++) {

        volatile int64_t *partner_sn;
        volatile int *partner_flag;

        /* rank of the exchange partner */
        pair_rank = my_rank ^ ( 1 << exchange );
        partner_ctl=ctl_structs[pair_rank];
        partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
        partner_flag=(volatile int *)&(partner_ctl->flag);

        /* signal that I am at iteration "exchange" of the algorithm */
        my_ctl->flag = exchange;

        /* check to see if the partner has arrived */

        /* spin n iterations until the partner registers */
        loop_cnt=0;
        found=false;
        while( loop_cnt < bcol_module->super.n_poll_loops )
        {
            if( (*partner_sn > bank_genaration) ||
                ( (*partner_sn == bank_genaration) &&
                  (*partner_flag >= exchange) ) ) {
                found=true;
                break;
            }
            loop_cnt++;
        }
        if( !found ) {
            /* set restart parameters */
            sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration=exchange;
            return OMPI_SUCCESS;
        }

    }

Post_phase:
    if(0 < my_exchange_node->n_extra_sources) {
        if ( EXTRA_NODE == my_exchange_node->node_type ) {
            volatile int64_t *partner_sn;
            volatile int *partner_flag;

            /* I will not participate in the exchange -
             * wait for the signal from the extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
            partner_flag=(volatile int *)&(partner_ctl->flag);

            /* spin n iterations until the partner registers */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops )
            {
                if( (*partner_sn > bank_genaration) ||
                    ( *partner_sn == bank_genaration &&
                      *partner_flag == (my_exchange_node->log_2) ) ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_POST_PHASE;
                return OMPI_SUCCESS;
            }

        } else {

            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            my_ctl->flag = my_exchange_node->n_exchanges;

        }
    }

    /* set the barrier as complete */
    sm_desc->collective_phase=NB_BARRIER_DONE;

    /* return */
    return ret;
}

static int bcol_basesmuma_memsync(bcol_function_args_t *input_args,
                                  mca_bcol_base_function_t *c_input_args)
{
    int rc;
    int memory_bank = input_args->root;

    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data);
    sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);

    sm_desc->coll_buff = buff_block;
    /*
    printf("XXX SYNC call\n");
    */

    rc = bcol_basesmuma_rd_nb_barrier_init_admin(sm_desc);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    if (NB_BARRIER_DONE != sm_desc->collective_phase) {
        mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
        opal_list_t *list=&(cs->nb_admin_barriers);
        opal_list_item_t *append_item;

        /* put this onto the progression list */
        OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex));
        append_item=(opal_list_item_t *)
            &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
        opal_list_append(list,append_item);
        OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex));
        /* progress communications so that resources can be freed up */
        return BCOL_FN_STARTED;
    }

    /* Done - bump the counter */
    (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++;
    /*
    printf("XXX SYNC call done \n");
    */
    return BCOL_FN_COMPLETE;
}

static int bcol_basesmuma_memsync_progress(bcol_function_args_t *input_args,
                                           mca_bcol_base_function_t *c_input_args)
{
    int memory_bank = input_args->root;

    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    sm_buffer_mgmt *buff_block = &(bcol_module->colls_with_user_data);
    sm_nbbar_desc_t *sm_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);

    /* I do not have to do anything here, since the progress is
       done by the basesmuma progress engine */

    if (NB_BARRIER_DONE != sm_desc->collective_phase) {
        return BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}

int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_SYNC;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
                                 &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_memsync,
                                 bcol_basesmuma_memsync_progress);

    return OMPI_SUCCESS;
}
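/* [Editor's sketch - not part of the removed file.]  The memsync code above
 * follows a common nonblocking pattern: the init function returns
 * BCOL_FN_STARTED when the barrier cannot finish immediately, the descriptor
 * is parked on a progression list, and a progress function is then polled
 * until the descriptor reaches its DONE phase.  Below is a stripped-down
 * version of that state machine; the names are hypothetical stand-ins for
 * the bcol types. */
#include <stdio.h>

enum { FN_STARTED, FN_COMPLETE };
enum { PHASE_ACTIVE, PHASE_DONE };

struct nb_desc {
    int phase;      /* where the collective currently is */
    int work_left;  /* stand-in for unfinished peers */
};

static int nb_init(struct nb_desc *d, int work)
{
    d->work_left = work;
    d->phase = (0 == work) ? PHASE_DONE : PHASE_ACTIVE;
    return (PHASE_DONE == d->phase) ? FN_COMPLETE : FN_STARTED;
}

static int nb_progress(struct nb_desc *d)
{
    /* one unit of work completes per progress call */
    if (d->work_left > 0 && 0 == --d->work_left) {
        d->phase = PHASE_DONE;
    }
    return (PHASE_DONE == d->phase) ? FN_COMPLETE : FN_STARTED;
}

int main(void)
{
    struct nb_desc d;
    int rc = nb_init(&d, 3);
    while (FN_COMPLETE != rc) {   /* the caller polls, as the progress engine does */
        rc = nb_progress(&d);
    }
    puts("sync complete");
    return 0;
}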
@ -1,382 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/bcol.h"

#include "opal/include/opal_stdint.h"

#include "bcol_basesmuma.h"
#include "bcol_basesmuma_reduce.h"
/**
 * gvm - Shared memory reduce
 */

static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args,
                                                      mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_REDUCE;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1048576;
    comm_attribs.data_src = DATA_SRC_KNOWN;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000;
    inv_attribs.datatype_bitmap = 0x11111111;
    inv_attribs.op_types_bitmap = 0x11111111;


    /* Set attributes for fanin fanout algorithm */
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_reduce_intra_fanin,
                                 bcol_basesmuma_reduce_intra_fanin_progress);

    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL);

    return OMPI_SUCCESS;
}

/*
 * Small data fanin reduce
 * ML buffers are used for both the payload and the control structures
 * This function works with the hierarchical allreduce and
 * the progress engine
 */
static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node,
                                   int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype,
                                   volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) {
    volatile mca_bcol_basesmuma_header_t * child_ctl_pointer;
    int bcol_id = (int) bcol_module->super.bcol_id;
    int64_t sequence_number = my_ctl_pointer->sequence_number;
    int8_t ready_flag = my_ctl_pointer->ready_flag;
    int group_size = bcol_module->colls_no_user_data.size_of_group;

    if (LEAF_NODE != my_reduction_node->my_node_type) {
        volatile char *child_data_pointer;
        volatile void *child_rbuf;

        /* for each child */
        /* my_result_data = child_result_data (op) my_source_data */

        for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) {
            int child_rank = my_reduction_node->children_ranks[child] + process_shift;

            if (group_size <= child_rank){
                child_rank -= group_size;
            }

            child_ctl_pointer = data_buffs[child_rank].ctl_struct;
            child_data_pointer = data_buffs[child_rank].payload;

            if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) {
                *iteration = child;
                return BCOL_FN_STARTED;
            }

            child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id];

            ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count, dtype);
        } /* end child loop */
    }

    if (ROOT_NODE != my_reduction_node->my_node_type) {
        opal_atomic_wmb ();
        my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag;
    }

    return BCOL_FN_COMPLETE;
}

static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args,
                                                      mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t *my_reduction_node;
    int my_rank, my_node_index;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int leading_dim, idx;

    /* Buffer index */
    int buff_idx = input_args->src_desc->buffer_index;

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    void *data_addr = (void *)input_args->src_desc->data_addr;
    volatile void *rbuf;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Get the control structure and the payload buffer */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    my_node_index = my_rank - input_args->root;
    if (0 > my_node_index) {
        int group_size = bcol_module->colls_no_user_data.size_of_group;
        my_node_index += group_size;
    }

    my_reduction_node = bcol_module->reduction_tree + my_node_index;
    rbuf = (volatile void *)((uintptr_t) data_addr + input_args->rbuf_offset);

    return reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype,
                            data_buffs, input_args->count, input_args->op, input_args->root);
}

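/* [Editor's sketch - not part of the removed file.]  reduce_children() above
 * accumulates each child's buffer into the local result buffer with
 * ompi_op_reduce().  This toy version shows the same fan-in over a flat
 * parent/child table for integer addition; every identifier below is
 * hypothetical and exists only for illustration. */
#include <stdio.h>

#define N_RANKS 7

int main(void)
{
    /* a binary fan-in tree rooted at rank 0: parent[r] for each rank */
    int parent[N_RANKS] = { -1, 0, 0, 1, 1, 2, 2 };
    int contrib[N_RANKS], result[N_RANKS];

    for (int r = 0; r < N_RANKS; r++) {
        contrib[r] = r + 1;       /* each rank contributes its own value */
        result[r] = contrib[r];
    }
    /* leaves first: each child's partial result is reduced into its parent,
     * mirroring my_result_data = child_result_data (op) my_source_data */
    for (int r = N_RANKS - 1; r > 0; r--) {
        result[parent[r]] += result[r];
    }
    printf("root result = %d (expected %d)\n", result[0],
           N_RANKS * (N_RANKS + 1) / 2);
    return 0;
}
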
int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args,
                                      mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int rc=BCOL_FN_COMPLETE;
    int my_rank,group_size,my_node_index;
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t *my_reduction_node;
    volatile int8_t ready_flag;
    int bcol_id = (int) bcol_module->super.bcol_id;
    volatile void *sbuf,*rbuf;
    int sbuf_offset,rbuf_offset;
    int root,count;
    int64_t sequence_number=input_args->sequence_num;
    struct ompi_datatype_t *dtype;
    int leading_dim,idx;

    /* Buffer index */
    int buff_idx = input_args->src_desc->buffer_index;

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char * my_data_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    void *data_addr = (void *)input_args->src_desc->data_addr;

#if 0
    fprintf(stderr,"777 entering sm reduce \n");
#endif

    /* get addressing information */
    my_rank=bcol_module->super.sbgp_partner_module->my_index;
    group_size=bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);

    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;
    /* fprintf(stderr,"AAA the devil!!\n"); */
    /* Get the control structure and the payload buffer */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_data_pointer = (volatile char *)data_addr;

    /* Align the node index around the sbgp root */
    root = input_args->root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /* get the arguments */
    sbuf_offset = input_args->sbuf_offset;
    rbuf_offset = input_args->rbuf_offset;
    sbuf = (volatile void *)(my_data_pointer + sbuf_offset);
    data_buffs[my_rank].payload = (void*)sbuf;
    rbuf = (volatile void *)(my_data_pointer + rbuf_offset);
    count = input_args->count;
    dtype = input_args->dtype;

    /* Cache my rbuf_offset */
    my_ctl_pointer->roffsets[bcol_id] = rbuf_offset;

    /* get my node in the reduction tree */
    my_reduction_node=&(bcol_module->reduction_tree[my_node_index]);

    /* init the header */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type);

    /* set the starting point for the progress loop */
    *iteration = 0;
    my_ctl_pointer->ready_flag = ready_flag;

    if (sbuf != rbuf) {
        rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
                                                 (char *)sbuf);
        if( 0 != rc ) {
            return OMPI_ERROR;
        }
    }

    rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype,
                          data_buffs, count, input_args->op, root);

    /* Flag value in case other bcols are called */
    my_ctl_pointer->starting_flag_value[bcol_id]++;

    /* Recycle the payload buffers */

    return rc;
}

/* Small data fanin reduce
 * Uses an SM buffer (backed by an SM file) for both the control structures
 * and the payload
 *
 * NTH: How does this differ from the new one? Can we replace this
 * with a call to the new init and then a call to the new progress until
 * complete?
 */
int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args,
                                          mca_bcol_base_function_t *c_input_args)
{
    /* local variables */
    int rc=OMPI_SUCCESS;
    int my_rank,group_size,process_shift,my_node_index;
    int n_children,child;
    mca_bcol_basesmuma_module_t* bcol_module =
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t *my_reduction_node;
    volatile int8_t ready_flag;
    volatile void *sbuf,*rbuf;
    int sbuf_offset,rbuf_offset;
    int root,count;
    struct ompi_op_t *op;
    int64_t sequence_number=input_args->sequence_num;
    struct ompi_datatype_t *dtype;
    int leading_dim,idx;
    int buff_idx;
    int child_rank;
    int bcol_id = (int) bcol_module->super.bcol_id;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char * my_data_pointer;
    volatile char * child_data_pointer;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t * child_ctl_pointer;

#if 0
    fprintf(stderr,"Entering fanin reduce \n");
#endif

    /* Buffer index */
    buff_idx = input_args->src_desc->buffer_index;
    /* get addressing information */
    my_rank=bcol_module->super.sbgp_partner_module->my_index;
    group_size=bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);

    /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs+idx;*/
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;

    /* Get the control structure and the payload buffer */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    my_data_pointer = (volatile char *) data_buffs[my_rank].payload;

    /* Align the node index around the sbgp root */
    root = input_args->root;
    process_shift = root;
    my_node_index = my_rank - root;
    if (0 > my_node_index ) {
        my_node_index += group_size;
    }

    /* get the arguments */
    sbuf_offset = input_args->sbuf_offset;
    rbuf_offset = input_args->rbuf_offset;
    sbuf = (volatile void *)(my_data_pointer + sbuf_offset);
    rbuf = (volatile void *)(my_data_pointer + rbuf_offset);
    op = input_args->op;
    count = input_args->count;
    dtype = input_args->dtype;

    /* get my node in the reduction tree */
    my_reduction_node=&(bcol_module->reduction_tree[my_node_index]);
    n_children=my_reduction_node->n_children;

    /* init the header */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type);

    rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
                                             (char *)sbuf);
    if (0 != rc) {
        return OMPI_ERROR;
    }

    if (LEAF_NODE != my_reduction_node->my_node_type) {
        volatile void *child_rbuf;
        /* for each child */
        /* my_result_data = child_result_data (op) my_source_data */

        for (child = 0 ; child < n_children ; ++child) {
            child_rank = my_reduction_node->children_ranks[child];
            child_rank += process_shift;

            /* wrap around */
            if( group_size <= child_rank ){
                child_rank-=group_size;
            }

            /*child_ctl_pointer = ctl_structs[child_rank];*/
            child_ctl_pointer = data_buffs[child_rank].ctl_struct;
            child_data_pointer = data_buffs[child_rank].payload;

            child_rbuf = child_data_pointer + rbuf_offset;
            /* wait until the child's data is ready for use */
            while (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) {
                opal_progress();
            }

            /* apply the collective operation */
            ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count,dtype);
        } /* end child loop */
    }

    if (ROOT_NODE != my_reduction_node->my_node_type) {
        opal_atomic_wmb ();
        my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag;
    }

    my_ctl_pointer->starting_flag_value[bcol_id]++;

    return rc;
}
@ -1,92 +0,0 @@
#ifndef __BASESMUMA_REDUCE_H_

#define __BASESMUMA_REDUCE_H_

#include "ompi_config.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "bcol_basesmuma_utils.h"
#include <unistd.h>

enum {
    BLOCK_OFFSET = 0,
    LOCAL_REDUCE_SEG_OFFSET,
    BLOCK_COUNT,
    SEG_SIZE,
    NOFFSETS
};

int compute_knomial_reduce_offsets(int group_index, int count, struct
                                   ompi_datatype_t *dtype,int k_radix,int n_exchanges,
                                   int **offsets);

int compute_knomial_reduce_offsets_reverse(int group_index, int count, struct
                                           ompi_datatype_t *dtype,int k_radix,int n_exchanges,
                                           int **offsets);

int bcol_basesmuma_lmsg_reduce_recursivek_scatter_reduce(mca_bcol_basesmuma_module_t *sm_module,
                                                         const int buffer_index, void *sbuf,
                                                         void *rbuf,
                                                         struct ompi_op_t *op,
                                                         const int count, struct ompi_datatype_t *dtype,
                                                         const int relative_group_index,
                                                         const int padded_start_byte,
                                                         volatile int8_t ready_flag,
                                                         volatile mca_bcol_basesmuma_payload_t *data_buffs);

int bcol_basesmuma_lmsg_reduce_knomial_gather(mca_bcol_basesmuma_module_t *basesmuma_module,
                                              const int buffer_index,
                                              void *sbuf,void *rbuf, int count, struct
                                              ompi_datatype_t *dtype,
                                              const int my_group_index,
                                              const int padded_start_byte,
                                              volatile int8_t rflag,
                                              volatile mca_bcol_basesmuma_payload_t *data_buffs);

int bcol_basesmuma_lmsg_reduce_extra_root(mca_bcol_basesmuma_module_t *sm_module,
                                          const int buffer_index, void *sbuf,
                                          void *rbuf,
                                          struct ompi_op_t *op,
                                          const int count, struct ompi_datatype_t *dtype,
                                          const int relative_group_index,
                                          const int padded_start_byte,
                                          volatile int8_t rflag,
                                          volatile mca_bcol_basesmuma_payload_t *data_buffs);



int bcol_basesmuma_lmsg_reduce_extra_non_root(mca_bcol_basesmuma_module_t *sm_module,
                                              const int buffer_index, void *sbuf,
                                              void *rbuf,
                                              int root,
                                              struct ompi_op_t *op,
                                              const int count, struct ompi_datatype_t *dtype,
                                              const int relative_group_index,
                                              const int group_size,
                                              const int padded_start_byte,
                                              volatile int8_t rflag,
                                              volatile mca_bcol_basesmuma_payload_t *data_buffs);

int bcol_basesmuma_lmsg_reduce(bcol_function_args_t *input_args,
                               mca_bcol_base_function_t *c_input_args);

int bcol_basesmuma_lmsg_reduce_extra(bcol_function_args_t *input_args,
                                     mca_bcol_base_function_t *c_input_args);

void basesmuma_reduce_recv(int my_group_index, int peer,
                           void *recv_buffer,
                           int recv_size,
                           volatile int8_t ready_flag_val,
                           volatile mca_bcol_basesmuma_payload_t *data_buffs);

void basesmuma_reduce_send(int my_group_index,
                           int peer,
                           void *send_buffer,
                           int snd_size,
                           int send_offset,
                           volatile int8_t ready_flag_val,
                           volatile mca_bcol_basesmuma_payload_t *data_buffs);

#endif
@ -1,442 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h"

/*
#define IS_BARRIER_READY(peer, my_flag, my_sequence_number)\
    (((peer)->sequence_number == (my_sequence_number) && \
      (peer)->flags[BARRIER_RKING_FLAG][bcol_id] >= (my_flag) \
     )? true : false )
*/

#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \
do{                                                             \
    for( j = 0; j < (tree_order - 1); j++){                     \
        if( 0 > peers[j] ) {                                    \
            /* set the bit */                                   \
            *active_requests ^= (1<<j);                         \
        }                                                       \
    }                                                           \
}while(0)



/*
 * Recursive K-ing barrier
 */

/*
 * Recursive k-ing algorithm
 * Example: k=3, n=9
 *
 * Number of exchange steps = log_k(n)
 * Number of steps within an exchange step = k (the radix)
 */
int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args,
                                         struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int flag_offset = 0;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;
    int bcol_id = (int) bcol_module->super.bcol_id;

    int i, j, probe;
    int src;

    int pow_k, tree_order;
    int max_requests = 0; /* important to initialize this */

    bool matched;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0
    fprintf(stderr,"entering sm barrier sn = %d buff index = %d\n",sequence_number,input_args->buffer_index);
#endif
    /* initialize the iteration counter */
    buff_idx = input_args->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;
    /* Set the pointer to the current proc's ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* init the header */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
    /* initialize these */
    *iteration = 0;
    *active_requests = 0;
    *status = 0;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests:
     * at each level each rank communicates with
     * at most (k - 1) peers,
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_requests == 2^(k - 1) - 1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^= (1<<i);
    }
    /* let's begin the collective, starting with the extra ranks and their
     * respective proxies
     */

    if( EXTRA_NODE == exchange_node->node_type ) {

        /* then I will signal to my proxy rank */

        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
        ready_flag = flag_offset + 1 + pow_k + 2;
        /* now, poll for completion */

        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        for( i = 0; i < cm->num_to_probe ; i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                goto FINISHED;
            }

        }

        /* cache the state and bail */
        *iteration = -1;
        return BCOL_FN_STARTED;

    }else if ( 0 < exchange_node->n_extra_sources ) {

        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* probe for the extra rank's arrival */
        for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                /* copy it in */
                matched = true;
                break;
            }
        }

        if (!matched) {
            *status = ready_flag;
            *iteration = -1;
            return BCOL_FN_STARTED;
        }
    }

    /* bump the ready flag */
    ready_flag++;

    /* we start the recursive k-ing phase */
    for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
        /* announce my arrival */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
        /* calculate the number of active requests */
        CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);
        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {

            /* recv phase */
            src = exchange_node->rank_exchanges[*iteration][j];
            if( src < 0 ) {
                /* then not a valid rank, continue */
                continue;
            }

            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if( !(*active_requests&(1<<j))) {
                /* then the bit hasn't been set, thus this peer
                 * hasn't been processed at this level.
                 * I am putting the probe loop as the innermost loop to achieve
                 * better temporal locality; this comes at a cost to asynchronicity
                 * but should get better cache performance
                 */
                for( probe = 0; probe < cm->num_to_probe ; probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                        /* set this request's bit */
                        *active_requests ^= (1<<j);
                        break;
                    }
                }
            }

        }
        if( max_requests == *active_requests ){
            /* bump the ready flag */
            ready_flag++;
            /* reset the active requests */
            *active_requests = 0;
        } else {
            /* cache the state and hop out;
             * only the iteration needs to be tracked
             */
            *status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra rank */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;

    }

FINISHED:

    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}


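/* [Editor's sketch - not part of the removed file.]  The function above tracks
 * per-level completion with a bitmask: with radix k each rank talks to at most
 * k - 1 peers per level, so setting bits 0..k-2 gives
 * max_requests == 2^(k-1) - 1, and the number of levels is log_k(n).  A small
 * stand-alone check of that arithmetic (all names hypothetical): */
#include <stdio.h>

int main(void)
{
    int tree_order = 3;   /* the radix k */
    int group_size = 9;   /* n */
    int max_requests = 0, pow_k = 0, span = 1;

    /* same bit-twiddling as in the barrier code above */
    for (int i = 0; i < tree_order - 1; i++) {
        max_requests ^= (1 << i);
    }
    /* number of exchange levels: the smallest pow_k with k^pow_k >= n */
    while (span < group_size) {
        span *= tree_order;
        pow_k++;
    }
    printf("k=%d n=%d -> levels=%d, per-level mask=0x%x\n",
           tree_order, group_size, pow_k, max_requests);
    return 0;
}
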
/* barrier progress function */

int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
                                             struct mca_bcol_base_function_t *const_args)
{

    /* local variables */
    int flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int *iter = iteration; /* double alias */
    int leading_dim, idx, buff_idx;

    int i, j, probe;
    int src;
    int max_requests = 0; /* critical to set this */
    int pow_k, tree_order;
    int bcol_id = (int) bcol_module->super.bcol_id;

    bool matched;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0
    fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank,
            *active_requests,*iter,*status);
#endif
    buff_idx = buffer_index;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);

    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
    ready_flag = *status;
    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests:
     * at each level each rank communicates with
     * at most (k - 1) peers,
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_requests == 2^(k - 1) - 1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^= (1<<i);
    }

    /* let's begin the collective, starting with the extra ranks and their
     * respective proxies
     */

    if( EXTRA_NODE == exchange_node->node_type ) {

        /* If I'm in here, then I must be looking for data */
        ready_flag = flag_offset + 1 + pow_k + 2;

        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        for( i = 0; i < cm->num_to_probe ; i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                goto FINISHED;
            }

        }

        /* haven't found it; the state is cached, bail out */
        return BCOL_FN_STARTED;

    }else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) {

        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* probe for the extra rank's arrival */
        for( i = 0, matched = false ; i < cm->num_to_probe && !matched ; i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                matched = true;
                /* bump the flag */
                ready_flag++;
                *iteration = 0;
                break;
            }
        }

        if (!matched) {
            return BCOL_FN_STARTED;
        }
    }

    /* start the recursive k-ing phase */
    for( *iter=*iteration; *iter < pow_k; (*iter)++) {
        /* I am ready at this level */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
        if( 0 == *active_requests ) {
            /* flip some bits, if we don't have active requests from a previous visit */
            CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order);
        }
        for( j = 0; j < (tree_order - 1); j++ ) {

            /* recv phase */
            src = exchange_node->rank_exchanges[*iter][j];
            if( src < 0 ) {
                /* then not a valid rank, continue */
                continue;
            }

            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if( !(*active_requests&(1<<j))){

                /* I am putting the probe loop as the innermost loop to achieve
                 * better temporal locality
                 */
                for( probe = 0; probe < cm->num_to_probe ; probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                        /* flip the request's bit */
                        *active_requests ^= (1<<j);
                        break;
                    }
                }
            }

        }
        if( max_requests == *active_requests ){
            /* bump the ready flag */
            ready_flag++;
            /* reset the active requests for the next level;
             * it logically makes sense to do it here, since we don't
             * want to inadvertently flip a bit to zero that we
             * set previously
             */
            *active_requests = 0;
        } else {
            /* the state is saved, hop out
             */
            *status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }
    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra rank */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;

    }

FINISHED:

    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}

/* Register k-nomial barrier functions to the BCOL function table,
 * so they can be selected
 */
int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_BARRIER;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_k_nomial_barrier_init,
                                 bcol_basesmuma_k_nomial_barrier_progress);

    return OMPI_SUCCESS;
}

@ -1,588 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014      Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2014      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/mca/mpool/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/comm/coll_ops.h"

#include "opal/class/opal_object.h"
#include "opal/dss/dss.h"

#include "bcol_basesmuma.h"

int base_bcol_basesmuma_setup_ctl_struct(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs,
    sm_buffer_mgmt *ctl_mgmt);

/* this is the new one, it uses the pml allgather */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret=OMPI_SUCCESS,i;
    int count;
    int index_in_group;
    char *send_buff;
    char *recv_buff;
    uint64_t rem_mem_offset;

    /* malloc some memory */
    count = sizeof(uint64_t) + sizeof(int);
    send_buff = (char *) malloc(count);
    recv_buff = (char *) malloc(count *
                                sm_bcol_module->super.sbgp_partner_module->group_size);
    /* exchange the base pointer for the control structures - gather
     * everyone else's information.
     */

    /* pack the offset of the allocated region */
    memcpy((void *) send_buff, (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index), sizeof(int));
    memcpy((void *) (send_buff+ sizeof(int)), (void *) &(mem_offset), sizeof(uint64_t));

    /* get the offsets from all procs, so we can set up the control data
     * structures.
     */

    ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count,
                           MPI_BYTE,
                           sm_bcol_module->super.sbgp_partner_module->my_index,
                           sm_bcol_module->super.sbgp_partner_module->group_size,
                           sm_bcol_module->super.sbgp_partner_module->group_list,
                           sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in the memory layout of each process
     */

    /* loop over the procs in the group */
    for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
        int array_id;
        /* get this peer's index in the group */
        memcpy((void *) &index_in_group, (void *) (recv_buff + i*count) , sizeof(int));

        /* get the offset */
        memcpy((void *) &rem_mem_offset, (void *) (recv_buff + i*count + sizeof(int)), sizeof(uint64_t));

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;

    }

exit_ERROR:
    /* clean up */
    if( NULL != send_buff ) {
        free(send_buff);
        send_buff = NULL;
    }
    if( NULL != recv_buff ) {
        free(recv_buff);
        recv_buff = NULL;
    }

    return ret;

}

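/* [Editor's sketch - not part of the removed file.]  The function above packs
 * (my_index, mem_offset) into a flat byte buffer, allgathers one such record
 * per rank, then unpacks each record into a result table indexed by the
 * sender's group index.  The same wire format is shown here without MPI;
 * the simulated ranks and all names are hypothetical. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GROUP_SIZE 4

int main(void)
{
    size_t count = sizeof(int) + sizeof(uint64_t);
    char recv_buff[GROUP_SIZE * (sizeof(int) + sizeof(uint64_t))];
    uint64_t offsets[GROUP_SIZE];

    /* "allgather": every rank contributes index + offset, here simulated */
    for (int r = 0; r < GROUP_SIZE; r++) {
        uint64_t my_offset = 0x1000u * (unsigned) (r + 1);
        memcpy(recv_buff + r * count, &r, sizeof(int));
        memcpy(recv_buff + r * count + sizeof(int), &my_offset, sizeof(uint64_t));
    }
    /* unpack phase, as in the loop over the procs in the group above */
    for (int i = 0; i < GROUP_SIZE; i++) {
        int index_in_group;
        uint64_t rem_mem_offset;
        memcpy(&index_in_group, recv_buff + i * count, sizeof(int));
        memcpy(&rem_mem_offset, recv_buff + i * count + sizeof(int), sizeof(uint64_t));
        offsets[index_in_group] = rem_mem_offset;
        printf("rank %d: offset 0x%" PRIx64 "\n", index_in_group, offsets[index_in_group]);
    }
    return 0;
}
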
#if 0
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret=OMPI_SUCCESS,i,dummy;
    int index_in_group, pcnt;
    opal_list_t peers;
    ompi_namelist_t *peer;
    ompi_proc_t *proc_temp, *my_id;
    opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
    opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
    uint64_t rem_mem_offset;

    /* exchange the base pointer for the control structures - gather
     * everyone else's information.
     */
    /* get the list of procs that will participate in the communication */
    OBJ_CONSTRUCT(&peers, opal_list_t);
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        /* get the proc info */
        proc_temp = ompi_comm_peer_lookup(
            sm_bcol_module->super.sbgp_partner_module->group_comm,
            sm_bcol_module->super.sbgp_partner_module->group_list[i]);
        peer = OBJ_NEW(ompi_namelist_t);
        peer->name.jobid = proc_temp->proc_name.jobid;
        peer->name.vpid = proc_temp->proc_name.vpid;
        opal_list_append(&peers,&peer->super); /* this is with the new field called "super" in the ompi_namelist_t struct */
    }
    /* pack up the data into the allgather send buffer */
    if (NULL == send_buffer || NULL == recv_buffer) {
        opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n");
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    /* get my proc information */
    my_id = ompi_proc_local();

    /* pack my information */
    ret = opal_dss.pack(send_buffer,
                        &(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32);

    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n");
        goto exit_ERROR;
    }

    /* pack the offset of the allocated region */
    ret = opal_dss.pack(send_buffer,&(mem_offset),1,OPAL_UINT64);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* get the offsets from all procs, so we can set up the control data
     * structures.
     */
    if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
        opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret);
        goto exit_ERROR;
    }

    /* unpack the dummy */
    pcnt=1;
    ret = opal_dss.unpack(recv_buffer,&dummy, &pcnt, OPAL_INT32);
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret);
        goto exit_ERROR;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in the memory layout of each process
     */

    /* loop over the procs in the group */
    for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
        int array_id;
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&index_in_group, &pcnt, OPAL_UINT32);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret);
            goto exit_ERROR;
        }

        /* get the offset */
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&rem_mem_offset, &pcnt, OPAL_UINT64);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret);
            goto exit_ERROR;
        }

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        result_array[array_id]=(void *)rem_mem_offset;

    }

    /* clean up */
    peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    while( NULL !=peer) {
        OBJ_RELEASE(peer);
        peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);
    if( send_buffer ) {
        OBJ_RELEASE(send_buffer);
    }
    if( recv_buffer ) {
        OBJ_RELEASE(recv_buffer);
    }

    return ret;

exit_ERROR:

    /* free the peer list */
    peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    while( NULL !=peer) {
        OBJ_RELEASE(peer);
        peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);
    if( send_buffer ) {
        OBJ_RELEASE(send_buffer);
    }
    if( recv_buffer ) {
        OBJ_RELEASE(recv_buffer);
    }
    return ret;
}
#endif


static int base_bcol_basesmuma_exchange_ctl_params(
|
||||
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_bcol_basesmuma_component_t *cs,
|
||||
sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
|
||||
{
|
||||
int ret=OMPI_SUCCESS,i,loop_limit;
|
||||
int leading_dim, buf_id;
|
||||
void *mem_offset;
|
||||
unsigned char *base_ptr;
|
||||
mca_bcol_basesmuma_ctl_struct_t *ctl_ptr;
|
||||
|
||||
/* data block base offset in the mapped file */
|
||||
mem_offset = (void *)((uintptr_t)data_blk->data -
|
||||
(uintptr_t)cs->sm_ctl_structs->data_addr);
|
||||
|
||||
/* number of buffers in data block */
|
||||
loop_limit=cs->basesmuma_num_mem_banks+ctl_mgmt->number_of_buffs;
|
||||
leading_dim=ctl_mgmt->size_of_group;
|
||||
ret=comm_allgather_pml(&mem_offset, ctl_mgmt->ctl_buffs, sizeof(void *),
|
||||
MPI_BYTE, sm_bcol_module->super.sbgp_partner_module->my_index,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_size,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_list,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_comm);
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
goto exit_ERROR;
|
||||
}
|
||||
|
||||
#if 0
|
||||
ret=base_bcol_basesmuma_exchange_offsets( sm_bcol_module,
|
||||
(void **)ctl_mgmt->ctl_buffs, mem_offset, loop_limit, leading_dim);
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
goto exit_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* convert memory offset to virtual address in current rank */
|
||||
for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {
|
||||
|
||||
/* get the base pointer */
|
||||
int array_id=SM_ARRAY_INDEX(leading_dim,0,i);
|
||||
if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
|
||||
/* me */
|
||||
base_ptr=cs->sm_ctl_structs->map_addr;
|
||||
} else {
|
||||
base_ptr=sm_bcol_module->ctl_backing_files_info[i]->sm_mmap->map_addr;
|
||||
}
|
||||
ctl_mgmt->ctl_buffs[array_id]=(void *)
|
||||
(uintptr_t)(((uint64_t)(uintptr_t)ctl_mgmt->ctl_buffs[array_id])+(uint64_t)(uintptr_t)base_ptr);
|
||||
for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
|
||||
int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
|
||||
array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
|
||||
ctl_mgmt->ctl_buffs[array_id]=(void *) (uintptr_t)((uint64_t)(uintptr_t)(ctl_mgmt->ctl_buffs[array_id_m1])+
|
||||
(uint64_t)(uintptr_t)sizeof(mca_bcol_basesmuma_ctl_struct_t));
|
||||
}
|
||||
}
|
||||
/* initialize my control structures */
|
||||
for( buf_id = 0 ; buf_id < loop_limit ; buf_id++ ) {
|
||||
|
||||
int my_idx=sm_bcol_module->super.sbgp_partner_module->my_index;
|
||||
int array_id=SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
|
||||
ctl_ptr = (mca_bcol_basesmuma_ctl_struct_t *)
|
||||
ctl_mgmt->ctl_buffs[array_id];
|
||||
|
||||
/* initialize the data structures - RLG: only one data
* structure is initialized here; more are missing */
|
||||
ctl_ptr->sequence_number=-1;
|
||||
ctl_ptr->flag=-1;
|
||||
ctl_ptr->index=0;
|
||||
ctl_ptr->src_ptr = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
exit_ERROR:
|
||||
|
||||
return ret;
|
||||
}
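/* Illustrative sketch (hypothetical, for exposition only -- the macro's real
* definition lives elsewhere in this component): the pointer arithmetic above
* assumes SM_ARRAY_INDEX() linearizes a (buffer index, process index) pair
* row-major, with the group size as the leading dimension: */
#if 0
#define SM_ARRAY_INDEX(lead_dim, buf_id, proc_id) ((lead_dim) * (buf_id) + (proc_id))
/* e.g. with leading_dim == 4 (the group size), buffer 2 of rank 3 lands at
* flat index 4*2 + 3 == 11, while consecutive buf_id entries for the same
* rank are one sizeof(mca_bcol_basesmuma_ctl_struct_t) apart in the backing
* file, which is exactly what the buf_id loop above relies on. */
#endif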
|
||||
|
||||
static int base_bcol_basesmuma_setup_ctl (mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_bcol_basesmuma_component_t *cs)
|
||||
{
|
||||
const int my_index = sm_bcol_module->super.sbgp_partner_module->my_index;
|
||||
bcol_basesmuma_smcm_file_t input_file;
|
||||
int ret;
|
||||
|
||||
/* exchange remote addressing information if it has not already been done */
|
||||
if (NULL == sm_bcol_module->ctl_backing_files_info) {
|
||||
input_file.file_name=cs->sm_ctl_structs->map_path;
|
||||
input_file.size=cs->sm_ctl_structs->map_size;
|
||||
input_file.size_ctl_structure=0;
|
||||
input_file.data_seg_alignment=BASESMUMA_CACHE_LINE_SIZE;
|
||||
input_file.mpool_size=cs->sm_ctl_structs->map_size;
|
||||
ret = bcol_basesmuma_smcm_allgather_connection(sm_bcol_module,
|
||||
sm_bcol_module->super.sbgp_partner_module,
|
||||
&(cs->sm_connections_list),
|
||||
&(sm_bcol_module->ctl_backing_files_info),
|
||||
sm_bcol_module->super.sbgp_partner_module->group_comm,
|
||||
input_file, cs->clt_base_fname,
|
||||
false);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* fill in the pointers to the other ranks' scratch shared memory */
|
||||
if (NULL == sm_bcol_module->shared_memory_scratch_space) {
|
||||
sm_bcol_module->shared_memory_scratch_space =
|
||||
calloc (sm_bcol_module->super.sbgp_partner_module->group_size, sizeof (void *));
|
||||
if (!sm_bcol_module->shared_memory_scratch_space) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for shared_memory_scratch_space.");
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (int i = 0 ; i < sm_bcol_module->super.sbgp_partner_module->group_size ; ++i) {
|
||||
if (i == my_index) {
|
||||
/* local file data is not cached in this list */
|
||||
continue;
|
||||
}
|
||||
|
||||
sm_bcol_module->shared_memory_scratch_space[i] =
|
||||
(void *)((intptr_t) sm_bcol_module->ctl_backing_files_info[i]->sm_mmap +
|
||||
cs->scratch_offset_from_base_ctl_file);
|
||||
}
|
||||
|
||||
sm_bcol_module->shared_memory_scratch_space[my_index] =
|
||||
(void *)((intptr_t) cs->sm_ctl_structs->map_addr + cs->scratch_offset_from_base_ctl_file);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int base_bcol_basesmuma_setup_ctl_struct(
|
||||
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_bcol_basesmuma_component_t *cs,
|
||||
sm_buffer_mgmt *ctl_mgmt)
|
||||
{
|
||||
int n_ctl, n_levels;
|
||||
int n_ctl_structs;
|
||||
size_t malloc_size;
|
||||
|
||||
/*
|
||||
* set up my no-user-data control structures
|
||||
*/
|
||||
/* number of banks and regions per bank are already a power of 2 */
|
||||
n_ctl_structs=cs->basesmuma_num_mem_banks*
|
||||
cs->basesmuma_num_regions_per_bank;
|
||||
|
||||
/* initialize the control structure management struct -
|
||||
* for collectives without user data
|
||||
*---------------------------------------------------------------
|
||||
*/
|
||||
|
||||
ctl_mgmt->number_of_buffs=n_ctl_structs;
|
||||
ctl_mgmt->num_mem_banks=
|
||||
cs->basesmuma_num_mem_banks;
|
||||
|
||||
ctl_mgmt->num_buffs_per_mem_bank=
|
||||
cs->basesmuma_num_regions_per_bank;
|
||||
ctl_mgmt->size_of_group=
|
||||
sm_bcol_module->super.sbgp_partner_module->group_size;
|
||||
ompi_roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank,&n_levels);
|
||||
ctl_mgmt->log2_num_buffs_per_mem_bank=n_levels;
|
||||
|
||||
ompi_roundup_to_power_radix(2,n_ctl_structs,&n_levels);
|
||||
ctl_mgmt->log2_number_of_buffs=n_levels;
|
||||
ctl_mgmt->mask=n_ctl_structs-1;
|
||||
sm_bcol_module->super.n_poll_loops=cs->n_poll_loops;
|
||||
|
||||
malloc_size=
|
||||
(ctl_mgmt->number_of_buffs +
|
||||
ctl_mgmt->num_mem_banks ) *
|
||||
ctl_mgmt->size_of_group *
|
||||
sizeof(void *);
|
||||
ctl_mgmt->ctl_buffs = malloc(malloc_size);
|
||||
if (!ctl_mgmt->ctl_buffs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* set up the no-data buffer management data
|
||||
*/
|
||||
n_ctl = ctl_mgmt->num_mem_banks;
|
||||
ctl_mgmt->ctl_buffs_mgmt = (mem_bank_management_t *) calloc (n_ctl, sizeof (mem_bank_management_t));
|
||||
if (!ctl_mgmt->ctl_buffs_mgmt) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for ctl_buffs_mgmt");
|
||||
free (ctl_mgmt->ctl_buffs);
|
||||
ctl_mgmt->ctl_buffs = NULL;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* initialize each individual element */
|
||||
for (int i = 0 ; i < n_ctl ; ++i) {
|
||||
opal_list_item_t *item;
|
||||
opal_mutex_t *mutex_ptr;
|
||||
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].available_buffers=
|
||||
ctl_mgmt->num_buffs_per_mem_bank;
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].number_of_buffers=
|
||||
ctl_mgmt->num_buffs_per_mem_bank;
|
||||
mutex_ptr = &(ctl_mgmt->ctl_buffs_mgmt[i].mutex);
|
||||
OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].index_shared_mem_ctl_structs=i;
|
||||
|
||||
item = (opal_list_item_t *)&(ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc);
|
||||
OBJ_CONSTRUCT(item, opal_list_item_t);
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.sm_module =
|
||||
sm_bcol_module;
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.pool_index = i;
|
||||
/* get the sm_buffer_mgmt pointer for the control structures */
|
||||
ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.coll_buff = ctl_mgmt;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
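/* Worked example (illustrative values, not defaults taken from the code):
* with basesmuma_num_mem_banks == 2 and basesmuma_num_regions_per_bank == 4,
* both already powers of two:
*
*   n_ctl_structs               = 2 * 4 = 8
*   log2_num_buffs_per_mem_bank = 2        (2^2 == 4)
*   log2_number_of_buffs        = 3        (2^3 == 8)
*   mask                        = 8 - 1 = 7
*
* so a monotonically increasing buffer counter can be reduced to a buffer
* slot with (counter & mask), presumably avoiding a division on the
* critical path. */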
|
||||
|
||||
/*
|
||||
* this function initializes the internal scratch buffers and control
|
||||
* structures that will be used by the module. It also initializes
|
||||
* the payload buffer management structures.
|
||||
*/
|
||||
int base_bcol_basesmuma_setup_library_buffers(
|
||||
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_bcol_basesmuma_component_t *cs)
|
||||
{
|
||||
int ret=OMPI_SUCCESS,i;
|
||||
int n_ctl_structs;
|
||||
size_t ctl_segement_size,total_memory;
|
||||
int max_elements;
|
||||
unsigned char *data_ptr;
|
||||
|
||||
/* */
|
||||
/* setup the control struct memory */
|
||||
if(!cs->sm_ctl_structs) {
|
||||
ret = mca_bcol_basesmuma_allocate_sm_ctl_memory(cs);
|
||||
if(OMPI_SUCCESS != ret) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "In bcol_comm_query mca_bcol_basesmuma_allocate_sm_ctl_memory failed\n");
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* put the memory onto the free list - we have worried about
|
||||
* alignment in the mpool allocation, and assume that the
|
||||
* ctl structures have the appropriate size to maintain alignment
|
||||
*/
|
||||
|
||||
/* figure out segment size */
|
||||
n_ctl_structs=cs->basesmuma_num_mem_banks*
|
||||
cs->basesmuma_num_regions_per_bank;
|
||||
|
||||
/* add memory for the control structure used for recycling the banks */
|
||||
n_ctl_structs+=cs->basesmuma_num_mem_banks;
|
||||
|
||||
ctl_segement_size=n_ctl_structs*
|
||||
sizeof(mca_bcol_basesmuma_ctl_struct_t);
|
||||
|
||||
total_memory=cs->sm_ctl_structs->map_size - (
|
||||
(char *)(cs->sm_ctl_structs->data_addr)-
|
||||
(char *)(cs->sm_ctl_structs->map_addr));
|
||||
total_memory-=cs->my_scratch_shared_memory_size;
|
||||
max_elements=total_memory/ctl_segement_size;
|
||||
|
||||
/* populate the free list */
|
||||
data_ptr=cs->sm_ctl_structs->data_addr;
|
||||
|
||||
for( i=0 ; i < max_elements ; i++ ) {
|
||||
list_data_t *item = OBJ_NEW(list_data_t);
|
||||
if( !item ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
item->data=(void *)data_ptr;
|
||||
opal_list_append(&(cs->ctl_structures),(opal_list_item_t *)item);
|
||||
data_ptr+=ctl_segement_size;
|
||||
}
|
||||
/* set the scratch memory pointer and offset */
|
||||
cs->my_scratch_shared_memory=(char *)data_ptr;
|
||||
cs->scratch_offset_from_base_ctl_file=(size_t)
|
||||
((char *)data_ptr-(char *)cs->sm_ctl_structs->map_addr);
|
||||
|
||||
|
||||
/* At this stage the memory is mapped and ready to use by the local rank.
|
||||
* However, the memory of other processes has not yet been mmaped into the
|
||||
* memory of this process.
|
||||
*/
|
||||
}
|
||||
|
||||
/* initialize no_userdata_ctl */
|
||||
sm_bcol_module->no_userdata_ctl=(list_data_t *)
|
||||
opal_list_remove_last(&(cs->ctl_structures));
|
||||
if (!sm_bcol_module->no_userdata_ctl) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* initialize userdata_ctl */
|
||||
sm_bcol_module->userdata_ctl = (list_data_t *)
|
||||
opal_list_remove_last(&(cs->ctl_structures));
|
||||
if (!sm_bcol_module->userdata_ctl) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ret = base_bcol_basesmuma_setup_ctl (sm_bcol_module, cs);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_no_user_data));
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = base_bcol_basesmuma_setup_ctl_struct (sm_bcol_module, cs, &(sm_bcol_module->colls_with_user_data));
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* used for blocking recursive doubling barrier */
|
||||
sm_bcol_module->index_blocking_barrier_memory_bank=0;
|
||||
|
||||
/* gather the offsets of the control structs relative to the base
|
||||
* of the shared memory file, and fill in the table with the
|
||||
* addresses of all the control structures.
|
||||
*/
|
||||
ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
|
||||
&(sm_bcol_module->colls_no_user_data),sm_bcol_module->no_userdata_ctl);
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
|
||||
&(sm_bcol_module->colls_with_user_data),sm_bcol_module->userdata_ctl);
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(list_data_t,
|
||||
opal_list_item_t, NULL, NULL);
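/* Layout sketch of the control backing file as set up above (a reading of
* the code, offered for orientation only):
*
*   map_addr                       data_addr
*   |--- header / alignment pad ---|-- ctl segment 0 --| ... |-- scratch --|
*
* each ctl segment holds n_ctl_structs mca_bcol_basesmuma_ctl_struct_t
* elements (num_mem_banks * num_regions_per_bank working structures plus
* num_mem_banks recycling structures), and the per-rank scratch region
* begins scratch_offset_from_base_ctl_file bytes past map_addr. */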
|
@ -1,460 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif
|
||||
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/patterns/comm/coll_ops.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
|
||||
#include "bcol_basesmuma.h"
|
||||
|
||||
|
||||
|
||||
#define SM_BACKING_FILE_NAME_MAX_LEN 256
|
||||
|
||||
static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length,
|
||||
size_t addr_offset, size_t alignment,
|
||||
char *file_name);
|
||||
|
||||
struct file_info_t {
|
||||
uint32_t vpid;
|
||||
uint32_t jobid;
|
||||
uint64_t file_size;
|
||||
uint64_t size_ctl_structure;
|
||||
uint64_t data_seg_alignment;
|
||||
char file_name[SM_BACKING_FILE_NAME_MAX_LEN];
|
||||
};
|
||||
|
||||
/* need to allocate space for the peer */
|
||||
static void bcol_basesmuma_smcm_proc_item_t_construct (bcol_basesmuma_smcm_proc_item_t * item)
|
||||
{
|
||||
memset ((char *) item + sizeof (item->item), 0, sizeof (*item) - sizeof (item->item));
|
||||
}
|
||||
|
||||
/* need to free the space for the peer */
|
||||
static void bcol_basesmuma_smcm_proc_item_t_destruct (bcol_basesmuma_smcm_proc_item_t * item)
|
||||
{
|
||||
if (item->sm_mmap) {
|
||||
OBJ_RELEASE(item->sm_mmap);
|
||||
}
|
||||
|
||||
if (item->sm_file.file_name) {
|
||||
free (item->sm_file.file_name);
|
||||
item->sm_file.file_name = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_proc_item_t,
|
||||
opal_list_item_t,
|
||||
bcol_basesmuma_smcm_proc_item_t_construct,
|
||||
bcol_basesmuma_smcm_proc_item_t_destruct);
|
||||
|
||||
static void bcol_basesmuma_smcm_mmap_construct (bcol_basesmuma_smcm_mmap_t *smcm_mmap)
|
||||
{
|
||||
memset ((char *) smcm_mmap + sizeof (smcm_mmap->super), 0, sizeof (*smcm_mmap) - sizeof (smcm_mmap->super));
|
||||
}
|
||||
|
||||
static void bcol_basesmuma_smcm_mmap_destruct (bcol_basesmuma_smcm_mmap_t *smcm_mmap)
|
||||
{
|
||||
if (smcm_mmap->map_seg) {
|
||||
munmap ((void *)smcm_mmap->map_seg, smcm_mmap->map_size);
|
||||
smcm_mmap->map_seg = NULL;
|
||||
}
|
||||
|
||||
if (smcm_mmap->map_path) {
|
||||
free (smcm_mmap->map_path);
|
||||
smcm_mmap->map_path = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_mmap_t, opal_list_item_t,
|
||||
bcol_basesmuma_smcm_mmap_construct,
|
||||
bcol_basesmuma_smcm_mmap_destruct);
|
||||
|
||||
|
||||
/* smcm_allgather_connection:
|
||||
This function is called when a shared memory subgroup wants to establish shared memory "connections" among
|
||||
a group of processes.
|
||||
|
||||
This function DOES NOT create any shared memory backing files; it only mmaps already existing files. Shared
|
||||
memory files are created by the shared memory registration function
|
||||
-----------------------------------------------------------------------------------------------------------
|
||||
Input params:
|
||||
|
||||
- sbgp module The subgrouping module contains the list of ranks to wire up.
|
||||
|
||||
- peer_list An opal list containing a list of bcol_basesmuma_smcm_proc_item_t types. This
|
||||
contains a list of peers whose shared memory files I have already mapped.
|
||||
Upon completion of the allgather exchange with all members of the group and depending on the
|
||||
value of "map_all", my peers' shared memory files are mapped into my local virtual memory
|
||||
space, with all pertinent information being stored in a bcol_basesmuma_smcm_proc_item_t which is
|
||||
subsequently appended onto the "peer_list".
|
||||
|
||||
- comm The ompi_communicator_t communicator.
|
||||
|
||||
- input A data struct that caches the information about my shared memory file.
|
||||
|
||||
- map_all Bool that determines whether or not to go ahead and map the files from all of the peers
|
||||
defined in the sbgp-ing module. If map_all == true, then go ahead and mmap all of the files
|
||||
obtained in the exchange and append the information to the "peer_list". If map_all == false
|
||||
then make a check and only mmap those peers' files whose vpid/jobid/filename combination do
|
||||
not already exist in the "peer_list". Once mapping is completed, append this peer's information
|
||||
to the "peer_list".
|
||||
-----------------------------------------------------------------------------------------------------------
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
int bcol_basesmuma_smcm_allgather_connection(
|
||||
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_sbgp_base_module_t *module,
|
||||
opal_list_t *peer_list,
|
||||
bcol_basesmuma_smcm_proc_item_t ***back_files,
|
||||
ompi_communicator_t *comm,
|
||||
bcol_basesmuma_smcm_file_t input,
|
||||
char *base_fname,
|
||||
bool map_all)
|
||||
{
|
||||
|
||||
/* define local variables */
|
||||
|
||||
int rc, i, fd;
|
||||
ptrdiff_t mem_offset;
|
||||
ompi_proc_t *proc_temp, *my_id;
|
||||
bcol_basesmuma_smcm_proc_item_t *temp;
|
||||
bcol_basesmuma_smcm_proc_item_t *item_ptr;
|
||||
bcol_basesmuma_smcm_proc_item_t **backing_files;
|
||||
struct file_info_t local_file;
|
||||
struct file_info_t *all_files=NULL;
|
||||
|
||||
/* sanity check */
|
||||
if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long: %s len :: %d",
|
||||
input.file_name, (int) strlen(input.file_name));
|
||||
return OMPI_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
backing_files = (bcol_basesmuma_smcm_proc_item_t **)
|
||||
calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *));
|
||||
if (!backing_files) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* FIXME *back_files might have been already allocated
|
||||
* so free it in order to avoid a memory leak */
|
||||
if (NULL != *back_files) {
|
||||
free (*back_files);
|
||||
}
|
||||
*back_files = backing_files;
|
||||
|
||||
my_id = ompi_proc_local();
|
||||
|
||||
/* Phase One:
|
||||
gather a list of processes that will participate in the allgather - I'm
|
||||
preparing this list from the sbgp-ing module that was passed into the function */
|
||||
|
||||
/* fill in local file information */
|
||||
local_file.vpid = ((orte_process_name_t*)&my_id->super.proc_name)->vpid;
|
||||
local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid;
|
||||
local_file.file_size=input.size;
|
||||
local_file.size_ctl_structure=input.size_ctl_structure;
|
||||
local_file.data_seg_alignment=input.data_seg_alignment;
|
||||
|
||||
strcpy (local_file.file_name, input.file_name);
|
||||
|
||||
/* will exchange this data type as a string of characters -
|
||||
* this routine is first called before MPI_Init() completes
|
||||
* and before error handling is setup, so can't use the
|
||||
* MPI data types to send this data */
|
||||
all_files = (struct file_info_t *) calloc(module->group_size,
|
||||
sizeof (struct file_info_t));
|
||||
if (!all_files) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* exchange data */
|
||||
rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR,
|
||||
sm_bcol_module->super.sbgp_partner_module->my_index,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_size,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_list,
|
||||
sm_bcol_module->super.sbgp_partner_module->group_comm);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml. Error code: %d", rc);
|
||||
goto Error;
|
||||
}
|
||||
|
||||
/* Phase four:
|
||||
loop through the receive buffer, unpack the data received from remote peers */
|
||||
|
||||
for (i = 0; i < module->group_size; i++) {
|
||||
struct file_info_t *rem_file = all_files + i;
|
||||
|
||||
/* check if this is my index or if the file is already mapped (set above). there
|
||||
* is no reason to look through the peer list again because no two members of
|
||||
* the group will have the same vpid/jobid pair. ignore this previously found
|
||||
* mapping if map_all was requested (NTH: not sure why exactly, since we re-map
* an already mapped file) */
|
||||
if (sm_bcol_module->super.sbgp_partner_module->my_index == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);
|
||||
|
||||
OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
|
||||
/* if the vpid/jobid/filename combination already exists in the list,
|
||||
then do not map this peer's file --- because you already have */
|
||||
if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
|
||||
OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name),
|
||||
&item_ptr->peer) &&
|
||||
0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) {
|
||||
++item_ptr->refcnt;
|
||||
/* record file data */
|
||||
backing_files[i] = item_ptr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!map_all && backing_files[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);
|
||||
if (!temp) {
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Error;
|
||||
}
|
||||
|
||||
temp->peer.vpid = rem_file->vpid;
|
||||
temp->peer.jobid = rem_file->jobid;
|
||||
|
||||
temp->sm_file.file_name = strdup (rem_file->file_name);
|
||||
if (!temp->sm_file.file_name) {
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
OBJ_RELEASE(temp);
|
||||
goto Error;
|
||||
}
|
||||
|
||||
temp->sm_file.size = (size_t) rem_file->file_size;
|
||||
temp->sm_file.mpool_size = (size_t) rem_file->file_size;
|
||||
temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure;
|
||||
temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment;
|
||||
temp->refcnt = 1;
|
||||
|
||||
/* Phase Five:
|
||||
If map_all == true, then we map every peer's file
|
||||
else we check to see if I have already mapped this
|
||||
vpid/jobid/filename combination and if I have, then
|
||||
I do not mmap this peer's file.
|
||||
*
|
||||
*/
|
||||
fd = open(temp->sm_file.file_name, O_RDWR, 0600);
|
||||
if (0 > fd) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. errno = %d",
|
||||
temp->sm_file.file_name, errno);
|
||||
rc = OMPI_ERROR;
|
||||
goto Error;
|
||||
}
|
||||
|
||||
/* map the file */
|
||||
temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size,
|
||||
temp->sm_file.size_ctl_structure,
|
||||
temp->sm_file.data_seg_alignment,
|
||||
temp->sm_file.file_name);
|
||||
close (fd);
|
||||
if (NULL == temp->sm_mmap) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file");
|
||||
OBJ_RELEASE(temp);
|
||||
rc = OMPI_ERROR;
|
||||
goto Error;
|
||||
}
|
||||
|
||||
/* compute memory offset */
|
||||
mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
|
||||
(ptrdiff_t) temp->sm_mmap->map_seg;
|
||||
temp->sm_mmap->map_seg->seg_offset = mem_offset;
|
||||
temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset;
|
||||
/* more stuff to follow */
|
||||
|
||||
/* append this peer's info, including shared memory map addr, onto the
|
||||
peer_list */
|
||||
|
||||
/* record file data */
|
||||
backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp;
|
||||
|
||||
opal_list_append(peer_list, (opal_list_item_t*) temp);
|
||||
}
|
||||
|
||||
rc = OMPI_SUCCESS;
|
||||
|
||||
Error:
|
||||
|
||||
/* error clean-up and return */
|
||||
if (NULL != all_files) {
|
||||
free(all_files);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int bcol_basesmuma_smcm_release_connections (mca_bcol_basesmuma_module_t *sm_bcol_module,
|
||||
mca_sbgp_base_module_t *sbgp_module, opal_list_t *peer_list,
|
||||
bcol_basesmuma_smcm_proc_item_t ***back_files)
|
||||
{
|
||||
bcol_basesmuma_smcm_proc_item_t **smcm_procs = *back_files;
|
||||
|
||||
for (int i = 0 ; i < sbgp_module->group_size ; ++i) {
|
||||
if (smcm_procs[i] && 0 == --smcm_procs[i]->refcnt) {
|
||||
opal_list_remove_item (peer_list, (opal_list_item_t *) smcm_procs[i]);
|
||||
OBJ_RELEASE(smcm_procs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
free (smcm_procs);
|
||||
*back_files = NULL;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* mmap the specified file as a shared file. No information exchange with other
|
||||
* processes takes place within this routine.
|
||||
* This function assumes that the memory has already been allocated, and only the
|
||||
* mmap needs to be done.
|
||||
*/
|
||||
bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr,
|
||||
size_t length,
|
||||
size_t alignment,
|
||||
char* file_name)
|
||||
{
|
||||
/* local variables */
|
||||
int fd = -1;
|
||||
bcol_basesmuma_smcm_mmap_t *map = NULL;
|
||||
|
||||
|
||||
/* if pointer is not allocated - return error. We have no clue how the user will allocate or
|
||||
* free this memory.
|
||||
*/
|
||||
|
||||
/* open the shared memory backing file */
|
||||
|
||||
fd = open(file_name, O_CREAT|O_RDWR,0600);
|
||||
if (fd < 0) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation open failed with errno: %d",
|
||||
errno);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* ensure there is enough space for the backing store */
if (0 != ftruncate (fd, length)) {
opal_output (ompi_bcol_base_framework.framework_output, "basesmuma shared memory allocation ftruncate failed with errno: %d",
errno);
close (fd);
return NULL;
}

map = bcol_basesmuma_smcm_reg_mmap (in_ptr, fd, length, 0, alignment, file_name);
if (NULL == map) {
close (fd);
return NULL;
}
|
||||
/* no longer need this file descriptor. close it */
|
||||
close (fd);
|
||||
|
||||
/* takes us to the top of the control structure */
|
||||
|
||||
return map;
|
||||
|
||||
}
|
||||
|
||||
static bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr, int fd, size_t length,
|
||||
size_t addr_offset, size_t alignment,
|
||||
char *file_name)
|
||||
{
|
||||
|
||||
/* local variables */
|
||||
bcol_basesmuma_smcm_mmap_t *map;
|
||||
bcol_basesmuma_smcm_file_header_t *seg;
|
||||
unsigned char* myaddr = NULL;
|
||||
int flags = MAP_SHARED;
|
||||
|
||||
/* set up the map object */
|
||||
map = OBJ_NEW(bcol_basesmuma_smcm_mmap_t);
|
||||
if (OPAL_UNLIKELY(NULL == map)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* map the file and initialize the segment state */
|
||||
if (NULL != in_ptr) {
|
||||
flags |= MAP_FIXED;
|
||||
}
|
||||
seg = (bcol_basesmuma_smcm_file_header_t *)
|
||||
mmap(in_ptr, length, PROT_READ|PROT_WRITE, flags, fd, 0);
|
||||
if((void*)-1 == seg) {
|
||||
OBJ_RELEASE(map);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
map->map_path = strdup (file_name);
|
||||
|
||||
/* the first entry in the file is the control structure. the first entry
|
||||
in the control structure is an mca_common_sm_file_header_t element */
|
||||
map->map_seg = seg;
|
||||
|
||||
myaddr = (unsigned char *) seg + addr_offset;
|
||||
/* if we have a data segment (i.e. if 0 != data_seg_alignement) */
|
||||
|
||||
if (alignment) {
|
||||
myaddr = OPAL_ALIGN_PTR(myaddr, alignment, unsigned char*);
|
||||
|
||||
/* is addr past the end of the file? */
|
||||
if ((unsigned char *) seg+length < myaddr) {
|
||||
opal_output (ompi_bcol_base_framework.framework_output, "mca_bcol_basesmuma_sm_alloc_mmap: memory region too small len %lu add %p",
|
||||
(unsigned long) length, (void*)myaddr);
|
||||
OBJ_RELEASE(map);
|
||||
munmap ((void *)seg, length);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
map->data_addr = (unsigned char*) myaddr;
|
||||
map->map_addr = (unsigned char*) seg;
|
||||
map->map_size = length;
|
||||
|
||||
return map;
|
||||
}
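/* Alignment sketch (for exposition; BASESMUMA_CACHE_LINE_SIZE is the value
* typically passed in via data_seg_alignment):
*
*   seg (== map_addr)         myaddr (== data_addr)
*   |---- addr_offset ----|pad|---------- data ----------|
*   <----------------------- length --------------------->
*
* i.e. data_addr is the first 'alignment'-aligned byte at or after
* map_addr + addr_offset, and the bounds check above rejects mappings whose
* aligned data pointer would fall past map_addr + length. */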
|
@ -1,105 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef BCOL_BASESMUMA_SMCM_H
|
||||
#define BCOL_BASESMUMA_SMCM_H
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/sys/atomic.h"
|
||||
|
||||
|
||||
|
||||
typedef struct bcol_basesmuma_smcm_file_header_t {
|
||||
/* lock to control atomic access */
|
||||
opal_atomic_lock_t seg_lock;
|
||||
|
||||
/* is the segment ready for use */
|
||||
volatile int32_t seg_inited;
|
||||
|
||||
/* Offset to next available memory location available for allocation */
|
||||
size_t seg_offset;
|
||||
|
||||
/* total size of the segment */
|
||||
size_t seg_size;
|
||||
} bcol_basesmuma_smcm_file_header_t;
|
||||
|
||||
|
||||
typedef struct bcol_basesmuma_smcm_mmap_t {
|
||||
/* doubly linked list element */
|
||||
opal_list_item_t super;
|
||||
/* pointer to header embedded in the shared memory file */
|
||||
bcol_basesmuma_smcm_file_header_t *map_seg;
|
||||
/* base address of the mmap'ed file */
|
||||
unsigned char *map_addr;
|
||||
/* base address of data segment */
|
||||
unsigned char *data_addr;
|
||||
/* How big it is (in bytes) */
|
||||
size_t map_size;
|
||||
/* Filename */
|
||||
char *map_path;
|
||||
} bcol_basesmuma_smcm_mmap_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_mmap_t);
|
||||
|
||||
|
||||
/* Struct that characterizes a shared memory file */
|
||||
struct bcol_basesmuma_smcm_file_t {
|
||||
|
||||
char *file_name;
|
||||
size_t size;
|
||||
size_t size_ctl_structure;
|
||||
size_t data_seg_alignment;
|
||||
size_t mpool_size;
|
||||
|
||||
};
|
||||
typedef struct bcol_basesmuma_smcm_file_t bcol_basesmuma_smcm_file_t;
|
||||
|
||||
|
||||
struct bcol_basesmuma_smcm_proc_item_t {
|
||||
opal_list_item_t item; /* can put me on a free list */
|
||||
int refcnt;
|
||||
ompi_process_name_t peer;
|
||||
bcol_basesmuma_smcm_file_t sm_file;
|
||||
bcol_basesmuma_smcm_mmap_t *sm_mmap; /* Pointer to peer's sm file */
|
||||
|
||||
};
|
||||
typedef struct bcol_basesmuma_smcm_proc_item_t bcol_basesmuma_smcm_proc_item_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(bcol_basesmuma_smcm_proc_item_t);
|
||||
|
||||
|
||||
/* allocate shared memory file
|
||||
* in_ptr - pointer to preallocated memory (if NULL, this will be mmapped)
|
||||
* alignment - region memory alignment
|
||||
* file name - fully qualified backing file name
|
||||
*/
|
||||
|
||||
OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr,
|
||||
size_t length,
|
||||
size_t alignment,
|
||||
char* file_name);
|
||||
|
||||
OMPI_DECLSPEC extern bcol_basesmuma_smcm_mmap_t* bcol_basesmuma_smcm_create_mmap(int fd,
|
||||
size_t size, char *file_name,
|
||||
size_t size_ctl_structure,
|
||||
size_t data_seg_alignment);
|
||||
|
||||
#endif
|
@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "bcol_basesmuma_utils.h"
|
||||
|
||||
/*
|
||||
* Return the exponent of the largest power of K that is less than
* or equal to the group size; the power itself is returned via pow_k.
|
||||
*/
|
||||
int pow_sm_k(int k, int number, int *pow_k)
|
||||
{
|
||||
int power = 0;
|
||||
int n = 1;
|
||||
|
||||
if( 2 == k){
|
||||
while(n <= number){
|
||||
power++;
|
||||
n <<= 1;
|
||||
}
|
||||
*pow_k = n >> 1;
|
||||
|
||||
} else {
|
||||
while (n <= number) {
|
||||
n *= k;
|
||||
power++;
|
||||
}
|
||||
*pow_k = n/k;
|
||||
}
|
||||
|
||||
|
||||
return (power-1);
|
||||
}
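/* Usage sketch (hypothetical driver, shown only to pin down the semantics):
*
*   int pow_k;
*   int exp = pow_sm_k(2, 10, &pow_k);   // exp == 3, pow_k == 8
*   exp = pow_sm_k(3, 27, &pow_k);       // exp == 3, pow_k == 27
*
* i.e. the routine returns the exponent of the largest power of k that does
* not exceed "number", and writes that power through pow_k. */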
|
||||
|
||||
|
||||
|
||||
int get_k_nomial_src_list(int group_size,
|
||||
int radix, int my_index,
|
||||
int *src_list) {
|
||||
|
||||
/* local variables */
|
||||
int radix_power;
|
||||
int offset;
|
||||
int kount = 0;
|
||||
int src_temp;
|
||||
|
||||
radix_power = 1;
|
||||
offset = 1;
|
||||
while(offset < group_size) {
|
||||
if( offset % (radix * radix_power) ) {
|
||||
src_temp = my_index - offset;
|
||||
/* wrap around */
|
||||
if ( src_temp < 0 ) {
|
||||
src_temp += group_size;
|
||||
}
|
||||
/* don't probe ghost nodes */
|
||||
if( src_temp < group_size ) {
|
||||
src_list[kount] = src_temp;
|
||||
kount++;
|
||||
}
|
||||
offset+=radix_power;
|
||||
} else {
|
||||
|
||||
radix_power *= radix;
|
||||
}
|
||||
|
||||
}
|
||||
/* return the actual number of nodes to poll on */
|
||||
return kount;
|
||||
}
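/* Worked example (illustrative): for group_size == 8, radix == 2 and
* my_index == 5 (binary 101), the loop visits offsets 1, 2 and 4; the call
* fills src_list with {4, 3, 1} and returns 3 -- exactly the peers that
* would reach rank 5 in a binomial-tree fan-out. */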
|
||||
|
||||
int get_k_nomial_dst_size(int group_size, int radix, int my_index)
|
||||
{
|
||||
int dst_count = 0;
|
||||
int radix_mask;
|
||||
int k;
|
||||
radix_mask = 1;
|
||||
while (radix_mask < group_size) {
|
||||
if (0 != my_index % (radix * radix_mask)) {
|
||||
/* I found my level in the tree */
|
||||
break;
|
||||
}
|
||||
radix_mask *= radix;
|
||||
}
|
||||
radix_mask /= radix;
|
||||
|
||||
while(radix_mask > 0) {
|
||||
/* For each level of tree, do sends */
|
||||
for (k = 1;
|
||||
k < radix && my_index + radix_mask * k < group_size;
|
||||
++k) {
|
||||
dst_count += 1 ;
|
||||
}
|
||||
radix_mask /= radix;
|
||||
}
|
||||
|
||||
return dst_count;
|
||||
}
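/* Worked example (illustrative): for group_size == 8, radix == 2 and
* my_index == 0 (the root), radix_mask climbs to 4 and the send loop counts
* one destination each for radix_mask == 4, 2 and 1 (ranks 4, 2 and 1), so
* the function returns 3 -- the fan-out of the root in a binomial tree. */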
|
@ -1,64 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BCOL_BASESMUMA_UTILS_H
|
||||
#define MCA_BCOL_BASESMUMA_UTILS_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#define BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask,radix,relative_index, \
|
||||
my_group_index, group_size, ready_flag) \
|
||||
do { \
|
||||
int k, child; \
|
||||
while(radix_mask > 0){ \
|
||||
for(k = 1; k < radix && relative_index+radix_mask*k<group_size; \
|
||||
k++) {\
|
||||
child = my_group_index+radix_mask*k; \
|
||||
if(child >= group_size) { \
|
||||
child -= group_size; \
|
||||
} \
|
||||
/*fprintf(stderr,"I am %d sending to child %d\n",my_group_index,child);*/ \
|
||||
child_ctl_pointer = data_buffs[child].ctl_struct; \
|
||||
child_ctl_pointer->src = my_group_index; \
|
||||
/* this can be improved to make better asynchronous progress, but it's
|
||||
* fine for now.
|
||||
*/ \
|
||||
while(child_ctl_pointer->sequence_number != sequence_number ); \
|
||||
child_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; \
|
||||
} \
|
||||
radix_mask = radix_mask/radix; \
|
||||
} \
|
||||
} while( 0 )
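/* Note on scoping (a reading of the macro body above): besides its named
* parameters, the macro expects data_buffs, child_ctl_pointer,
* sequence_number and bcol_id to be visible in the caller's scope, and it
* busy-waits on each child's sequence_number before publishing ready_flag,
* so callers must guarantee every child eventually advances to the current
* sequence number. */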
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Return the exponent of the largest power of K that is less than or equal to "number".
|
||||
*/
|
||||
int pow_sm_k(int radix_k, int group_size, int *pow_k_group_size);
|
||||
|
||||
/*
|
||||
* Get list of possible sources from which data may arrive based on a K-nomial tree fan-out.
|
||||
*/
|
||||
|
||||
int get_k_nomial_src_list(int group_size, int radix,
|
||||
int my_index, int *src_list);
|
||||
|
||||
|
||||
int get_k_nomial_dst_size(int group_size, int radix, int my_index);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,7 +0,0 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: ORNL
|
||||
status: unmaintained
|
@ -1,805 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BCOL_H
|
||||
#define MCA_BCOL_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/mca/mca.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/sbgp/sbgp.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "ompi/include/ompi/constants.h"
|
||||
#include "ompi/patterns/net/netpatterns_knomial_tree.h"
|
||||
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Forward declaration - please do not remove it */
|
||||
struct ml_buffers_t;
|
||||
|
||||
struct mca_bcol_base_coll_fn_comm_attributes_t;
|
||||
struct mca_bcol_base_coll_fn_invoke_attributes_t;
|
||||
struct mca_bcol_base_coll_fn_desc_t;
|
||||
|
||||
#define NUM_MSG_RANGES 5
|
||||
#define MSG_RANGE_INITIAL (1024)*12
|
||||
#define MSG_RANGE_INC 10
|
||||
#define BCOL_THRESHOLD_UNLIMITED (INT_MAX)
|
||||
/* Maximum size of a bcol's header. This allows us to correctly calculate the message
|
||||
* thresholds. If the header of any bcol exceeds this value then increase this one
|
||||
* to match. */
|
||||
#define BCOL_HEADER_MAX 96
|
||||
|
||||
#define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */
|
||||
|
||||
/*
|
||||
* Functions supported
|
||||
*/
|
||||
enum bcol_coll {
|
||||
/* blocking functions */
|
||||
BCOL_ALLGATHER,
|
||||
BCOL_ALLGATHERV,
|
||||
BCOL_ALLREDUCE,
|
||||
BCOL_ALLTOALL,
|
||||
BCOL_ALLTOALLV,
|
||||
BCOL_ALLTOALLW,
|
||||
BCOL_BARRIER,
|
||||
BCOL_BCAST,
|
||||
BCOL_EXSCAN,
|
||||
BCOL_GATHER,
|
||||
BCOL_GATHERV,
|
||||
BCOL_REDUCE,
|
||||
BCOL_REDUCE_SCATTER,
|
||||
BCOL_SCAN,
|
||||
BCOL_SCATTER,
|
||||
BCOL_SCATTERV,
|
||||
BCOL_FANIN,
|
||||
BCOL_FANOUT,
|
||||
|
||||
/* nonblocking functions */
|
||||
BCOL_IALLGATHER,
|
||||
BCOL_IALLGATHERV,
|
||||
BCOL_IALLREDUCE,
|
||||
BCOL_IALLTOALL,
|
||||
BCOL_IALLTOALLV,
|
||||
BCOL_IALLTOALLW,
|
||||
BCOL_IBARRIER,
|
||||
BCOL_IBCAST,
|
||||
BCOL_IEXSCAN,
|
||||
BCOL_IGATHER,
|
||||
BCOL_IGATHERV,
|
||||
BCOL_IREDUCE,
|
||||
BCOL_IREDUCE_SCATTER,
|
||||
BCOL_ISCAN,
|
||||
BCOL_ISCATTER,
|
||||
BCOL_ISCATTERV,
|
||||
BCOL_IFANIN,
|
||||
BCOL_IFANOUT,
|
||||
|
||||
BCOL_SYNC,
|
||||
/* New function - needed for intermediate steps */
|
||||
BCOL_REDUCE_TO_LEADER,
|
||||
BCOL_NUM_OF_FUNCTIONS
|
||||
};
|
||||
typedef enum bcol_coll bcol_coll;
|
||||
|
||||
typedef enum bcol_elem_type {
|
||||
BCOL_SINGLE_ELEM_TYPE,
|
||||
BCOL_MULTI_ELEM_TYPE,
|
||||
BCOL_NUM_OF_ELEM_TYPES
|
||||
} bcol_elem_type;
|
||||
|
||||
typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name);
|
||||
typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num);
|
||||
|
||||
/*
|
||||
* Collective function status
|
||||
*/
|
||||
enum {
|
||||
BCOL_FN_NOT_STARTED = (OMPI_ERR_MAX - 1),
|
||||
BCOL_FN_STARTED = (OMPI_ERR_MAX - 2),
|
||||
BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3)
|
||||
};
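/* Illustrative start/progress loop (hypothetical caller, for exposition
* only; fn_desc and fn_args stand in for whatever state the upper layer
* keeps -- they are not names from this header): */
#if 0
int rc = fn_desc->coll_fn(&fn_args, &const_args);
while (BCOL_FN_COMPLETE != rc) {
    if (BCOL_FN_NOT_STARTED == rc) {
        rc = fn_desc->coll_fn(&fn_args, &const_args);      /* could not start yet; retry */
    } else {                                               /* BCOL_FN_STARTED */
        rc = fn_desc->progress_fn(&fn_args, &const_args);  /* poll for completion */
    }
}
#endif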
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Collective component initialization
|
||||
*
|
||||
* Initialize the given collective component. This function should
|
||||
* initialize any component-level data. It will be called exactly
|
||||
* once during MPI_INIT.
|
||||
*
|
||||
* @note The component framework is not lazily opened, so attempts
|
||||
* should be made to minimize the amount of memory allocated during
|
||||
* this function.
|
||||
*
|
||||
* @param[in] enable_progress_threads True if the component needs to
|
||||
* support progress threads
|
||||
* @param[in] enable_mpi_threads True if the component needs to
|
||||
* support MPI_THREAD_MULTIPLE
|
||||
*
|
||||
* @retval OMPI_SUCCESS Component successfully initialized
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*mca_bcol_base_component_init_query_fn_t)
|
||||
(bool enable_progress_threads, bool enable_mpi_threads);
|
||||
|
||||
/**
|
||||
* Query whether a component is available for the given sub-group
|
||||
*
|
||||
* Query whether the component is available for the given
|
||||
* sub-group. If the component is available, an array of pointers should be
|
||||
* allocated and returned (with refcount at 1). The module will not
|
||||
* be used for collective operations until module_enable() is called
|
||||
* on the module, but may be destroyed (via OBJ_RELEASE) either before
|
||||
* or after module_enable() is called. If the module needs to release
|
||||
* resources obtained during query(), it should do so in the module
|
||||
* destructor.
|
||||
*
|
||||
* A component may provide NULL to this function to indicate it does
|
||||
* not wish to run or return an error during module_enable().
|
||||
*
|
||||
* @note The communicator is available for point-to-point
|
||||
* communication, but other functionality is not available during this
|
||||
* phase of initialization.
|
||||
*
|
||||
* @param[in] sbgp Pointer to sub-group module.
|
||||
* @param[out] priority Priority setting for component on
|
||||
* this communicator
|
||||
* @param[out] num_modules Number of modules that were generated
|
||||
* for the sub-group module.
|
||||
*
|
||||
* @returns An array of pointers to initialized module structures if the component can
* provide modules with the requested functionality, or NULL if the
|
||||
* component should not be used on the given communicator.
|
||||
*/
|
||||
typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t)
|
||||
(mca_sbgp_base_module_t *sbgp, int *num_modules);
|
||||
|
||||
|
||||
typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module,
|
||||
mca_sbgp_base_module_t *sbgp_module);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type bcol v2.0.0
|
||||
*/
|
||||
#define MCA_BCOL_BASE_VERSION_2_0_0 \
|
||||
OMPI_MCA_BASE_VERSION_2_1_0("bcol", 2, 0, 0)
|
||||
|
||||
|
||||
/* This is really an abstraction violation, but is the easiest way to get
|
||||
* started. For memory management we need to know what bcol components
|
||||
* have compatible memory management schemes. Such compatibility can
|
||||
* be used to eliminate memory copies between levels in the collective
|
||||
* operation hierarchy, by having the output buffer of one level be the
|
||||
* input buffer to the next level
|
||||
*/
|
||||
|
||||
enum {
|
||||
BCOL_SHARED_MEMORY_UMA=0,
|
||||
BCOL_SHARED_MEMORY_SOCKET,
|
||||
BCOL_POINT_TO_POINT,
|
||||
BCOL_IB_OFFLOAD,
|
||||
BCOL_SIZE
|
||||
};
|
||||
|
||||
OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE];
|
||||
OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE];
|
||||
|
||||
/* what are the input parameters ? too many void * pointers here */
|
||||
typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base,
|
||||
size_t size, void **reg_desc);
|
||||
/* deregistration function */
|
||||
typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc);
|
||||
|
||||
/* Bcol network context definition */
|
||||
struct bcol_base_network_context_t {
|
||||
opal_object_t super;
|
||||
/* Context id - defined by upper layer, ML */
|
||||
int context_id;
|
||||
/* Any context information that the bcol wants to use */
|
||||
void *context_data;
|
||||
|
||||
/* registration function */
|
||||
bcol_register_mem_fn_t register_memory_fn;
|
||||
/* deregistration function */
|
||||
bcol_deregister_mem_fn_t deregister_memory_fn;
|
||||
};
|
||||
typedef struct bcol_base_network_context_t bcol_base_network_context_t;
|
||||
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t);
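/* Usage sketch (hypothetical, for exposition; 'base' and 'size' describe a
* payload block owned by the upper layer and are not names from this
* header): */
#if 0
void *reg_desc = NULL;
bcol_base_network_context_t *nc = bcol->network_context;
if (OMPI_SUCCESS == nc->register_memory_fn(nc->context_data, base, size, &reg_desc)) {
    /* ... the block may now be used for collectives over this context ... */
    nc->deregister_memory_fn(nc->context_data, reg_desc);
}
#endif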
|
||||
|
||||
/*
|
||||
*primitive function types
|
||||
*/
|
||||
|
||||
/* bcast */
|
||||
enum {
|
||||
/* small data function */
|
||||
BCOL_BCAST_SMALL_DATA,
|
||||
|
||||
/* small data - dynamic decision making supported */
|
||||
BCOL_BCAST_SMALL_DATA_DYNAMIC,
|
||||
|
||||
/* number of functions */
|
||||
BCOL_NUM_BCAST_FUNCTIONS
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* BCOL instance.
|
||||
*/
|
||||
|
||||
/* no limit on fragment size - this supports using user buffers rather
|
||||
* than library buffers
|
||||
*/
|
||||
#define FRAG_SIZE_NO_LIMIT -1
|
||||
|
||||
/* forward declaration */
|
||||
struct coll_bcol_collective_description_t;
|
||||
|
||||
struct mca_bcol_base_component_2_0_0_t {
|
||||
|
||||
/** Base component description */
|
||||
mca_base_component_t bcol_version;
|
||||
|
||||
/** Component initialization function */
|
||||
mca_bcol_base_component_init_query_fn_t collm_init_query;
|
||||
|
||||
/** Query whether component is usable for given communicator */
|
||||
mca_bcol_base_component_comm_query_fn_t collm_comm_query;
|
||||
|
||||
/** Whether the bcol supports a given op/datatype/element-type combination */
|
||||
mca_bcol_base_module_coll_support_fn_t coll_support;
|
||||
|
||||
/** If bcol supports all possible data types for given collective operation */
|
||||
mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types;
|
||||
|
||||
/** Use this flag to prevent multiple init_query calls
in case we have the same bcol on more than a single level */
|
||||
bool init_done;
|
||||
|
||||
/** If collective calls with bcols of this type need to be ordered */
|
||||
bool need_ordering;
|
||||
|
||||
/** MCA parameter: Priority of this component */
|
||||
int priority;
|
||||
|
||||
/** Bcast function pointers */
|
||||
struct coll_bcol_collective_description_t *
|
||||
bcast_functions[BCOL_NUM_BCAST_FUNCTIONS];
|
||||
|
||||
/** Number of network contexts - need this for resource management */
|
||||
int n_net_contexts;
|
||||
|
||||
/** List of network contexts */
|
||||
bcol_base_network_context_t **network_contexts;
|
||||
|
||||
/*
|
||||
* Fragmentation support
|
||||
*/
|
||||
|
||||
/** Minimum fragment size */
|
||||
int min_frag_size;
|
||||
|
||||
/** Maximum fragment size */
|
||||
int max_frag_size;
|
||||
|
||||
/** Supports direct use of user-buffers */
|
||||
bool can_use_user_buffers;
|
||||
};
|
||||
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t;
|
||||
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t;
|
||||
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t);
|
||||
|
||||
/* forward declaration */
|
||||
struct mca_coll_ml_descriptor_t;
|
||||
struct mca_bcol_base_payload_buffer_desc_t;
|
||||
struct mca_bcol_base_route_info_t;
|
||||
|
||||
typedef struct {
|
||||
int order_num; /* Seq num of collective fragment */
|
||||
int bcols_started; /* How many bcols that need ordering have been started */
int n_fns_need_ordering; /* The number of functions called for bcols that need ordering */
|
||||
} mca_bcol_base_order_info_t;
|
||||
|
||||
/* structure that encapsulates information propagated among multiple
* fragments; completing the entire ensemble of fragments is
* necessary in order to complete the entire collective
|
||||
*/
|
||||
struct bcol_fragment_descriptor_t {
|
||||
/* start iterator */
|
||||
int head;
|
||||
/* end iterator */
|
||||
int tail;
|
||||
/* current iteration */
|
||||
int start_iter;
|
||||
/* number of full iterations this frag */
|
||||
int num_iter;
|
||||
/* end iter */
|
||||
int end_iter;
|
||||
};
|
||||
typedef struct bcol_fragment_descriptor_t bcol_fragment_descriptor_t;
|
||||
|
||||
struct bcol_function_args_t {
|
||||
/* full message sequence number */
|
||||
int64_t sequence_num;
|
||||
/* full message descriptor - single copy of fragment invariant
|
||||
* parameters */
|
||||
/* Pasha: We don't need this one for the new flow - remove it */
|
||||
struct mca_coll_ml_descriptor_t *full_message_descriptor;
|
||||
struct mca_bcol_base_route_info_t *root_route;
|
||||
/* function status */
|
||||
int function_status;
|
||||
/* root, for rooted operations */
|
||||
int root;
|
||||
/* input buffer */
|
||||
const void *sbuf;
|
||||
void *rbuf;
|
||||
const void *userbuf;
|
||||
struct mca_bcol_base_payload_buffer_desc_t *src_desc;
|
||||
struct mca_bcol_base_payload_buffer_desc_t *dst_desc;
|
||||
/* ml buffer size */
|
||||
uint32_t buffer_size;
|
||||
/* index of buffer in ml payload cache */
|
||||
int buffer_index;
|
||||
int count;
|
||||
struct ompi_datatype_t *dtype;
|
||||
struct ompi_op_t *op;
|
||||
int sbuf_offset;
|
||||
int rbuf_offset;
|
||||
/* for bcol opaque data */
|
||||
void *bcol_opaque_data;
|
||||
/* An output argument that will be used by BCOL function to tell ML that the result of the BCOL is in rbuf */
|
||||
bool result_in_rbuf;
|
||||
bool root_flag; /* True if the rank is root of operation */
|
||||
bool need_dt_support; /* will trigger alternate code path for some colls */
|
||||
int status; /* Used for non-blocking collective completion */
|
||||
uint32_t frag_size; /* fragment size for large messages */
|
||||
int hier_factor; /* factor used when bcast is invoked as a service function back down
|
||||
* the tree in allgather for example, the pacl_len is not the actual
|
||||
* len of the data needing bcasting
|
||||
*/
|
||||
mca_bcol_base_order_info_t order_info;
|
||||
bcol_fragment_descriptor_t frag_info;
|
||||
|
||||
};
|
||||
|
||||
struct mca_bcol_base_route_info_t {
|
||||
int level;
|
||||
int rank;
|
||||
};
|
||||
typedef struct mca_bcol_base_route_info_t mca_bcol_base_route_info_t;
|
||||
|
||||
struct mca_bcol_base_lmngr_block_t {
|
||||
opal_list_item_t super;
|
||||
struct mca_coll_ml_lmngr_t *lmngr;
|
||||
void* base_addr;
|
||||
};
|
||||
typedef struct mca_bcol_base_lmngr_block_t mca_bcol_base_lmngr_block_t;
|
||||
OBJ_CLASS_DECLARATION(mca_bcol_base_lmngr_block_t);
|
||||
|
||||
struct mca_bcol_base_memory_block_desc_t {
|
||||
|
||||
/* memory block for payload buffers */
|
||||
struct mca_bcol_base_lmngr_block_t *block;
|
||||
|
||||
/* Address offset in bytes -- Indicates free memory in the block */
|
||||
uint64_t block_addr_offset;
|
||||
|
||||
/* size of the memory block */
|
||||
size_t size_block;
|
||||
|
||||
/* number of memory banks */
|
||||
uint32_t num_banks;
|
||||
|
||||
/* number of buffers per bank */
|
||||
uint32_t num_buffers_per_bank;
|
||||
|
||||
/* size of a payload buffer */
|
||||
uint32_t size_buffer;
|
||||
|
||||
/* pointer to buffer descriptors initialized */
|
||||
struct mca_bcol_base_payload_buffer_desc_t *buffer_descs;
|
||||
|
||||
/* index of the next free buffer in the block */
|
||||
uint64_t next_free_buffer;
|
||||
|
||||
uint32_t *bank_release_counters;
|
||||
|
||||
/* Counter that defines which bank should be synchronized next.
* Since collectives could complete out of order, we have to make
* sure that memory synchronization collectives are started in order! */
|
||||
int memsync_counter;
|
||||
|
||||
/* This array of flags is used to signal that the bank is ready for recycling */
|
||||
bool *ready_for_memsync;
|
||||
|
||||
/* This flag monitors whether the bank is open for usage. Usually we expect that the user
|
||||
* will do the check only on buffer-zero allocation */
|
||||
bool *bank_is_busy;
|
||||
|
||||
};
|
||||
|
||||
/* convenience typedef */
|
||||
typedef struct mca_bcol_base_memory_block_desc_t mca_bcol_base_memory_block_desc_t;
|
||||
|
||||
typedef void (*mca_bcol_base_release_buff_fn_t)(struct mca_bcol_base_memory_block_desc_t *ml_memblock, uint32_t buff_id);
|
||||
|
||||
struct mca_bcol_base_payload_buffer_desc_t {
|
||||
void *base_data_addr; /* buffer address */
|
||||
void *data_addr; /* buffer address + header offset */
|
||||
uint64_t generation_number; /* my generation */
|
||||
uint64_t bank_index; /* my bank */
|
||||
uint64_t buffer_index; /* my buff index */
|
||||
};
|
||||
/* convenience typedef */
|
||||
typedef struct mca_bcol_base_payload_buffer_desc_t mca_bcol_base_payload_buffer_desc_t;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
typedef struct bcol_function_args_t bcol_function_args_t;
|
||||
|
||||
|
||||
/* The collective operation is defined by a series of collective operations
|
||||
* invoked through a function pointer. Each function may be different,
|
||||
* so we store the arguments in a struct and pass a pointer to the struct,
|
||||
* and use this as a way to hide the different function signatures.
|
||||
*
|
||||
* @param[in] input_args Structure with function arguments
|
||||
* @param[in] bcol_desc Component specific parameters
|
||||
* @param[out] status return status of the function
|
||||
* MCA_BCOL_COMPLETE - function completed
|
||||
* MCA_BCOL_IN_PROGRESS - function incomplete
|
||||
*
|
||||
* @retval OMPI_SUCCESS successful completion
|
||||
* @retval OMPI_ERROR function returned error
|
||||
*/
|
||||
/* forward declaration */
|
||||
struct mca_bcol_base_module_t;
|
||||
|
||||
/* collective function prototype - all functions have the same interface
|
||||
* so that we can call them via a function pointer */
|
||||
struct mca_bcol_base_function_t;
|
||||
typedef int (*mca_bcol_base_module_collective_fn_primitives_t)
|
||||
(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args);
|
||||
|
||||
typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t)
|
||||
(struct mca_bcol_base_module_t *bcol_module);
|
||||
|
||||
/**
|
||||
* function to query for collective function attributes
|
||||
*
|
||||
* @param attribute (IN) the attribute of interest
|
||||
* @param algorithm_parameters (OUT) the value of attribute for this
|
||||
* function. If this attribute is not supported,
|
||||
* OMPI_ERR_NOT_FOUND is returned.
|
||||
*/
|
||||
typedef int (*mca_bcol_get_collective_attributes)(int attribute,
|
||||
void *algorithm_parameters);
|
||||
|
||||
/* data structure for tracking the relevant data needed for ml level
|
||||
* algorithm construction (e.g., function selection), initialization, and
|
||||
* usage.
|
||||
*/
|
||||
struct coll_bcol_collective_description_t {
|
||||
/* collective initiation function - the first function called */
|
||||
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
|
||||
|
||||
/* collective progress function - called to advance a started collective */
|
||||
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
|
||||
|
||||
/* attribute query function */
|
||||
mca_bcol_get_collective_attributes get_attributes;
|
||||
|
||||
/* attributes supported - bit map */
|
||||
uint64_t attribute;
|
||||
|
||||
};
|
||||
typedef struct coll_bcol_collective_description_t
|
||||
coll_bcol_collective_description_t;
|
||||
|
||||
/* collective operation attributes */
|
||||
enum {
|
||||
/* supports dynamic decisions - e.g., do not need to have the collective
|
||||
* operation fully defined before it can be started
|
||||
*/
|
||||
BCOL_ATTRIBUTE_DYNAMIC,
|
||||
|
||||
/* number of attributes */
|
||||
BCOL_NUM_ATTRIBUTES
|
||||
};
|
||||
|
||||
/* For rooted collectives,
|
||||
* does the algorithm knows its data source ?
|
||||
*/
|
||||
enum {
|
||||
DATA_SRC_KNOWN=0,
|
||||
DATA_SRC_UNKNOWN,
|
||||
DATA_SRC_TYPES
|
||||
};
|
||||
|
||||
enum {
|
||||
BLOCKING,
|
||||
NON_BLOCKING
|
||||
};
/* gvm For selection logic */
struct mca_bcol_base_coll_fn_comm_attributes_t {
    int bcoll_type;
    int comm_size_min;
    int comm_size_max;
    int data_src;
    int waiting_semantics;
};

typedef struct mca_bcol_base_coll_fn_comm_attributes_t
    mca_bcol_base_coll_fn_comm_attributes_t;

struct mca_bcol_base_coll_fn_invoke_attributes_t {
    int bcol_msg_min;
    int bcol_msg_max;
    uint64_t datatype_bitmap; /* Max is OMPI_DATATYPE_MAX_PREDEFINED, defined to be 45 */
    uint32_t op_types_bitmap; /* bit map of optypes supported */
};

typedef struct mca_bcol_base_coll_fn_invoke_attributes_t
    mca_bcol_base_coll_fn_invoke_attributes_t;

struct mca_bcol_base_coll_fn_desc_t {
    opal_list_item_t super;
    struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr;
    struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr;
    mca_bcol_base_module_collective_fn_primitives_t coll_fn;
    mca_bcol_base_module_collective_fn_primitives_t progress_fn;
};

typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t;
OBJ_CLASS_DECLARATION(mca_bcol_base_coll_fn_desc_t);

/* end selection logic */
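To make the selection machinery concrete, a sketch of how a function descriptor might be filled in; the attribute values and the `my_allreduce`/`my_allreduce_progress` names are illustrative assumptions, not the removed implementation, and BCOL_ALLREDUCE is assumed to be the collective index from bcol.h:

/* Illustrative sketch (hypothetical names): describe an allreduce that
 * supports 2..64 ranks, blocking semantics, and a known data source. */
mca_bcol_base_coll_fn_desc_t *desc = OBJ_NEW(mca_bcol_base_coll_fn_desc_t);
mca_bcol_base_coll_fn_comm_attributes_t *comm_attr =
    malloc(sizeof(*comm_attr));

comm_attr->bcoll_type        = BCOL_ALLREDUCE;  /* assumed enum value */
comm_attr->comm_size_min     = 2;
comm_attr->comm_size_max     = 64;
comm_attr->data_src          = DATA_SRC_KNOWN;
comm_attr->waiting_semantics = BLOCKING;

desc->comm_attr   = comm_attr;
desc->coll_fn     = my_allreduce;           /* hypothetical primitives */
desc->progress_fn = my_allreduce_progress;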
typedef int (*mca_bcol_base_module_collective_init_fn_t)
    (struct mca_bcol_base_module_t *bcol_module,
     mca_sbgp_base_module_t *sbgp_module);
/* per communicator memory initialization function */
typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers,
                                        mca_bcol_base_component_t *module);

/* Initialize memory block - ml_memory_block initialization interface function
 *
 * Invoked at the ml level, used to pass bcol specific registration information
 * for the "ml_memory_block"
 *
 * @param[in] ml_memory_block Pointer to the ml_memory_block. This struct
 * contains bcol specific registration information and a callback function
 * used for resource recycling.
 *
 * @param[in] reg_data bcol specific registration data.
 *
 * @returns On Success: OMPI_SUCCESS
 *          On Failure: OMPI_ERROR
 */
/*typedef int (*mca_bcol_base_init_memory_fn_t)
    (struct mca_bcol_base_memory_block_desc_t *ml_block, void *reg_data);*/

typedef int (*mca_bcol_base_init_memory_fn_t)
    (struct mca_bcol_base_memory_block_desc_t *payload_block,
     uint32_t data_offset,
     struct mca_bcol_base_module_t *bcol,
     void *reg_data);
typedef int (*mca_common_allgather_init_fn_t)
    (struct mca_bcol_base_module_t *bcol_module);

typedef void (*mca_bcol_base_set_thresholds_fn_t)
    (struct mca_bcol_base_module_t *bcol_module);

enum {
    MCA_BCOL_BASE_ZERO_COPY                  = 1,
    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG = 1 << 1,
    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER   = 1 << 2
};
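These are bit flags; a module advertises a combination of them in its `supported_mode` field (declared in the module struct below), and callers test them with a mask. A minimal, hypothetical check:

/* Hypothetical check: does this module support zero-copy transfers? */
if (bcol_module->supported_mode & MCA_BCOL_BASE_ZERO_COPY) {
    /* ... choose a zero-copy data path ... */
}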
/* base module */
struct mca_bcol_base_module_t {
    /* base coll component */
    opal_object_t super;

    /* bcol component (Pasha: Do we really need to cache the component?) */
    mca_bcol_base_component_t *bcol_component;

    /* network context that is used by this bcol;
       only one context per bcol is allowed */
    bcol_base_network_context_t *network_context;

    /* We are going to use the context index a lot;
       in order to decrease the number of dereferences
       (bcol->network_context->index)
       we cache the value on the bcol */
    int context_index;

    /* Set of flags that describe features supported by the bcol */
    uint64_t supported_mode;

    /* per communicator memory initialization function */
    mca_bcol_module_mem_init init_module;

    /* sub-grouping module partner */
    mca_sbgp_base_module_t *sbgp_partner_module;

    /* size of subgroup - cache this, so we still have access when
     * sbgp_partner_module no longer exists */
    int size_of_subgroup;

    /* sequence number offset - want to make sure that we start
     * id'ing collectives with id 0, so we can have simple
     * resource management.
     */
    int64_t squence_number_offset;

    /* number of times to poll for operation completion before
     * breaking out of a non-blocking collective operation
     */
    int n_poll_loops;

    /* size of header that will go in the data buffer; should not include
     * any info regarding alignment, let the ml level handle this
     */
    uint32_t header_size;

    /* Each bcol is assigned a unique value;
     * see if we can get away with a 16-bit id
     */
    int16_t bcol_id;

    /* FIXME:
     * Since mca_bcol_base_module_t is the only parameter which will be passed
     * into bcol_basesmuma_bcast_init(), add the flag to indicate whether
     * the hdl-based algorithms will be enabled.
     */
    bool use_hdl;
    /*
     * Collective function pointers
     */
    /* changing function signature - will replace bcol_functions */
    mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS];

    /* Tables hold pointers to functions */
    mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS];
    opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS];
    struct mca_bcol_base_coll_fn_desc_t*
        filtered_fns_table[DATA_SRC_TYPES][2][BCOL_NUM_OF_FUNCTIONS][NUM_MSG_RANGES+1][OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED];

    /*
     * Bcol interface function to pass bcol specific
     * info and the memory recycling callback
     */
    mca_bcol_base_init_memory_fn_t bcol_memory_init;

    /*
     * netpatterns interface function; would like to invoke this
     * on the ml level
     */
    mca_common_allgather_init_fn_t k_nomial_tree;

    /* Each bcol caches a list which describes how many ranks
     * are "below" each rank in this bcol
     */
    int *list_n_connected;

    /* offsets for scatter/gather */
    int hier_scather_offset;

    /* Small message threshold for each collective */
    int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];

    /* Set the small_message_thresholds array */
    mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds;

    /* Pointer to the order counter on the upper layer,
       used if the bcol needs to be ordered */
    int *next_inorder;
};
typedef struct mca_bcol_base_module_t mca_bcol_base_module_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_module_t);
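A sketch of how the per-module function table can be used to dispatch a collective; the `BCOL_BARRIER` index, the helper name, and the error code are assumptions for illustration, not the removed code:

/* Hypothetical dispatch through the per-module function table. */
static int invoke_barrier(mca_bcol_base_module_t *module,
                          bcol_function_args_t *args,
                          struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_base_module_collective_fn_primitives_t fn =
        module->bcol_function_table[BCOL_BARRIER];  /* assumed index */
    if (NULL == fn) {
        return OMPI_ERR_NOT_SUPPORTED;  /* assumed error code */
    }
    return fn(args, const_args);
}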
/* function description */
struct mca_bcol_base_function_t {
    int fn_idx;
    /* module */
    struct mca_bcol_base_module_t *bcol_module;

    /*
     * The following two parameters are used for bcol modules
     * that want to do some optimizations based on the fact that
     * n functions from the same bcol module are called in a row.
     * For example, in the iboffload case, on the first call one
     * will want to initialize the MWR, and start to instantiate
     * it, but only post it at the end of the last call.
     */
    /* The index of this function in a sequence of consecutive
     * functions from the same bcol
     */
    int index_in_consecutive_same_bcol_calls;

    /* number of times functions from this bcol are
     * called in order
     */
    int n_of_this_type_in_a_row;

    /*
     * number of times functions from this module are called in the
     * collective operation.
     */
    int n_of_this_type_in_collective;
    int index_of_this_type_in_collective;
};
typedef struct mca_bcol_base_function_t mca_bcol_base_function_t;
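A sketch of the consecutive-call counters in use, along the lines the comment above describes for iboffload (the surrounding code is hypothetical):

/* Hypothetical use of the consecutive-call counters: act on the first
 * and last invocation of a run of calls from the same bcol module. */
if (0 == const_args->index_in_consecutive_same_bcol_calls) {
    /* first call in the run: set up shared state (e.g., an MWR) */
}
if (const_args->index_in_consecutive_same_bcol_calls ==
    const_args->n_of_this_type_in_a_row - 1) {
    /* last call in the run: post the accumulated work */
}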
struct mca_bcol_base_descriptor_t {
    opal_free_list_item_t super;
    /* Vasily: will be described in the future */
};
typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t;

static inline __opal_attribute_always_inline__ size_t
mca_bcol_base_get_buff_length(ompi_datatype_t *dtype, int count)
{
    ptrdiff_t lb, extent;
    ompi_datatype_get_extent(dtype, &lb, &extent);

    return (size_t) (extent * count);
}
#define MCA_BCOL_CHECK_ORDER(module, bcol_function_args)      \
    do {                                                      \
        if (*((module)->next_inorder) !=                      \
             (bcol_function_args)->order_info.order_num) {    \
            return BCOL_FN_NOT_STARTED;                       \
        }                                                     \
    } while (0)

#define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info)     \
    do {                                                      \
        (order_info)->bcols_started++;                        \
        if ((order_info)->n_fns_need_ordering ==              \
            (order_info)->bcols_started) {                    \
            ++(*((module)->next_inorder));                    \
        }                                                     \
    } while (0)
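(The stray semicolons after the two `while (0)` terminators were dropped above; with them, a call site followed by `;` breaks a braceless if/else.) These macros implement the ordering protocol behind `next_inorder`; a sketch of how an ordered bcol entry point might use them, with hypothetical surroundings:

/* Hypothetical ordered entry point: bail out (to be retried later) if it
 * is not yet this collective's turn, otherwise run and bump the counter. */
int ordered_coll_start(bcol_function_args_t *input_args,
                       struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_base_module_t *module = const_args->bcol_module;

    MCA_BCOL_CHECK_ORDER(module, input_args);   /* may return early */

    /* ... post the collective's work here ... */

    MCA_BCOL_UPDATE_ORDER_COUNTER(module, &input_args->order_info);
    return BCOL_FN_STARTED;
}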
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif /* MCA_BCOL_H */
@ -1,66 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

AM_CPPFLAGS = $(bcol_iboffload_CPPFLAGS) $(btl_openib_CPPFLAGS)

sources = \
    bcol_iboffload.h \
    bcol_iboffload_device.h \
    bcol_iboffload_module.c \
    bcol_iboffload_mca.h \
    bcol_iboffload_mca.c \
    bcol_iboffload_endpoint.h \
    bcol_iboffload_endpoint.c \
    bcol_iboffload_frag.h \
    bcol_iboffload_frag.c \
    bcol_iboffload_collfrag.h \
    bcol_iboffload_collfrag.c \
    bcol_iboffload_task.h \
    bcol_iboffload_task.c \
    bcol_iboffload_component.c \
    bcol_iboffload_barrier.c \
    bcol_iboffload_bcast.h \
    bcol_iboffload_bcast.c \
    bcol_iboffload_allgather.c \
    bcol_iboffload_collreq.h \
    bcol_iboffload_collreq.c \
    bcol_iboffload_qp_info.c \
    bcol_iboffload_qp_info.h \
    bcol_iboffload_fanin.c \
    bcol_iboffload_fanout.c \
    bcol_iboffload_allreduce.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_iboffload_DSO
component_install += mca_bcol_iboffload.la
else
component_noinst += libmca_bcol_iboffload.la
endif

# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_iboffload_la_SOURCES = $(sources)
mca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS)
mca_bcol_iboffload_la_LIBADD = $(btl_openib_LIBS) $(bcol_iboffload_LIBS) \
    $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ofacm/libmca_common_ofacm.la \
    $(OMPI_TOP_BUILDDIR)/ompi/mca/common/verbs/libmca_common_verbs.la

noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_iboffload_la_SOURCES = $(sources)
libmca_bcol_iboffload_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) $(bcol_iboffload_LDFLAGS)
@ -1,765 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2012 Los Alamos National Security, LLC.
 *                    All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_H
#define MCA_BCOL_IBOFFLOAD_H

#include "ompi_config.h"

#include <stdio.h>
#include <assert.h>

#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>

#include "ompi/mca/mca.h"

#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/datatype/ompi_datatype_internal.h"

#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"

#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"

#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"

#include "ompi/mca/common/ofacm/connect.h"

#include "bcol_iboffload_qp_info.h"

BEGIN_C_DECLS

#define IMM_RDMA 1
#define INLINE 1
#define NO_INLINE 0

#define MCA_IBOFFLOAD_CALC_SIZE_EXT 8
#define MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE 8
#define MCA_IBOFFLOAD_CACHE_LINE_SIZE 128

#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC_SEND
#else
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC
#endif

/* 0 - barrier rdma info
   1 - ML rdma info */
#define MAX_REMOTE_RDMA_INFO 2
/* forward declarations */
struct mca_bcol_iboffload_module_t;
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_endpoint_t;
struct mca_bcol_iboffload_frag_t;
struct mca_bcol_iboffload_task_t;
struct mca_bcol_iboffload_qp_info_t;
struct mca_bcol_iboffload_collfrag_t;
struct mca_bcol_iboffload_algth_lst_t;
struct mca_bcol_iboffload_device_t;

typedef int (*mca_bcol_iboffload_coll_algth_fn_t) (
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

struct mca_bcol_iboffload_rdma_info_t {
    uint64_t addr;
    uint32_t rkey;
    uint32_t lkey;
};
typedef struct mca_bcol_iboffload_rdma_info_t mca_bcol_iboffload_rdma_info_t;

struct mca_bcol_iboffload_rdma_buffer_desc_t {
    void *data_addr;            /* buffer address */
    uint64_t generation_number; /* my generation */
    uint64_t bank_index;        /* my bank */
    uint64_t buffer_index;      /* my buffer index */
};
typedef struct mca_bcol_iboffload_rdma_buffer_desc_t mca_bcol_iboffload_rdma_buffer_desc_t;

struct mca_bcol_iboffload_rdma_block_desc_t {
    /* number of memory banks */
    uint32_t num_banks;
    /* number of buffers per bank */
    uint32_t num_buffers_per_bank;
    /* size of a payload buffer */
    uint32_t size_buffer;
    /* data offset from ML */
    uint32_t data_offset;
    /* pointer to the initialized buffer descriptors */
    mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rdma_block_desc_t mca_bcol_iboffload_rdma_block_desc_t;

/* Information that we need to keep in order to access remote
   memory. For each remote peer (endpoint) we will keep this
   structure */
struct mca_bcol_iboffload_rem_rdma_block_t {
    /* IB related information first */
    mca_bcol_iboffload_rdma_info_t ib_info;

    mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rem_rdma_block_t mca_bcol_iboffload_rem_rdma_block_t;
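The block descriptor flattens a two-level bank/buffer layout into one descriptor array; a plausible indexing helper follows. The bank-major layout is an assumption consistent with the fields above, not something the removed code guarantees:

/* Hypothetical helper: locate the descriptor for (bank, buffer), assuming
 * descriptors are stored bank-major in the rdma_desc array. */
static inline mca_bcol_iboffload_rdma_buffer_desc_t *
rdma_desc_at(mca_bcol_iboffload_rdma_block_desc_t *bdesc,
             uint32_t bank, uint32_t buffer)
{
    assert(bank < bdesc->num_banks && buffer < bdesc->num_buffers_per_bank);
    return &bdesc->rdma_desc[bank * bdesc->num_buffers_per_bank + buffer];
}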
enum {
    MCA_BCOL_IBOFFLOAD_BK_COUNTER_INDEX = 0,
    MCA_BCOL_IBOFFLOAD_BK_SYNC_INDEX,
    MCA_BCOL_IBOFFLOAD_BK_LAST
};

/* Information that we need to keep in order to access and
   track local memory that is used as source and destination
   for RDMA operations */
struct mca_bcol_iboffload_local_rdma_block_t {
    /* sync counter; keeps the id of the next bank to start */
    int sync_counter;
    /* Counter for released ml buffers */
    int *bank_buffer_counter[MCA_BCOL_IBOFFLOAD_BK_LAST];
    /* IB related information first */
    struct mca_bcol_iboffload_rdma_info_t ib_info;
    /* back pointer to the original ML memory descriptor */
    struct mca_bcol_base_memory_block_desc_t *ml_mem_desc;
    /* Pasha: do we really need this one? */
    /* caching the ml memory descriptor configuration locally */
    mca_bcol_iboffload_rdma_block_desc_t bdesc;
};
typedef struct mca_bcol_iboffload_local_rdma_block_t mca_bcol_iboffload_local_rdma_block_t;

struct mca_bcol_iboffload_recv_wr_manager {
    opal_mutex_t lock;
    /** Array of ready-to-use receive work requests.
     * It is a 2-dimensional array, since for each
     * qp size we want to keep separate recv wrs */
    struct ibv_recv_wr **recv_work_requests;
};
typedef struct mca_bcol_iboffload_recv_wr_manager mca_bcol_iboffload_recv_wr_manager;
/**
 * Structure to hold the basic shared memory coll component. First it holds the
 * base coll component, and then holds a bunch of
 * sm-coll-component-specific stuff (e.g., current MCA param
 * values).
 */
struct mca_bcol_iboffload_component_t {
    /** Base coll component */
    mca_bcol_base_component_2_0_0_t super;
    /** Enable/disable verbose mode */
    int verbose;
    int num_qps;
    /** Whether we want a warning if a non-default GID prefix is not configured
        on a multiport setup */
    bool warn_default_gid_prefix;
    /** Whether we want a warning if the user specifies a non-existent
        device and/or port via the bcol_iboffload_if_[in|ex]clude MCA params */
    bool warn_nonexistent_if;
    /** initial size of free lists */
    int free_list_num;
    /** maximum size of free lists */
    int free_list_max;
    /** number of elements to alloc when growing free lists */
    int free_list_inc;
    /** name of the ib memory pool */
    char* mpool_name;
    /** max outstanding CQEs on the CQ */
    int cq_size;
    /** Max size of inline data */
    unsigned int max_inline_data;
    /** IB partition definition */
    uint32_t pkey_val;
    /** Outstanding atomic reads */
    unsigned int qp_ous_rd_atom;
    /** IB MTU */
    int mtu;
    /** Recv not ready timer */
    int min_rnr_timer;
    /** IB timeout */
    int timeout;
    /** IB retry count */
    int retry_count;
    /** Recv not ready retry count */
    int rnr_retry;
    /** IB maximum pending RDMA */
    int max_rdma_dst_ops;
    /** IB Service level (QOS) */
    int service_level;
    /** Preferred communication buffer alignment in bytes (must be a power of two) */
    int buffer_alignment;
    /** Max number of tasks for the MQ */
    int max_mqe_tasks;
    /** Max MQ size */
    int max_mq_size;
    /** HCA/Port include/exclude lists */
    char *if_include;
    char **if_include_list;
    char *if_exclude;
    char **if_exclude_list;
    /** Dummy argv-style list; a copy of names from the
        if_[in|ex]clude list that we use for error checking (to ensure
        that they all exist) */
    char **if_list;
    /** Array of ibv devices */
    struct ibv_device **ib_devs;
    /** device count */
    int num_devs;
    /** MCA param bcol_iboffload_receive_queues */
    char *receive_queues;
    /** Common info about all kinds of QPs on each iboffload module */
    struct mca_bcol_iboffload_qp_info_t qp_infos[MCA_BCOL_IBOFFLOAD_QP_LAST];
    /** Array of iboffload devices */
    opal_pointer_array_t devices;
    /** Free list of collfrag descriptors */
    ompi_free_list_t collfrags_free;
    /** Free list of outstanding collective operations */
    ompi_free_list_t collreqs_free;
    /** Free list of free task operations */
    ompi_free_list_t tasks_free;
    /** Free list of free calc task operations */
    ompi_free_list_t calc_tasks_free;
    /** Free list of empty frags that do not keep any
        registration information */
    ompi_free_list_t ml_frags_free;
    /** Recv work request manager */
    mca_bcol_iboffload_recv_wr_manager recv_wrs;
    /** We allocate some resources on the component
     * when creating the first iboffload module,
     * and set this flag to true */
    bool init_done;
    /** Maximum number of fragments of the same collective request that can be sent in parallel */
    unsigned int max_pipeline_depth;
    /** array mapping Open MPI reduction operators to MVerbs reduction operators */
    enum ibv_m_wr_calc_op map_ompi_to_ib_calcs[OMPI_OP_NUM_OF_TYPES];
    /** array mapping Open MPI data types to MVerbs data types */
    enum ibv_m_wr_data_type map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
    /** The order of the exchange tree */
    int exchange_tree_order;
    /** Knomial tree order */
    int knomial_tree_order;
    /** K-nomial radix */
    int k_nomial_radix;
    /** Maximum number of polls for a completion check */
    int max_progress_pull;
    /** Barrier function selector */
    int barrier_mode;
    /** MCA params for selecting Bruck's alltoall algorithms */
    int use_brucks_smsg_alltoall_rdma;
    int use_brucks_smsg_alltoall_sr;
    /** radix of the small-data alltoall Bruck-like algorithm */
    int k_alltoall_bruck_radix;
    /** alltoall small data buffer alignment */
    int tmp_buf_alignment;
};

/**
 * Convenience typedef
 */
typedef struct mca_bcol_iboffload_component_t mca_bcol_iboffload_component_t;
/* List of all algorithms that we use */
enum {
    FANIN_ALG,
    FANOUT_ALG,
    RECURSIVE_DOUBLING_BARRIER_ALG,
    RECURSIVE_KNOMIAL_BARRIER_ALG,
    RECURSIVE_DOUBLING_ALLREDUCE_ALG,
    RECURSIVE_DOUBLING_REDUCE_ALG,
    RECURSIVE_DOUBLING_TREE_BCAST,
    ALL_ENDPOINTS, /* connected to all peers */
    ALLGATHER_KNOMIAL_ALG,
    ALLGATHER_NEIGHBOR_ALG,
    REMOTE_EXCHANGE_ALG,
    LAST_ALG
};

struct mca_bcol_iboffload_port_t {
    int id;             /** Port number on the device: 1 or 2 */
    int stat;           /** Port status - Active, Init, etc. */
    enum ibv_mtu mtu;   /** MTU on this port */
    uint64_t subnet_id; /** Subnet id for the port */
    uint16_t lid;
    uint16_t lmc;
};
typedef struct mca_bcol_iboffload_port_t mca_bcol_iboffload_port_t;

enum {
    COLL_MQ = 0,
    SERVICE_MQ,
    BCOL_IBOFFLOAD_MQ_NUM
};
struct mca_bcol_iboffload_module_t {
    /* base structure */
    mca_bcol_base_module_t super;

    /* size */
    int group_size;
    int log_group_size;

    /* size of each memory segment */
    size_t segment_size;

    /* collective tag */
    long long collective_tag;

    /* pointer to the device */
    struct mca_bcol_iboffload_device_t *device;

    /* cached port number */
    uint32_t port;

    /* Connecting iboffload with ibnet module information */
    /* pointer to the sbgp ibnet */
    mca_sbgp_ibnet_module_t *ibnet;

    /* connection group index for the ibnet */
    int cgroup_index;

    /* array of endpoints */
    struct mca_bcol_iboffload_endpoint_t **endpoints;

    /* Size of the endpoints array */
    int num_endpoints;

    /* cached port subnet id and lid;
     * the same information we have on the device */
    uint64_t subnet_id;
    uint16_t lid;

    /* Pointers to the management queues */
    struct mqe_context *mq[BCOL_IBOFFLOAD_MQ_NUM];
    int mq_credit[BCOL_IBOFFLOAD_MQ_NUM];

    /* pending list of collfrags */
    opal_list_t collfrag_pending;

    /* recursive-doubling tree node */
    netpatterns_pair_exchange_node_t recursive_doubling_tree;

    /* N exchange tree */
    netpatterns_pair_exchange_node_t n_exchange_tree;

    /* Knomial exchange tree */
    netpatterns_k_exchange_node_t knomial_exchange_tree;

    /* Knomial allgather tree */
    netpatterns_k_exchange_node_t knomial_allgather_tree;

    /* This array keeps the pre-calculated task consumption per
     * algorithm
     */
    uint32_t alg_task_consump[LAST_ALG];

    /* Pointer to a function implementing a barrier algorithm */
    mca_bcol_iboffload_coll_algth_fn_t barrier_algth;

    /* Pointer to a function implementing a fanin algorithm */
    mca_bcol_iboffload_coll_algth_fn_t fanin_algth;

    /* Pointer to a function implementing a fanout algorithm */
    mca_bcol_iboffload_coll_algth_fn_t fanout_algth;

    /* Pointer to a function implementing an allreduce algorithm */
    mca_bcol_iboffload_coll_algth_fn_t allreduce_algth;

    /* Pointer to a function implementing a non-blocking memory synchronization algorithm */
    mca_bcol_iboffload_coll_algth_fn_t memsync_algth;

    /* rdma block memory information */
    mca_bcol_iboffload_local_rdma_block_t rdma_block;

    /* The largest exponent such that 1 << power_of_2
       is not larger than the group size */
    int power_of_2;

    /* The largest power-of-two number which is not larger than the group size */
    int power_of_2_ranks;

    /* Connection status array */
    bool connection_status[LAST_ALG];

    /* map from communicator ranks to ibsubnet */
    int *comm_to_ibnet_map;

    /* order preserving value */
    int64_t prev_sequence_num;

    /* Temp iovecs to send the data fragments -- alltoall Bruck's */
    struct iovec *alltoall_iovec;
    struct iovec *alltoall_recv_iovec;

    /* tree radix for the knomial Bruck small data alltoall */
    int k_alltoall_bruck_radix;

    /* Temp buffer alignment for the knomial Bruck small data alltoall */
    int tmp_buf_alignment;

    /* Free task list with sge arrays */
    ompi_free_list_t iovec_tasks_free;
};

typedef struct mca_bcol_iboffload_module_t mca_bcol_iboffload_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_module_t);
/**
 * Global component instance
 */
OMPI_MODULE_DECLSPEC
extern mca_bcol_iboffload_component_t mca_bcol_iboffload_component;

static inline int mca_bcol_iboffload_err(const char* fmt, ...)
{
    va_list list;
    int ret;

    va_start(list, fmt);
    ret = vfprintf(stderr, fmt, list);
    va_end(list);
    return ret;
}
#define MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(ompi_op, c_type, l_operand, r_operand, result)  \
do {                                                                                         \
    switch (ompi_op) {                                                                       \
        case OMPI_OP_MAX:                                                                    \
            *((c_type *)&result) = ((*(c_type *)&(l_operand) > *(c_type *)&(r_operand)) ?    \
                                     *(c_type *)&(l_operand) : *(c_type *)&(r_operand));     \
            break;                                                                           \
        case OMPI_OP_MIN:                                                                    \
            *((c_type *)&result) = ((*(c_type *)&(l_operand) < *(c_type *)&(r_operand)) ?    \
                                     *(c_type *)&(l_operand) : *(c_type *)&(r_operand));     \
            break;                                                                           \
        case OMPI_OP_SUM:                                                                    \
            *((c_type *)&result) = (*((c_type *)&(l_operand)) + *((c_type *)&(r_operand)));  \
            break;                                                                           \
        default:                                                                             \
            break;                                                                           \
    }                                                                                        \
} while (0)
#define MCA_BCOL_IBOFFLOAD_PKEY_MASK 0x7fff
#define MCA_BCOL_IBOFFLOAD_DEFAULT_GID_PREFIX 0xfe80000000000000ll

#define IBOFFLOAD_ERROR(args)                                     \
    do {                                                          \
        mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ",     \
            ompi_process_info.nodename,                           \
            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),                   \
            __FILE__, __LINE__, __func__);                        \
        mca_bcol_iboffload_err args;                              \
        mca_bcol_iboffload_err("\n");                             \
    } while(0)

#if OPAL_ENABLE_DEBUG
#define IBOFFLOAD_VERBOSE(level, args)                            \
    do {                                                          \
        if (mca_bcol_iboffload_component.verbose >= level) {      \
            mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \
                ompi_process_info.nodename,                       \
                OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),               \
                __FILE__, __LINE__, __func__);                    \
            mca_bcol_iboffload_err args;                          \
            mca_bcol_iboffload_err("\n");                         \
        }                                                         \
    } while(0)
#else
#define IBOFFLOAD_VERBOSE(level, args)
#endif
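Because `args` expands to a parenthesized printf-style argument list, call sites pass it with double parentheses, as the code later in this file does. A usage sketch (the variable names here are hypothetical):

/* The extra parentheses make the whole printf-style argument list a
 * single macro argument. */
IBOFFLOAD_VERBOSE(10, ("posted %d tasks on QP 0x%x", n_tasks, qp_num));
IBOFFLOAD_ERROR(("device %s returned %d", dev_name, rc));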
#define MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(coll_req, coll_work_req)    \
    do {                                                                  \
        opal_list_append(&(coll_req)->work_requests,                      \
                         (opal_list_item_t*) (coll_work_req));            \
        (coll_work_req)->coll_full_req = (coll_req);                      \
    } while(0)
/* Vasily: will be removed soon */
#define APPEND_TO_TASKLIST(task_ptr_to_set, event, last_event_type)       \
    do {                                                                  \
        *task_ptr_to_set = &(event)->element;                             \
        last_event_type = &(event)->element;                              \
        task_ptr_to_set = &((event)->element.next);                       \
    } while(0)

#define MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(task_ptr_to_set, task)     \
    do {                                                                  \
        *task_ptr_to_set = (task);                                        \
        task_ptr_to_set = &((task)->next_task);                           \
    } while(0)

#define MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(task_ptr_to_set, event) \
    do {                                                                  \
        *task_ptr_to_set = &(event)->element;                             \
        task_ptr_to_set = &((event)->element.next);                       \
    } while(0)

#define BCOL_IS_COMPLETED(req) (((req)->n_frag_mpi_complete == (req)->n_fragments) && \
                                ((req)->n_fragments > 0))

#define BCOL_AND_NET_ARE_COMPLETED(req) (BCOL_IS_COMPLETED(req) && \
                                         ((req)->n_frag_net_complete == (req)->n_fragments))

/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(module, mq_index, num_of_credits) \
    (((module)->mq_credit[mq_index] -= (num_of_credits)) < 0 ? false : true)
/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(module, mq_index, num_of_credits) \
    ((module)->mq_credit[mq_index] += (num_of_credits))

#define BCOL_IBOFFLOAD_IS_FIRST_CALL(args) (0 == (args)->index_in_consecutive_same_bcol_calls)

#define BCOL_IBOFFLOAD_IS_LAST_CALL(args) (((args)->n_of_this_type_in_collective - 1) == \
                                           (args)->index_of_this_type_in_collective)

#define BCOL_IBOFFLOAD_READY_TO_POST(args) (((args)->n_of_this_type_in_a_row - 1) == \
                                            (args)->index_in_consecutive_same_bcol_calls)
/*
 * bcol module functions
 */

int mca_bcol_iboffload_rec_doubling_start_connections(struct mca_bcol_iboffload_module_t *iboffload);

/* RDMA addr exchange with a remote proc */
int mca_bcol_iboffload_exchange_rem_addr(struct mca_bcol_iboffload_endpoint_t *ep);

/* Progress function */
int mca_bcol_iboffload_component_progress(void);

/* Register memory */
int mca_bcol_iboffload_register_mr(void *reg_data, void * base, size_t size,
                                   mca_mpool_base_registration_t *reg);

/* Deregister memory */
int mca_bcol_iboffload_deregister_mr(void *reg_data, mca_mpool_base_registration_t *reg);

/*
 * This function is used to create a CQ for this module.
 */
int mca_bcol_iboffload_adjust_cq(struct mca_bcol_iboffload_device_t *device,
                                 struct ibv_cq **ib_cq);
/*
 * Query to see if the component is available for use,
 * and can satisfy the thread and progress requirements
 */
int mca_bcol_iboffload_init_query(bool enable_progress_threads,
                                  bool enable_mpi_threads);

/* Interface to set up the allgather tree */
int mca_bcol_iboffload_setup_knomial_tree(mca_bcol_base_module_t *super);

/*
 * Query to see if the module is available for use on
 * the given communicator, and if so, what its priority is.
 */
mca_bcol_base_module_t **
mca_bcol_iboffload_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);

int
mca_bcol_iboffload_free_tasks_frags_resources(
    struct mca_bcol_iboffload_collfrag_t *collfrag,
    ompi_free_list_t *frags_free);

/**
 * Shared memory blocking barrier
 */

int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
                                             struct mca_bcol_base_function_t *const_args);

int mca_bcol_iboffload_barrier_intra_recursive_doubling_start(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_barrier_intra_recursive_knomial_start(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_barrier_intra_recursive_doubling(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_nb_memory_service_barrier_start(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super);

int mca_bcol_iboffload_new_style_fanin_first_call(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_new_style_fanout_first_call(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request);

int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args,
                                                       struct mca_bcol_base_function_t *const_args);

int mca_bcol_iboffload_coll_support_all_types(bcol_coll coll_name);
int mca_bcol_iboffload_coll_supported(int op, int dtype, bcol_elem_type elem_type);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_fls(int num)
{
    int i = 1;
    int j = 0;

    if (0 == num) {
        return 0;
    }

    while (i < num) {
        i <<= 1;
        j++;
    }

    if (i > num) {
        j--;
    }

    return j;
}

#define BCOL_IBOFFLOAD_IS_EVEN(num) (!((num) & 1))
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_ffs(int num)
{
    int j = 0;

    if (0 == num) {
        return 0;
    }

    while (BCOL_IBOFFLOAD_IS_EVEN(num)) {
        num >>= 1;
        j++;
    }

    return j;
}
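The names suggest the classic find-last-set/find-first-set pair; their exact semantics, worked by hand from the loops above:

/* Semantics of the two helpers, for num > 0:
 *   mca_bcol_iboffload_fls(1)  == 0,  mca_bcol_iboffload_fls(12) == 3
 *     -> floor(log2(num)), i.e. the index of the highest set bit.
 *   mca_bcol_iboffload_ffs(8)  == 3,  mca_bcol_iboffload_ffs(12) == 2
 *     -> the number of trailing zero bits.
 */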
#if OPAL_ENABLE_DEBUG

/* Post task list MQ */
#define IS_IMM(a) ((a) & MQE_WR_FLAG_IMM_EXE)
#define IS_SIG(a) ((a) & MQE_WR_FLAG_SIGNAL)
#define IS_BLK(a) ((a) & MQE_WR_FLAG_BLOCK)

int task_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);
int wait_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);

#endif
/* MQ posting function */
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_mqe_tasks(
        mca_bcol_iboffload_module_t *iboffload,
        struct mqe_task *head_mqe)
{
    int rc;
    struct mqe_task *bad_mqe = NULL;

#if OPAL_ENABLE_DEBUG /* debug code */
    struct mqe_task *curr_mqe_task = NULL;
    int send_count = 0, recv_count = 0, wait_count = 0;

    curr_mqe_task = head_mqe;
    IBOFFLOAD_VERBOSE(10, ("Processing MQE Head with addr %p <START>\n",
                          (uintptr_t) (void*) curr_mqe_task));

    while (NULL != curr_mqe_task) {
        switch(curr_mqe_task->opcode) {
            case MQE_WR_SEND:
                IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: send on QP 0x%x\n"
                                      "rank %d, sg_entry: addr %p LEN %d lkey %u, flag[%d-%d-%d]\n",
                                      (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
                                      curr_mqe_task->post.qp->qp_num,
                                      task_to_rank(iboffload, curr_mqe_task),
                                      curr_mqe_task->post.send_wr->sg_list->addr,
                                      curr_mqe_task->post.send_wr->sg_list->length,
                                      curr_mqe_task->post.send_wr->sg_list->lkey,
                                      IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));

                ++send_count;
                break;
            case MQE_WR_RECV:
                IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: recv on QP 0x%x rank %d flag[%d-%d-%d]\n",
                                      (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
                                      curr_mqe_task->post.qp->qp_num, task_to_rank(iboffload, curr_mqe_task),
                                      IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));

                ++recv_count;
                break;
            case MQE_WR_CQE_WAIT:
                IBOFFLOAD_VERBOSE(10, ("Posting task %p id %x: wait on CQ %p for rank %d num of waits %d flag[%d-%d-%d]\n",
                                      (void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
                                      (void*) curr_mqe_task->wait.cq, wait_to_rank(iboffload, curr_mqe_task),
                                      curr_mqe_task->wait.count,
                                      IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));

                wait_count += curr_mqe_task->wait.count;
                break;
            default:
                IBOFFLOAD_ERROR(("Fatal error, unknown packet type %d\n",
                                curr_mqe_task->opcode));
                return OMPI_ERROR;
        }

        /* pointer to the next task */
        curr_mqe_task = curr_mqe_task->next;
    }

    IBOFFLOAD_VERBOSE(10, ("wait[%d] send[%d] recv[%d]\n",
                          wait_count, send_count, recv_count));
#endif

    IBOFFLOAD_VERBOSE(10, ("Posting MQ %p <DONE>\n", (uintptr_t) head_mqe->wr_id));

    rc = mqe_post_task(iboffload->mq[0], head_mqe, &bad_mqe);
    if (OPAL_UNLIKELY(0 != rc)) {
        IBOFFLOAD_ERROR(("mqe_post_task failed, errno says: %s,"
                        " the return code is [%d]\n",
                        strerror(errno), rc));

        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int lognum(int n) {
    int count = 1, lognum = 0;

    while (count < n) {
        count = count << 1;
        lognum++;
    }

    return lognum;
}
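A quick check of `lognum`'s behavior, worked by hand from the loop above: it rounds up, unlike `mca_bcol_iboffload_fls`, which rounds down.

/* lognum(1) == 0, lognum(5) == 3, lognum(8) == 3: ceil(log2(n)) for n > 0. */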
END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_H */
The diff for this file is not shown because of its large size.
The diff for this file is not shown because of its large size.
@ -1,934 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
static int mca_bcol_iboffload_barrier_init(
                bcol_function_args_t *input_args,
                mca_bcol_iboffload_module_t *iboffload,
                collective_message_completion_callback_function cb_fn,
                struct mca_bcol_iboffload_collreq_t **coll_request);

/**
 * Start barrier
 */

int mca_bcol_iboffload_barrier_intra_recursive_doubling(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    /* local variables */
    mca_bcol_iboffload_task_t *send_task = NULL,
                              *wait_task = NULL;

    struct mqe_task **mqe_ptr_to_set = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;

    struct mqe_task *last_wait = NULL, /* we need to ask for completion on the last wait */
                    *last_send = NULL; /* if there is no wait, we need to ask for completion on the last send */

    int rc, exchange, extra_rank, pair_rank;

    mca_bcol_iboffload_frag_t *send_fragment = NULL,
                              *preposted_recv_frag = NULL;

    netpatterns_pair_exchange_node_t *my_exchange_node =
                                     &iboffload->recursive_doubling_tree;

    IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_barrier_intra_recursive_doubling.\n"));

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
                        opal_list_get_last(&coll_request->work_requests);
    /* Set mq credits */
    coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG];

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));

        goto out_of_resources;
    }

    coll_fragment->alg = RECURSIVE_DOUBLING_BARRIER_ALG;

    /*
     * NOTE: need to generate a template if this will be a multiple fragment
     * message. This way we can progress the collective w/o knowing its
     * type - actually, this is not the case for barrier, but just a note
     * to remind us that we need to generalize this.
     */

    mqe_ptr_to_set = &coll_fragment->to_post;

    /*
     * Fill in the communication pattern
     */

    /*
     * If non power of 2, may need to wait for a message from the "extra" proc.
     */

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            /* I will participate in the exchange (of the algorithm) -
             * wait for a signal from the extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                            iboffload, extra_rank, coll_request->qp_index);

            if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
                IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                      "Failed to get a preposted recv frag.\n"));
                goto out_of_resources;
            }

            wait_task = mca_bcol_iboffload_get_wait_task(iboffload,
                            extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL);
            if (OPAL_UNLIKELY(NULL == wait_task)) {
                IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                      "Failed to get a wait task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
        } else {
            /* I will not participate in the exchange - so just "register" as here */
            extra_rank = my_exchange_node->rank_extra_source;
            /* send - no need to send any data, in-order delivery */
            send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                                extra_rank, coll_request->qp_index, 0,
                                0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

            send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank,
                                coll_request->qp_index, send_fragment, coll_fragment, INLINE);
            if (OPAL_UNLIKELY(NULL == send_task)) {
                IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                      "Failed to get a send task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
        }
    }

    /* loop over exchange send/recv pairs */
    for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) {
        /* rank of the exchange partner */
        pair_rank = my_exchange_node->rank_exchanges[exchange];
        /* post send */
        send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                            pair_rank, coll_request->qp_index, 0,
                            0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

        assert(NULL != send_fragment);

        send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank,
                            coll_request->qp_index,
                            send_fragment, coll_fragment, INLINE);
        if (OPAL_UNLIKELY(NULL == send_task)) {
            IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                  "Failed to get a send task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);

        /* post wait */
        preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                        iboffload, pair_rank, coll_request->qp_index);
        if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
            IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                  "Failed to get a preposted recv frag.\n"));
            goto out_of_resources;
        }

        wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1,
                            preposted_recv_frag,
                            coll_request->qp_index, NULL);
        if (OPAL_UNLIKELY(NULL == wait_task)) {
            IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                  "Failed to get a wait task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
    }

    /* if non power of 2, may need to send a message to the "extra" proc */
    if (0 < my_exchange_node->n_extra_sources) {
        if (EXTRA_NODE == my_exchange_node->node_type) {
            /* I will not participate in the exchange -
             * wait for a signal from the exchange process */
            extra_rank = my_exchange_node->rank_extra_source;
            /* post wait */
            preposted_recv_frag =
                mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank,
                                                           coll_request->qp_index);
            if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
                IBOFFLOAD_VERBOSE(10, ("Waiting for the 'extra' node: "
                                      "Failed to get a preposted recv frag.\n"));
                goto out_of_resources;
            }

            wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1,
                                                         preposted_recv_frag,
                                                         coll_request->qp_index, NULL);
            if (OPAL_UNLIKELY(NULL == wait_task)) {
                IBOFFLOAD_VERBOSE(10, ("Waiting for the 'extra' node: "
                                      "Failed to get a wait task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);

        } else {
            /* I will participate in the exchange -
             * send a signal to the extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                                extra_rank, coll_request->qp_index, 0,
                                0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

            send_task = mca_bcol_iboffload_get_send_task(
                                iboffload, extra_rank,
                                coll_request->qp_index,
                                send_fragment, coll_fragment, INLINE);
            if (OPAL_UNLIKELY(NULL == send_task)) {
                IBOFFLOAD_VERBOSE(10, ("Sending to the 'extra' node: "
                                      "Failed to get a send task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
        }
    }

    /* Fill in the rest of the coll_fragment */
    IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n"));
    /* end of list */
    *mqe_ptr_to_set = NULL;

    /* finish initializing the full message descriptor */
    coll_request->n_fragments = 1;
    coll_request->n_frags_sent = 1;

    coll_request->n_frag_mpi_complete = 0;
    coll_request->n_frag_net_complete = 0;

    coll_request->user_handle_freed = false;

    last_wait->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) {
        rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
            /* Note: need to clean up */
            return rc;
        }

        MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
    } else {
        /* Special flow for the ML service barrier; only this function is supposed
           to post service requests */
        struct mqe_task *bad_mqe = NULL;
        assert(MCA_BCOL_IBOFFLOAD_QP_SYNC == coll_request->qp_index);
        /* Post to the special service MQ - 1 */
        rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe);
        if (OPAL_UNLIKELY(0 != rc)) {
            IBOFFLOAD_ERROR(("mqe_post_task failed on device (%s), errno says: %s,"
                            " the return code is [%d]\n",
                            ibv_get_device_name(iboffload->device->dev.ib_dev),
                            strerror(errno), rc));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
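The exchange schedule consumed above comes from the netpatterns recursive-doubling setup; for a power-of-two group the partner at each step is simply the rank with one bit flipped. A standalone sketch of that pairing rule, purely for illustration (it is not the netpatterns code):

/* Illustration only: recursive-doubling partners for a power-of-two
 * group of size n. At step s, rank r pairs with r XOR (1 << s). */
#include <stdio.h>

int main(void)
{
    const int n = 8; /* group size, power of two */
    for (int step = 0; (1 << step) < n; ++step) {
        printf("step %d:", step);
        for (int rank = 0; rank < n; ++rank) {
            printf(" %d<->%d", rank, rank ^ (1 << step));
        }
        printf("\n");
    }
    return 0;
}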
int mca_bcol_iboffload_barrier_intra_recursive_doubling_start(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc;

    rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    iboffload->barrier_algth =
        mca_bcol_iboffload_barrier_intra_recursive_doubling;
    return
        mca_bcol_iboffload_barrier_intra_recursive_doubling(iboffload, coll_request);
}

int mca_bcol_iboffload_nb_memory_service_barrier_start(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc;

    rc = mca_bcol_iboffload_rec_doubling_start_connections(iboffload);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    iboffload->memsync_algth =
        mca_bcol_iboffload_barrier_intra_recursive_doubling;

    return
        mca_bcol_iboffload_barrier_intra_recursive_doubling
            (iboffload, coll_request);
}
int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args,
                                                       struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int rc;
    mca_bcol_iboffload_collreq_t *coll_request;
    mca_bcol_iboffload_module_t *iboffload =
                    (mca_bcol_iboffload_module_t *) const_args->bcol_module;
    /*
     * recursive doubling
     */

    IBOFFLOAD_VERBOSE(10, ("Memory synchronization barrier was started\n"));

    /* init the barrier collective request */
    rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Got an error from mca_bcol_iboffload_barrier_init"));
        return rc;
    }

    /* set the qp index to the special qp that is used only for synchronization */
    coll_request->qp_index = MCA_BCOL_IBOFFLOAD_QP_SYNC;
    /* overwrite the mq index to run over the service setup */
    coll_request->first_collfrag.mq_index = SERVICE_MQ;

    /* start the barrier */
    rc = iboffload->memsync_algth(iboffload, coll_request);
    if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
        return rc;
    }

    /* the barrier completes later - progress releases the full request descriptors */
    IBOFFLOAD_VERBOSE(10, ("Memory synchronization barrier is in flight\n"));

    /* done */
    return BCOL_FN_STARTED;
}
/* Recursive K-ing */
static int recursive_knomial_start_connections(struct mca_bcol_iboffload_module_t *iboffload)
{
    netpatterns_k_exchange_node_t *my_exchange_node =
                                   &iboffload->knomial_exchange_tree;
    int k, i, n_exchanges = my_exchange_node->n_exchanges,
        **exchanges = my_exchange_node->rank_exchanges,
        n_extra_src = my_exchange_node->n_extra_sources,
        tree_order = my_exchange_node->tree_order - 1,
        rank_extra_src;

    mca_bcol_iboffload_endpoint_t *ep;

    iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 0;

    IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, "
                          "num of endpoints = %d, iboffload module - %p"
                          " extra n %d, n_exchanges %d",
                          iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload,
                          n_extra_src, n_exchanges));
    if (0 < n_extra_src) {
        for (k = 0; k < n_extra_src; k++) {
            iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* one send task, one wait */
            rank_extra_src = my_exchange_node->rank_extra_sources_array[k];
            ep = iboffload->endpoints[rank_extra_src];
            if (iboffload->ibnet->super.my_index < ep->index) {
                while (0 == (ep)->remote_zero_rdma_addr.addr) {
                    opal_progress();
                }
            } else {
                IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
                while (OMPI_SUCCESS !=
                        check_endpoint_state(ep, NULL, NULL)) {
                    opal_progress();
                }
            }
        }
    }

    for (i = 0; i < n_exchanges; ++i) {
        for (k = 0; k < tree_order; k++) {
            iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG] += 2; /* one send task, one wait */
            ep = iboffload->endpoints[exchanges[i][k]];

            IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
            if (iboffload->ibnet->super.my_index < ep->index) {
                while (0 == (ep)->remote_zero_rdma_addr.addr) {
                    opal_progress();
                }
            } else {
                while (OMPI_SUCCESS !=
                        check_endpoint_state(ep, NULL, NULL)) {
                    opal_progress();
                }
            }
        }
    }

    return OMPI_SUCCESS;
}
|
||||
|
||||
static int mca_bcol_iboffload_barrier_intra_recursive_knomial(
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    /* local variables */
    mca_bcol_iboffload_task_t *send_task = NULL,
                              *wait_task = NULL;

    struct mqe_task **mqe_ptr_to_set = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;

    struct mqe_task *last_wait = NULL, /* we need to ask for completion on the last wait */
                    *last_send = NULL; /* if there is no wait, we need to ask for completion on the last send */

    int rc, exchange, extra_rank, pair_rank, k;

    mca_bcol_iboffload_frag_t *send_fragment = NULL,
                              *preposted_recv_frag = NULL;

    netpatterns_k_exchange_node_t *my_exchange_node =
                                   &iboffload->knomial_exchange_tree;

    IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_barrier_intra_recursive_knomial. Node type %d\n", my_exchange_node->node_type));

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
                        opal_list_get_last(&coll_request->work_requests);

    /* Set mq credits */
    coll_fragment->mq_credits = iboffload->alg_task_consump[RECURSIVE_KNOMIAL_BARRIER_ALG];

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                    iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));

        goto out_of_resources;
    }

    coll_fragment->alg = RECURSIVE_KNOMIAL_BARRIER_ALG;

    /*
     * NOTE: need to generate a template if this will be a multi-fragment
     * message. This way we can progress the collective w/o knowing its
     * type - actually, this is not the case for barrier, but just a note
     * to remind us that we need to generalize this.
     */

    mqe_ptr_to_set = &coll_fragment->to_post;

    /*
     * Fill in the communication pattern
     */

    /*
     * If non power of 2, may need to wait for a message from an "extra" proc.
     */

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            /* I will participate in the exchange (of the algorithm) -
             * wait for a signal from the extra process */
            for (k = 0; k < my_exchange_node->n_extra_sources; k++) {
                extra_rank = my_exchange_node->rank_extra_sources_array[k];
                IBOFFLOAD_VERBOSE(10, ("Exchange [ %d ] extra get %d", k, extra_rank));

                preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                        iboffload, extra_rank, coll_request->qp_index);

                if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
                    IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                           "Failed to get preposted recv frag.\n"));
                    goto out_of_resources;
                }

                wait_task = mca_bcol_iboffload_get_wait_task(iboffload,
                        extra_rank, 1, preposted_recv_frag, coll_request->qp_index, NULL);
                if (OPAL_UNLIKELY(NULL == wait_task)) {
                    IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                           "Failed to get wait task.\n"));
                    goto out_of_resources;
                }

                APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
                MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
            }
        } else {
            /* I will not participate in the exchange - just "register" that I am here */
            extra_rank = my_exchange_node->rank_extra_sources_array[0];
            IBOFFLOAD_VERBOSE(10, ("Send to proxy %d", extra_rank));
            /* send - no need to send any data, in-order delivery */
            send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                    extra_rank, coll_request->qp_index, 0,
                    0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

            send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank,
                    coll_request->qp_index, send_fragment, coll_fragment, INLINE);
            if (OPAL_UNLIKELY(NULL == send_task)) {
                IBOFFLOAD_VERBOSE(10, ("Non power of 2 case: "
                                       "Failed to get send task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
        }
    }

    /* loop over exchange send/recv pairs */
    for (exchange = 0; exchange < my_exchange_node->n_exchanges; ++exchange) {
        for (k = 0; k < my_exchange_node->tree_order - 1; k++) {
            /* rank of exchange partner */
            pair_rank = my_exchange_node->rank_exchanges[exchange][k];
            IBOFFLOAD_VERBOSE(10, ("Exchange [ %d, %d ] send to %d", exchange, k, pair_rank));
            /* post send */
            send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                    pair_rank, coll_request->qp_index, 0,
                    0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

            send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank,
                    coll_request->qp_index,
                    send_fragment, coll_fragment, INLINE);
            if (OPAL_UNLIKELY(NULL == send_task)) {
                IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                       "Failed to get send task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
        }

        for (k = 0; k < my_exchange_node->tree_order - 1; k++) {

            pair_rank = my_exchange_node->rank_exchanges[exchange][k];
            IBOFFLOAD_VERBOSE(10, ("Exchange [ %d, %d ] recv %d", exchange, k, pair_rank));
            /* post wait */
            preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                    iboffload, pair_rank, coll_request->qp_index);
            if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
                IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                       "Failed to get preposted recv frag.\n"));
                goto out_of_resources;
            }

            wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1,
                    preposted_recv_frag, coll_request->qp_index, NULL);
            if (OPAL_UNLIKELY(NULL == wait_task)) {
                IBOFFLOAD_VERBOSE(10, ("Exchanging: "
                                       "Failed to get wait task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
        }
    }

    /* if non power of 2, may need to send a message to an "extra" proc */
    if (0 < my_exchange_node->n_extra_sources) {
        if (EXTRA_NODE == my_exchange_node->node_type) {
            /* I will not participate in the exchange -
             * wait for a signal from an exchange process */
            extra_rank = my_exchange_node->rank_extra_sources_array[0];
            IBOFFLOAD_VERBOSE(10, ("Wait from proxy %d", extra_rank));
            /* post wait */
            preposted_recv_frag =
                mca_bcol_iboffload_get_preposted_recv_frag(iboffload, extra_rank,
                        coll_request->qp_index);
            if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) {
                IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
                                       "Failed to get preposted recv frag.\n"));
                goto out_of_resources;
            }

            wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1,
                    preposted_recv_frag,
                    coll_request->qp_index, NULL);
            if (OPAL_UNLIKELY(NULL == wait_task)) {
                IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
                                       "Failed to get wait task.\n"));
                goto out_of_resources;
            }

            APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
            MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);

        } else {
            /* I will participate in the exchange -
             * send a signal to the extra process */
            for (k = 0; k < my_exchange_node->n_extra_sources; k++) {
                extra_rank = my_exchange_node->rank_extra_sources_array[k];
                IBOFFLOAD_VERBOSE(10, ("Exchange [ %d ] extra release %d", k, extra_rank));

                send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                        extra_rank, coll_request->qp_index, 0,
                        0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);

                send_task = mca_bcol_iboffload_get_send_task(
                        iboffload, extra_rank,
                        coll_request->qp_index,
                        send_fragment, coll_fragment, INLINE);
                if (OPAL_UNLIKELY(NULL == send_task)) {
                    IBOFFLOAD_VERBOSE(10, ("Sending to 'extra' node: "
                                           "Failed to get send task.\n"));
                    goto out_of_resources;
                }

                APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
                MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
            }
        }
    }

    /* Fill in the rest of the coll_fragment */
    IBOFFLOAD_VERBOSE(10, ("Fill in the rest of the coll_fragment.\n"));
    /* end of list */
    *mqe_ptr_to_set = NULL;

    /* finish initializing the full message descriptor */
    coll_request->n_fragments = 1;
    coll_request->n_frags_sent = 1;

    coll_request->n_frag_mpi_complete = 0;
    coll_request->n_frag_net_complete = 0;

    coll_request->user_handle_freed = false;

    last_wait->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    if (MCA_BCOL_IBOFFLOAD_QP_SYNC != coll_request->qp_index) {
        rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
            /* Note: need to clean up */
            return rc;
        }

        MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);
    } else {
        /* Special flow for the ML service barrier; only this function is
           supposed to post service requests */
        struct mqe_task *bad_mqe = NULL;
        assert(MCA_BCOL_IBOFFLOAD_QP_SYNC == coll_request->qp_index);
        /* Post to the special service MQ - 1 */
        rc = mqe_post_task(iboffload->mq[1], coll_fragment->to_post, &bad_mqe);
        if (OPAL_UNLIKELY(0 != rc)) {
            IBOFFLOAD_ERROR(("ibv_post_mqe failed on device (%s), errno says: %s,"
                             " the return code is [%d]\n",
                             ibv_get_device_name(iboffload->device->dev.ib_dev),
                             strerror(errno), rc));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}

int mca_bcol_iboffload_barrier_intra_recursive_knomial_start(
                struct mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc;

    rc = recursive_knomial_start_connections(iboffload);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    iboffload->barrier_algth =
        mca_bcol_iboffload_barrier_intra_recursive_knomial;
    return
        mca_bcol_iboffload_barrier_intra_recursive_knomial(iboffload, coll_request);
}

int mca_bcol_iboffload_rec_doubling_start_connections(mca_bcol_iboffload_module_t *iboffload)
{
    netpatterns_pair_exchange_node_t *my_exchange_node =
                                      &iboffload->recursive_doubling_tree;

    int i, n_exchanges = my_exchange_node->n_exchanges,
        *exchanges = my_exchange_node->rank_exchanges,
        n_extra_src = my_exchange_node->n_extra_sources,
        rank_extra_src = my_exchange_node->rank_extra_source;

    mca_bcol_iboffload_endpoint_t *ep;

    IBOFFLOAD_VERBOSE(10, ("\nMy sbgp rank (index) - %d, "
                           "num of endpoints = %d, iboffload module - %p\n",
                           iboffload->ibnet->super.my_index, iboffload->num_endpoints, iboffload));
    if (0 < n_extra_src) {
        iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task, one wait */
        ep = iboffload->endpoints[rank_extra_src];

        if (iboffload->ibnet->super.my_index < ep->index) {
            while (0 == ep->remote_zero_rdma_addr.addr) {
                opal_progress();
            }
        } else {
            IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
            while (OMPI_SUCCESS !=
                            check_endpoint_state(ep, NULL, NULL)) {
                opal_progress();
            }
        }
    }

    for (i = 0; i < n_exchanges; ++i) {
        iboffload->alg_task_consump[RECURSIVE_DOUBLING_BARRIER_ALG] += 2; /* One send task, one wait */
        ep = iboffload->endpoints[exchanges[i]];

        if (iboffload->ibnet->super.my_index < ep->index) {
            while (0 == ep->remote_zero_rdma_addr.addr) {
                opal_progress();
            }
        } else {
            IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index));
            while (OMPI_SUCCESS !=
                            check_endpoint_state(ep, NULL, NULL)) {
                opal_progress();
            }
        }
    }

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_barrier_init(
                bcol_function_args_t *input_args,
                mca_bcol_iboffload_module_t *iboffload,
                collective_message_completion_callback_function cb_fn,
                struct mca_bcol_iboffload_collreq_t **coll_request)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_barrier_init"));

    OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get a coll request from the free list.\n"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    (*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
    (*coll_request)->progress_fn = iboffload->barrier_algth;

    /*
     * For the usual barrier it is NULL; for the memory
     * service barrier we have some work to do.
     */
    (*coll_request)->completion_cb_fn = cb_fn;
    (*coll_request)->order_info = &input_args->order_info;

    (*coll_request)->module = iboffload;
    (*coll_request)->ml_buffer_index = input_args->buffer_index;
    (*coll_request)->buffer_info[SBUF].offset = 0;
    (*coll_request)->buffer_info[RBUF].offset = 0;
    (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;

    input_args->bcol_opaque_data = (void *) (*coll_request);

    /*
     * setup collective work request
     */

    /* get collective frag */
    coll_fragment = &(*coll_request)->first_collfrag;
    mca_bcol_iboffload_collfrag_init(coll_fragment);

    coll_fragment->mq_index = COLL_MQ;

    /* set pointers for (coll frag) <-> (coll full request) */
    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);

    return OMPI_SUCCESS;
}

/************************************************************************
 ************************ New style Barrier *****************************
 ***********************************************************************/

static int mca_bcol_iboffload_new_style_barrier_progress(
                bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
                 (mca_bcol_iboffload_collreq_t *)
                                   input_args->bcol_opaque_data;

    if (BCOL_IS_COMPLETED(coll_request)) {
        coll_request->user_handle_freed = true;
        if (COLLREQ_IS_DONE(coll_request)) {
            IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
            RELEASE_COLLREQ(coll_request);
        }

        IBOFFLOAD_VERBOSE(10, ("Barrier already done.\n"));
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}

static int mca_bcol_iboffload_new_style_barrier_intra(
                bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int rc;
    mca_bcol_iboffload_collreq_t *coll_request;
    mca_bcol_iboffload_module_t *iboffload =
                    (mca_bcol_iboffload_module_t *) const_args->bcol_module;

    /* check for ordering */
    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);

    /*
     * recursive doubling
     */

    IBOFFLOAD_VERBOSE(10, ("Barrier starts.\n"));

    /* init the barrier collective request */
    rc = mca_bcol_iboffload_barrier_init(input_args, iboffload, NULL, &coll_request);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Got an error from mca_bcol_iboffload_barrier_init"));
        return rc;
    }

    /* start the barrier */
    rc = iboffload->barrier_algth(iboffload, coll_request);
    if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
        return BCOL_FN_NOT_STARTED;
    }

    /* done */
    return BCOL_FN_STARTED;
}

int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register iboffload Barrier.\n"));

    comm_attribs.bcoll_type = BCOL_BARRIER;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
        &comm_attribs, &inv_attribs,
        mca_bcol_iboffload_new_style_barrier_intra,
        mca_bcol_iboffload_new_style_barrier_progress);

    return OMPI_SUCCESS;
}

int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register sync function\n"));

    comm_attribs.bcoll_type = BCOL_SYNC;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
        &comm_attribs, &inv_attribs,
        mca_bcol_iboffload_nb_memory_service_barrier_intra,
        mca_bcol_iboffload_new_style_barrier_progress);

    return OMPI_SUCCESS;
}
Diff not shown because the file is too large.
@ -1,606 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_BCAST_H
#define MCA_BCOL_IBOFFLOAD_BCAST_H

#include "ompi_config.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"

#include "opal/include/opal/types.h"

BEGIN_C_DECLS

int mca_bcol_iboffload_small_msg_bcast_progress(
                bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_small_msg_bcast_extra_intra(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_scatter_allgather_intra(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments,
                struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super);

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_rtr_setup(
                struct mqe_task **last_wait,
                uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    /* Wait for the RTR message over the credit QP */
    fragment = mca_bcol_iboffload_get_preposted_recv_frag(
                    iboffload, dest_rank,
                    MCA_BCOL_IBOFFLOAD_QP_CREDIT);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_wait_task(
                    iboffload, dest_rank, 1, fragment, MCA_BCOL_IBOFFLOAD_QP_CREDIT,
                    iboffload->endpoints[dest_rank]->qps[MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF].qp->lcl_qp);
    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_small_buff_setup(
                struct mqe_task **last_send,
                size_t len, uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    mca_bcol_iboffload_collreq_t *coll_request =
                                  coll_fragment->coll_full_req;

    IBOFFLOAD_VERBOSE(10, ("Get the ml frag that I will send: dest rank %d, len %d, lkey %d",
                           dest_rank, len, iboffload->rdma_block.ib_info.lkey));

    fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
                    coll_request->qp_index, len, 0,
                    SBUF, /* this could be problematic */
                    MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    IBOFFLOAD_VERBOSE(10, ("Get an rdma task for dest %d for packet size %d",
                           dest_rank, len));
    task = mca_bcol_iboffload_get_rdma_task(
                    dest_rank, 0,
                    fragment, iboffload, coll_fragment);

    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    *last_send = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_large_buff_setup(
                struct mqe_task **last_send,
                int buf_index, int offset,
                size_t len, uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    mca_bcol_iboffload_collreq_t *coll_request =
                                  coll_fragment->coll_full_req;

    fragment = mca_bcol_iboffload_get_send_frag(coll_request, dest_rank,
                    MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
                    len,
                    offset, buf_index, MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_send_task(
                    iboffload, dest_rank,
                    MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
                    fragment, coll_fragment, NO_INLINE);

    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    *last_send = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_send_rtr_setup(
                struct mqe_task **last_send,
                uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    /* Recv is ready, send the RTR message */
    fragment = mca_bcol_iboffload_get_send_frag(coll_fragment->coll_full_req,
                    dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT, 0,
                    0, RBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_send_task(iboffload, dest_rank,
                    MCA_BCOL_IBOFFLOAD_QP_CREDIT,
                    fragment, coll_fragment, INLINE);
    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get send task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    IBOFFLOAD_VERBOSE(10, ("dest_rank - %d. qp index - %d.\n",
                           dest_rank, MCA_BCOL_IBOFFLOAD_QP_CREDIT));

    *last_send = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_small_preposted_buff_setup(
                struct mqe_task **last_wait,
                size_t len, uint32_t dest_rank,
                int qp_index,
                int nwaits,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank));

    fragment = mca_bcol_iboffload_get_preposted_recv_frag(
                    iboffload, dest_rank,
                    qp_index);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, nwaits,
                    fragment, qp_index, NULL);
    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    *last_wait = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_small_buff_setup(
                struct mqe_task **last_wait,
                size_t len, uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    mca_bcol_iboffload_collreq_t *coll_request =
                                  coll_fragment->coll_full_req;

    IBOFFLOAD_VERBOSE(10, ("Get preposted recv from rank %d", dest_rank));

    fragment = mca_bcol_iboffload_get_preposted_recv_frag(
                    iboffload, dest_rank,
                    coll_request->qp_index);
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
                    fragment, coll_request->qp_index, NULL);
    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    *last_wait = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_recv_large_buff_setup(
                struct mqe_task **last_wait,
                int buf_index, int offset,
                size_t len, uint32_t dest_rank,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *coll_fragment)
{
    int num_preposted;

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *fragment;

    mca_bcol_iboffload_collreq_t *coll_request = coll_fragment->coll_full_req;

    /* Post message to the recv queue for large messages */
    fragment = mca_bcol_iboffload_get_ml_frag(
                    iboffload, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, len,
                    coll_request->buffer_info[buf_index].iboffload_reg->mr->lkey,
                    (uint64_t)((unsigned char *)coll_request->buffer_info[buf_index].buf + offset));
    if (OPAL_UNLIKELY(NULL == fragment)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get recv frag.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    num_preposted = mca_bcol_iboffload_prepost_ml_recv_frag(
                    MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
                    dest_rank, fragment, iboffload);
    if (0 >= num_preposted) {
        IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
                         "return code - %d; dest_rank - %d",
                         num_preposted, dest_rank));

        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    task = mca_bcol_iboffload_get_wait_task(iboffload, dest_rank, 1,
                    fragment, MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF, NULL);
    if (OPAL_UNLIKELY(NULL == task)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get wait task.\n"));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    *last_wait = &task->element;

    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);
    MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(coll_fragment->tail_next, task);

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__
int bcol_iboffload_binomial_root_to_src(int group_root, int my_rank,
                int pow2_size, int group_size, int *distance)
{
    int root, relative_rank, src,
        pow2_distance = 0, i;

    if (group_root < pow2_size) {
        root = group_root;
    } else {
        /* the source of the data is an extra node;
           the real root is represented by some rank from
           the pow2 group */
        root = group_root - pow2_size;
        /* shortcut for the case when my rank is root for the group */
        if (my_rank == root) {
            *distance = -1;
            return group_root;
        }
    }

    relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size :
                                           my_rank - root;

    for (i = 1; i < pow2_size; i <<= 1, pow2_distance++) {
        if (relative_rank & i) {
            src = my_rank ^ i;
            if (src >= pow2_size)
                src -= pow2_size;

            *distance = pow2_distance;
            IBOFFLOAD_VERBOSE(10, ("AAAAA d %d rel %d it %d root %d my %d", *distance, relative_rank, i, root, my_rank));
            return src;
        }
    }

    /* error case */
    *distance = -1;
    return -1;
}

static inline void bcol_iboffload_setup_binomial_connection(mca_bcol_iboffload_module_t *iboffload)
{
    netpatterns_pair_exchange_node_t *my_exchange_node =
                                      &iboffload->recursive_doubling_tree;

    int i, n_exchanges = my_exchange_node->n_exchanges,
        *exchanges = my_exchange_node->rank_exchanges,
        n_extra_src = my_exchange_node->n_extra_sources,
        my_rank = iboffload->ibnet->super.my_index,
        rank_extra_src = my_exchange_node->rank_extra_source;

    mca_bcol_iboffload_endpoint_t *ep;

    IBOFFLOAD_VERBOSE(10, ("Open connections.\n"));

    if (0 < n_extra_src) {
        ep = iboffload->endpoints[rank_extra_src];
        while (OMPI_SUCCESS !=
                        check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }

#if OPAL_ENABLE_DEBUG
        {
            int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
            for (qp_index = 0; qp_index < num_qps; ++qp_index) {
                assert(NULL != ep->qps[qp_index].qp->lcl_qp);
                IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
                                       ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
            }
        }
#endif

        /* Connect to all extra nodes */
        if (EXTRA_NODE == my_exchange_node->node_type) {
            for (i = iboffload->power_of_2_ranks;
                      i < iboffload->num_endpoints; ++i) {
                if (i != my_rank) {
                    ep = iboffload->endpoints[i];

                    IBOFFLOAD_VERBOSE(10, ("subgroup rank %d: Connect to rank %d.\n", my_rank, i));

                    while (OMPI_SUCCESS !=
                                    check_endpoint_state(ep, NULL, NULL)) {
                        opal_progress();
                    }

#if OPAL_ENABLE_DEBUG
                    {
                        int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
                        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
                            assert(NULL != ep->qps[qp_index].qp->lcl_qp);
                            IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
                                                   ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
                        }
                    }
#endif
                }
            }
        }
    }

    for (i = 0; i < n_exchanges; ++i) {
        ep = iboffload->endpoints[exchanges[i]];

        while (OMPI_SUCCESS !=
                        check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }

#if OPAL_ENABLE_DEBUG
        {
            int qp_index, num_qps = mca_bcol_iboffload_component.num_qps;
            for (qp_index = 0; qp_index < num_qps; ++qp_index) {
                assert(NULL != ep->qps[qp_index].qp->lcl_qp);
                IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, QP index - %d: qp num - %x.",
                                       ep, qp_index, ep->qps[qp_index].qp->lcl_qp->qp_num));
            }
        }
#endif
    }
    /* set the connection status to connected */
    iboffload->connection_status[RECURSIVE_DOUBLING_TREE_BCAST] = true;
}

static inline __opal_attribute_always_inline__
int bcol_iboffload_bcast_binomial_gather(mca_bcol_iboffload_module_t *iboffload_module,
                struct mqe_task **last_send, struct mqe_task **last_wait,
                mca_bcol_iboffload_collfrag_t *coll_fragment,
                int count, int base_block_size, int radix_mask_pow)
{
    int rc;
    int i;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    int delta, rdelta;

    IBOFFLOAD_VERBOSE(10, ("bcol_iboffload_bcast_binomial_gather %d %d",
                           radix_mask_pow, my_group_index));

    /* we assume that iteration #iteration was already completed with the probe */
    for (i = 0; i < iboffload_module->power_of_2; i++) {
        int pow2 = 1 << i;
        int peer_index = my_group_index ^ pow2;
        int slen, rlen,
            send_offset,
            recv_offset;

        if (i > radix_mask_pow) {
            slen = rlen = pow2 * base_block_size;
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));

            rdelta = count - recv_offset;
            if (rdelta > 0) {
                IBOFFLOAD_VERBOSE(10, ("Recv1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
                                       pow2, 1 << iboffload_module->power_of_2,
                                       recv_offset, rlen, peer_index));

                rc = mca_bcol_iboffload_send_rtr_setup(last_send,
                                peer_index, iboffload_module,
                                coll_fragment);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
                    return OMPI_ERROR;
                }
            }

            delta = count - send_offset;
            if (delta > 0) {
                if (delta < slen) {
                    /* recv the tail */
                    slen = delta;
                }

                IBOFFLOAD_VERBOSE(10, ("Send1 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
                                       pow2, 1 << iboffload_module->power_of_2,
                                       send_offset, slen, peer_index));
                rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
                    return OMPI_ERROR;
                }

                rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
                                iboffload_module, coll_fragment);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
                    return OMPI_ERROR;
                }
            }

            if (rdelta > 0) {
                if (rdelta < rlen) {
                    /* recv the tail */
                    rlen = rdelta;
                }

                rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
                                SBUF, recv_offset, rlen, peer_index,
                                iboffload_module, coll_fragment);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
                    return OMPI_ERROR;
                }
            }

        } else if (i == radix_mask_pow) {
            /* only receive data */
            rlen = pow2 * base_block_size;
            recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
            delta = count - recv_offset;
            if (0 >= delta) {
                /* we have nothing to send, skip the iteration */
                continue;
            }
            if (delta < rlen) {
                /* recv the tail */
                rlen = delta;
            }
            /* receive data from the peer */
            IBOFFLOAD_VERBOSE(10, ("Recv2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
                                   pow2,
                                   1 << iboffload_module->power_of_2,
                                   recv_offset,
                                   rlen, peer_index));
            rc = mca_bcol_iboffload_send_rtr_setup(last_send,
                            peer_index, iboffload_module,
                            coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
                return OMPI_ERROR;
            }

            rc = mca_bcol_iboffload_recv_large_buff_setup(last_wait,
                            SBUF, recv_offset, rlen, peer_index,
                            iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
                return OMPI_ERROR;
            }
        } else if (i < radix_mask_pow) {
            /* Only send data */
            slen = pow2 * base_block_size;
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            delta = count - send_offset;
            if (0 >= delta) {
                /* we have nothing to send, skip the iteration */
                continue;
            }

            if (delta < slen) {
                slen = delta;
            }

            IBOFFLOAD_VERBOSE(10, ("Send2 [ pow2 %d, radix %d ] offset %d , len %d , dest %d",
                                   pow2,
                                   1 << iboffload_module->power_of_2,
                                   send_offset,
                                   slen,
                                   peer_index));

            rc = mca_bcol_iboffload_recv_rtr_setup(last_wait, peer_index, iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
                return OMPI_ERROR;
            }

            rc = mca_bcol_iboffload_send_large_buff_setup(last_send, SBUF, send_offset, slen, peer_index,
                            iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
                return OMPI_ERROR;
            }
        }
    }

    return OMPI_SUCCESS;
}

END_C_DECLS

#endif
@ -1,51 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <string.h>

#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_collfrag.h"

static void
collfrag_constructor(struct mca_bcol_iboffload_collfrag_t *collfrag)
{
    collfrag->n_sends = 0;
    collfrag->n_sends_completed = 0;

    memset(collfrag->pre_posted_recvs, 0,
           sizeof(struct mca_bcol_iboffload_task_t *) * MAX_MQE_TASKS);

    collfrag->signal_task_wr_id = (uint64_t) 0;
    collfrag->complete = false;

    collfrag->seq_n = -1;
    collfrag->coll_full_req = NULL;

    collfrag->unpack_size = 0;

    collfrag->tasks_posted = 0;
    collfrag->to_post = NULL;
    collfrag->task_next = NULL;
    collfrag->tasks_to_release = NULL;

    collfrag->in_pending_list = false;
}

static void
collfrag_destruct(struct mca_bcol_iboffload_collfrag_t *collfrag)
{
}

OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collfrag_t,
                   ompi_free_list_item_t,
                   collfrag_constructor,
                   collfrag_destruct);
@ -1,144 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_COLLFRAG_H
#define MCA_BCOL_IBOFFLOAD_COLLFRAG_H

#include "ompi_config.h"

#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>

#include "bcol_iboffload.h"

#include "opal/class/ompi_free_list.h"

BEGIN_C_DECLS

#define MAX_MQE_TASKS 128 /* Pasha - do we want to make it dynamic? */

struct mca_bcol_iboffload_task_t;
struct mca_bcol_iboffload_collreq_t;

/* collective fragment descriptor */
struct mca_bcol_iboffload_collfrag_t {
    ompi_free_list_item_t super;

    /* number of asynchronous sends scheduled */
    uint32_t n_sends;

    /* number of sends completed */
    uint32_t n_sends_completed;

    /* Algorithm ID that was used for this fragment */
    int32_t alg;

    /* pre-posted receive sources */
    struct mca_bcol_iboffload_task_t *pre_posted_recvs[MAX_MQE_TASKS];

    /* cache here a pointer to the signaled task */
    uint64_t signal_task_wr_id;

    /* mwr completion from the mcq */
    volatile bool complete;

    /* sequence number - we use it for
       correct ordering of resource release */
    uint32_t seq_n;

    /* pointer to the full collective request descriptor */
    struct mca_bcol_iboffload_collreq_t *coll_full_req;

    size_t unpack_size;

    bool in_pending_list;

    /* Num of posted tasks */
    int tasks_posted;

    /* Pointer to the head of the list of not-yet-posted elements */
    struct mqe_task *to_post;

    /* Pointer to tail next */
    struct mqe_task **tail_next;

    /* List of all tasks of this coll frag */
    struct mca_bcol_iboffload_task_t *tasks_to_release;

    /* Pointer to the next element in the all-tasks list */
    struct mca_bcol_iboffload_task_t **task_next;

    /* Num of needed mq credits */
    int mq_credits;

    /* MQ index used for this frag */
    int mq_index;

    /*
     * Last wait sequence number; zero means
     * there isn't any wait in the coll request
     */
    int32_t last_wait_num;
    /* fragment descriptor for non contiguous data */
    bcol_fragment_descriptor_t *bcol_frag_info;
    /* frag-len of ml buffer */
    int frag_len;
};
typedef struct mca_bcol_iboffload_collfrag_t mca_bcol_iboffload_collfrag_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collfrag_t);

static inline __opal_attribute_always_inline__
void mca_bcol_iboffload_collfrag_init(
                mca_bcol_iboffload_collfrag_t *cf)
{
    /* init the request */
    cf->n_sends = 0;
    cf->complete = false;
    cf->n_sends_completed = 0;
    cf->alg = -1;
    cf->in_pending_list = false;
    cf->tail_next = NULL;
    cf->tasks_posted = 0;
    cf->to_post = NULL;
    cf->mq_credits = 0;
    cf->mq_index = 0;
    cf->tasks_to_release = NULL;
    cf->task_next = &cf->tasks_to_release;
    cf->last_wait_num = 0;
}

static inline __opal_attribute_always_inline__
struct mca_bcol_iboffload_collfrag_t *
mca_bcol_iboffload_get_collfrag(void)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_collfrag_t *cf;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    /* blocking allocation for collectives fragment */
    OMPI_FREE_LIST_GET_MT(&cm->collfrags_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        IBOFFLOAD_ERROR(("Failed to allocate collfrag.\n"));
        return NULL;
    }

    cf = (mca_bcol_iboffload_collfrag_t*) item;
    mca_bcol_iboffload_collfrag_init(cf);

    return cf;
}

END_C_DECLS

#endif
@ -1,50 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "bcol_iboffload_collreq.h"

static void
collreq_construct(struct mca_bcol_iboffload_collreq_t *collreq)
{
    int i;
    collreq->n_fragments = 0;
    collreq->n_frag_mpi_complete = 0;
    collreq->n_frag_net_complete = 0;
    collreq->user_handle_freed = false;

    for (i = 0; i < BCOL_IBOFFLOAD_BUFFERS; i++) {
        collreq->buffer_info[i].buf = NULL;
        collreq->buffer_info[i].offset = 0;
        collreq->buffer_info[i].iboffload_reg = NULL;
    }

    OBJ_CONSTRUCT(&collreq->work_requests, opal_list_t);
    OBJ_CONSTRUCT(&collreq->first_collfrag, mca_bcol_iboffload_collfrag_t);

    OBJ_CONSTRUCT(&collreq->send_convertor, opal_convertor_t);
    OBJ_CONSTRUCT(&collreq->recv_convertor, opal_convertor_t);
}

static void
collreq_destruct(struct mca_bcol_iboffload_collreq_t *collreq)
{
    OBJ_DESTRUCT(&collreq->work_requests);
    OBJ_DESTRUCT(&collreq->first_collfrag);

    OBJ_DESTRUCT(&collreq->send_convertor);
    OBJ_DESTRUCT(&collreq->recv_convertor);
}

OBJ_CLASS_INSTANCE(mca_bcol_iboffload_collreq_t,
                   ompi_request_t,
                   collreq_construct,
                   collreq_destruct);
@ -1,273 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_COLLREQ_H
#define MCA_BCOL_IBOFFLOAD_COLLREQ_H

#include "ompi_config.h"

#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>

#include "opal/class/ompi_free_list.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_collfrag.h"

#define SBUF 0
#define RBUF 1

#define BCOL_IBOFFLOAD_BUFFERS 2

BEGIN_C_DECLS

struct mca_bcol_iboffload_reg_t;

/*
 * collective progress function
 */
typedef int (*collective_message_progress_function)(
        struct mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *full_message_descriptor);
/*
 * callback function to be called after the collective work request
 * completes. This is invoked in user-space, and is typically where
 * data may be copied out of library buffers, or when any other user-
 * level protocol may be completed
 *
 * input:
 *   callback data: typically, this may be the work request just finished
 */
typedef int (*collective_message_completion_callback_function)(
        void *callback_data);

struct mca_bcol_iboffload_buff_info {
    void *buf;
    size_t offset;
    uint32_t lkey;
    struct mca_bcol_iboffload_reg_t *iboffload_reg;
};
typedef struct mca_bcol_iboffload_buff_info mca_bcol_iboffload_buff_info;

/*
 * Collective message descriptor
 * the mca_bcol_iboffload_message_desc_t was replaced with mca_bcol_iboffload_collreq_t
 * *************************************************************************************************
 *
 * Brief description of iboffload collective request dependencies:
 *
 *  mca_bcol_iboffload_collreq_t  <----<< Full coll request
 *   |
 *   --(0)-- mca_bcol_iboffload_collfrag_t  <----<< Fragment of coll request ( for example
 *   |        |                                     a 10MB Bcast may be split into 2MB fragments )
 *   |        |
 *   |        --(0)-- mca_bcol_iboffload_task_t---mqe_task
 *   |        |        |
 *   |        |        ---mca_bcol_iboffload_frag_t---ibv_sge
 *   |        --(1)-- mca_bcol_iboffload_task_t---mqe_task
 *   |        |        |
 *   |        |        ---mca_bcol_iboffload_frag_t---ibv_sge
 *   |        ..(M)..
 *   |
 *   --(1)-- mca_bcol_iboffload_collfrag_t
 *   |
 *   ..(N)..
 *
 * *************************************************************************************************
 */

struct mca_bcol_iboffload_collreq_t {
    ompi_request_t super;

    /* op type */
    struct ompi_op_t *op;

    /* Sometimes the operation that should be performed
       by the IB is different than the mpi_op and is then set
       by the pack_data_for_calc function */
    enum ibv_m_wr_calc_op actual_ib_op;

    /* Sometimes the data type that should be used by the IB
       to perform the calc is different from the MPI dtype,
       and is then set by the pack_data_for_calc function */
    enum ibv_m_wr_data_type actual_ib_dtype;

    /* data type */
    struct ompi_datatype_t *dtype;

    /* convertor for send operation */
    opal_convertor_t send_conv;

    /* convertor for recv operation */
    opal_convertor_t recv_conv;

    /*
     * count (in data type units)
     */
    uint64_t count;

    /*
     * root of collective operation
     */
    int root;

    /* number of message fragments */
    int n_fragments;

    /* number of fragments sent - all resources for a fragment are allocated
     * or none at all are
     */
    int n_frags_sent;

    /* number of fragments completed from the MPI perspective */
    int n_frag_mpi_complete;

    /* number of fragments completed from a network perspective */
    int n_frag_net_complete;

    /* collective free and may be released - message complete from the
    ** MPI perspective, the network perspective, and the user is done
    ** with the message handle */
    volatile bool user_handle_freed;

    /* list of collective fragments - only 1 for now */
    opal_list_t work_requests;

    /* message progress function */
    collective_message_progress_function progress_fn;

    /* work request completion callback function */
    collective_message_completion_callback_function completion_cb_fn;

    /* index of the qp whose buffers are long enough for this collective */
    int qp_index;

    bool if_bcol_last;

    /* The flag is used by the last bcol to indicate whether the calculation should be done by the CPU */
    bool do_calc_in_cpu;

    /* in the Allreduce case, if (true == do_calc_in_cpu) =>
       the final result will be calculated on the local CPU */
    uint64_t l_operand;
    uint64_t r_operand;

    /* caching ML-rdma buffer descriptor */
    mca_bcol_iboffload_rdma_buffer_desc_t *ml_rdma_desc;

    /* ML buffer index code */
    int ml_buffer_index;

    /* In the current implementation the collrequest is connected to a single
       iboffload module */
    struct mca_bcol_iboffload_module_t *module;

    mca_bcol_iboffload_collfrag_t first_collfrag;

    /* Send/recv buffs info - user buffer registration if needed etc. */
    mca_bcol_iboffload_buff_info buffer_info[BCOL_IBOFFLOAD_BUFFERS];

    /* My binomial tree children in this collective */
    int *bi_nominal_tree_children;

    /* Convertors for send/recv if needed */
    opal_convertor_t send_convertor;
    opal_convertor_t recv_convertor;

    /* Order info from upper layer */
    mca_bcol_base_order_info_t *order_info;
};
typedef struct mca_bcol_iboffload_collreq_t mca_bcol_iboffload_collreq_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collreq_t);

#define COLLREQ_IS_DONE(cr) (cr->user_handle_freed && \
        (cr->n_frag_mpi_complete == cr->n_fragments) && \
        (cr->n_frag_net_complete == cr->n_fragments))

#define RELEASE_COLLREQ(cr) \
do { \
    (cr)->user_handle_freed = false; \
    OMPI_FREE_LIST_RETURN_MT(&mca_bcol_iboffload_component.collreqs_free, \
            (ompi_free_list_item_t *) (cr)); \
} while (0)

static inline __opal_attribute_always_inline__
int mca_bcol_iboffload_free_resources_and_move_to_pending(
                mca_bcol_iboffload_collfrag_t *coll_fragment,
                mca_bcol_iboffload_module_t *iboffload)
{
    int rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_fragment,
                                                           iboffload->device->frags_free);

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    IBOFFLOAD_VERBOSE(10, ("iboffload - %p, coll_fragment - %p, "
                           "coll frag in_pending_list ? - %d, pending_list size - %d.\n",
                           iboffload, coll_fragment, coll_fragment->in_pending_list,
                           opal_list_get_size(&iboffload->collfrag_pending)));

    BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, coll_fragment->mq_index, coll_fragment->mq_credits);

    /* Remove the coll frag from the coll request opal_list */
    opal_list_remove_item(&coll_fragment->coll_full_req->work_requests,
                          (opal_list_item_t *) coll_fragment);

    if (false == coll_fragment->in_pending_list) {
        /* Put the collfrag on the pending list */
        coll_fragment->in_pending_list = true;
        opal_list_append(&iboffload->collfrag_pending,
                         (opal_list_item_t *) coll_fragment);
    } else {
        /* The item is already on the pending list =>
           insert it at the head so as not to break the order
           between frags on the list */
        opal_list_prepend(&iboffload->collfrag_pending,
                          (opal_list_item_t *) coll_fragment);
    }

    return OMPI_SUCCESS;
}

/* Forward declaration */
struct mca_bcol_iboffload_reg_t;
static inline __opal_attribute_always_inline__
int mca_bcol_iboffload_prepare_buffer(
                void *buffer,
                size_t size,
                struct mca_bcol_iboffload_reg_t **registration_handler,
                mca_bcol_iboffload_module_t *iboffload)
{
    int rc;
    mca_mpool_base_registration_t *reg = NULL;

    assert(size > 0);
    rc = iboffload->device->mpool->mpool_register(
            iboffload->device->mpool,
            buffer, size,
            (uint32_t) 0 /* flags */,
            &reg);

    *registration_handler =
        (struct mca_bcol_iboffload_reg_t *) reg;

    return rc;
}

int mca_bcol_iboffload_coll_req_implement(
        mca_bcol_iboffload_module_t *iboffload,
        mca_bcol_iboffload_collreq_t *coll_request);

END_C_DECLS

#endif
Diff not shown because the file is too large.
@ -1,73 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_DEVICE_H
#define MCA_BCOL_IBOFFLOAD_DEVICE_H

#include "ompi_config.h"

#include <infiniband/mqe.h>
#include <infiniband/mverbs.h>

#include <infiniband/verbs.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"

#define BCOL_IBOFFLOAD_DUMMY_MEM_SIZE 1

BEGIN_C_DECLS

/* Device OBJ */
struct mca_bcol_iboffload_device_t {
    opal_list_item_t super;

    bool activated;

    struct ompi_common_ofacm_base_dev_desc_t dev;
    struct ibv_pd *ib_pd;
    struct ibv_device_attr ib_dev_attr;

    int num_act_ports;

    struct mca_bcol_iboffload_port_t *ports;
    struct ibv_cq *ib_cq;

    /* CQ for the MQs of all iboffload modules on this device */
    struct ibv_cq *ib_mq_cq;

    /* The free list of registered buffers;
     * since the registration depends on the PD, this is
     * the most reasonable place to keep the frags */
    ompi_free_list_t *frags_free;
    mca_mpool_base_module_t *mpool;

    /* network context */
    bcol_base_network_context_t *net_context;

    /* We keep dummy frags for all QPs on each device;
       some QPs may not need one, but we hand a dummy to each
       of them anyway. All dummies point to the same byte of memory. */
    mca_bcol_iboffload_frag_t dummy_frags[MCA_BCOL_IBOFFLOAD_QP_LAST];

    /* Registered memory for the dummy frags */
    char dummy_mem[BCOL_IBOFFLOAD_DUMMY_MEM_SIZE];

    /* Registration info of the dummy memory */
    mca_bcol_iboffload_reg_t dummy_reg;
};
typedef struct mca_bcol_iboffload_device_t mca_bcol_iboffload_device_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_device_t);

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_DEVICE_H */
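The dummy-frag comment in the struct above implies that device setup points every per-QP dummy at the single registered byte, dummy_mem. The initialization code itself is not part of this hunk; a sketch of what it plausibly looks like, under that assumption:

/* Assumption: not the original init code. Wire each QP's dummy frag
 * to the one-byte dummy_mem region registered in dummy_reg. */
int qp;
for (qp = 0; qp < MCA_BCOL_IBOFFLOAD_QP_LAST; ++qp) {
    device->dummy_frags[qp].sg_entry.addr   = (uint64_t) (uintptr_t) device->dummy_mem;
    device->dummy_frags[qp].sg_entry.length = BCOL_IBOFFLOAD_DUMMY_MEM_SIZE;
    device->dummy_frags[qp].sg_entry.lkey   = device->dummy_reg.mr->lkey;
}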
@ -1,373 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <infiniband/mverbs.h>

#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/common/ofacm/connect.h"

#include "opal/threads/mutex.h"
#include "opal/class/opal_object.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_endpoint.h"

static void mca_bcol_iboffload_endpoint_construct(mca_bcol_iboffload_endpoint_t *ep)
{
    ep->iboffload_module = NULL;
    ep->ibnet_proc = NULL;

    ep->qps = (mca_bcol_iboffload_endpoint_qp_t *)
        calloc(mca_bcol_iboffload_component.num_qps,
               sizeof(mca_bcol_iboffload_endpoint_qp_t));

    ep->index = 0;
    OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t);

    memset(ep->recv_cq, 0, IBOFFLOAD_CQ_LAST * sizeof(ep->recv_cq[0]));
    memset(&ep->qp_config, 0, sizeof(ompi_common_ofacm_base_qp_config_t));

    ep->cpc_context = NULL;

    memset(&ep->remote_zero_rdma_addr, 0, sizeof(mca_bcol_iboffload_rdma_info_t));
    memset(&ep->remote_rdma_block, 0, sizeof(mca_bcol_iboffload_rem_rdma_block_t));

    ep->need_toset_remote_rdma_info = false;
}

static void mca_bcol_iboffload_endpoint_destruct(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index, num_qps, i;
    ompi_free_list_item_t *item;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    num_qps = cm->num_qps;

    IBOFFLOAD_VERBOSE(10, ("Destruct: ep - %p, ep->index - %d", ep, ep->index));

    if (NULL != ep->qps) {
        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
            do {
                item = (ompi_free_list_item_t *)
                    opal_list_remove_first(&ep->qps[qp_index].preposted_frags);
                if (OPAL_LIKELY(NULL != item)) {
                    OMPI_FREE_LIST_RETURN_MT(&ep->device->frags_free[qp_index], item);
                }
            } while (NULL != item);

            OBJ_DESTRUCT(&ep->qps[qp_index].preposted_frags);
        }

        free(ep->qps);
    }

    OBJ_DESTRUCT(&ep->endpoint_lock);
    OBJ_DESTRUCT(&ep->pending_frags);

    /* If the CPC has an endpoint_finalize function, call it */
    if (NULL != ep->endpoint_cpc->cbm_endpoint_finalize) {
        ep->endpoint_cpc->cbm_endpoint_finalize(ep->cpc_context);
    }

    for (i = 0; i < IBOFFLOAD_CQ_LAST; i++) {
        if (NULL != ep->recv_cq[i]) {
            if (ibv_destroy_cq(ep->recv_cq[i])) {
                IBOFFLOAD_ERROR(("Endpoint %p"
                                 ", failed to destroy CQ, errno says %s",
                                 ep, strerror(errno)));
            }
        }
    }
}

OBJ_CLASS_INSTANCE(mca_bcol_iboffload_endpoint_t,
                   opal_list_item_t,
                   mca_bcol_iboffload_endpoint_construct,
                   mca_bcol_iboffload_endpoint_destruct);

/* Pasha: Add some error message here */

/*
 * Called when the CPC has established a connection on an endpoint
 */
static void mca_bcol_iboffload_endpoint_invoke_error(void *context)
{
    mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;
    IBOFFLOAD_ERROR(("Getting error on endpoint - %p!", endpoint));
}

/* Pasha: Need to add more logic here */
static void mca_bcol_iboffload_endpoint_cpc_complete(void *context)
{
    mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;

    IBOFFLOAD_VERBOSE(10, ("Endpoint - %p for comm rank %d: CPC complete.\n",
                           endpoint, endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));

    if (OMPI_SUCCESS !=
            mca_bcol_iboffload_exchange_rem_addr(endpoint)) {
        IBOFFLOAD_ERROR(("endpoint - %p, "
                         "remote addr exchange error.\n", endpoint));
    }
    /* The connection is correctly set up. Now we can decrease the
       event trigger. */
    opal_progress_event_users_decrement();
}

/* Vasily: Need to add more logic here */
int mca_bcol_iboffload_endpoint_post_recvs(void *context)
{
    int qp_index, rc, num_qps;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    mca_bcol_iboffload_endpoint_t *endpoint =
        (mca_bcol_iboffload_endpoint_t *) context;

    IBOFFLOAD_VERBOSE(10, ("endpoint - %p, post of %d recvs!",
                           endpoint, cm->qp_infos[0].rd_num));
    /* TODO Pasha - fix later */
    num_qps = cm->num_qps;
    for (qp_index = 0; qp_index < num_qps; ++qp_index) {
        rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
                                             cm->qp_infos[qp_index].rd_num);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            /* Pasha: Need to add more failure logic */
            IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
                             "on qp index %d, return code - %d",
                             qp_index, rc));

            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}

/* This function goes over each ibnet proc and creates an endpoint for it */
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
                                         mca_bcol_iboffload_module_t *module) {
    uint32_t i;
    mca_bcol_iboffload_endpoint_t *ep;

    if (NULL == cgroup || NULL == module) {
        IBOFFLOAD_ERROR(("Bad parameters for the create endpoints function."));
        return OMPI_ERROR;
    }

    module->num_endpoints = cgroup->num_procs;
    module->endpoints = (mca_bcol_iboffload_endpoint_t **)
        calloc(module->num_endpoints,
               sizeof(mca_bcol_iboffload_endpoint_t *));
    if (NULL == module->endpoints) {
        IBOFFLOAD_ERROR(("Memory allocation error for the endpoints array"
                         ", errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }

    IBOFFLOAD_VERBOSE(10, ("iboffload - %p, num of endpoints - %d.\n",
                           module, module->num_endpoints));
    /* Ishai: No need to open so many endpoints. We are not talking with all procs */
    for (i = 0; i < cgroup->num_procs; i++) {
        ep = OBJ_NEW(mca_bcol_iboffload_endpoint_t);
        /* check the qp memory allocation */
        if (NULL == ep->qps) {
            IBOFFLOAD_ERROR(("Failed to allocate memory for qps"));
            return OMPI_ERROR;
        }
        /* init the new endpoint */
        ep->index = i;
        ep->iboffload_module = module;
        /* Cache the device for destruction - the iboffload module may not exist by then */
        ep->device = ep->iboffload_module->device;
        ep->ibnet_proc = (mca_sbgp_ibnet_proc_t *)
            opal_pointer_array_get_item(cgroup->ibnet_procs, i);
        if (NULL == ep->ibnet_proc) {
            IBOFFLOAD_ERROR(("Failed to get the proc pointer for index %d", i));
            return OMPI_ERROR;
        }

        if (OMPI_SUCCESS !=
                mca_bcol_iboffload_endpoint_init(ep)) {
            IBOFFLOAD_ERROR(("Failed to init endpoint - %p", ep));
            return OMPI_ERROR;
        }

        IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, ep index - %d, iboffload - %p, "
                               "cpc context - %p.\n", ep, ep->index,
                               ep->iboffload_module, ep->cpc_context));

        /* Add the new endpoint to the array of endpoints */
        module->endpoints[i] = ep;
    }

    /* Pasha: Need to add better clean-up here */
    return OMPI_SUCCESS;
}

static int config_qps(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index;
    int ret = OMPI_SUCCESS;

    ompi_common_ofacm_base_qp_config_t *qp_config = &ep->qp_config;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    qp_config->num_srqs = 0;
    qp_config->srq_num = NULL;

    qp_config->num_qps = cm->num_qps;

    qp_config->init_attr = (struct ibv_qp_init_attr *)
        calloc(qp_config->num_qps, sizeof(struct ibv_qp_init_attr));
    if (NULL == qp_config->init_attr) {
        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp init attributes"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;

        goto config_qps_exit;
    }

    qp_config->attr = (struct ibv_qp_attr *)
        calloc(qp_config->num_qps, sizeof(struct ibv_qp_attr));
    if (OPAL_UNLIKELY(NULL == qp_config->attr)) {
        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp attributes"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;

        goto config_qps_exit;
    }

    /* we must specify that the qps are special */
    qp_config->init_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->init_attr_mask)) {
        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp mask."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;

        goto config_qps_exit;
    }

    /* qp_config->rtr_attr_mask = qp_config->rts_attr_mask = NULL; */

    qp_config->rtr_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->rtr_attr_mask)) {
        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp rtr attributes mask."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;

        goto config_qps_exit;
    }

    qp_config->rts_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->rts_attr_mask)) {
        IBOFFLOAD_ERROR(("Failed to allocate memory for the qp rts attributes mask."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;

        goto config_qps_exit;
    }

    for (qp_index = 0; qp_index < qp_config->num_qps; ++qp_index) {
        mca_bcol_iboffload_config_qps_fn_t config_qp =
            cm->qp_infos[qp_index].config_qp;

        if (NULL != config_qp) {
            config_qp(qp_index, ep, qp_config);
        }
    }

config_qps_exit:
    return ret;
}

/* This function is called only for endpoints
 * in the MCA_COMMON_OFACM_USER_CUSTOM state;
 * OPAL_THREAD_LOCK must be held before calling it */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index, cq_index, num_qps;
    ompi_common_ofacm_base_module_t *cpc;

    mca_bcol_iboffload_device_t *device = ep->iboffload_module->device;

    mca_sbgp_ibnet_connection_group_info_t *cgroup =
        &ep->iboffload_module->ibnet->cgroups[ep->iboffload_module->cgroup_index];

    for (cq_index = 0; cq_index < IBOFFLOAD_CQ_LAST; cq_index++) {
        if (OMPI_SUCCESS !=
                mca_bcol_iboffload_adjust_cq(device, &ep->recv_cq[cq_index])) {
            IBOFFLOAD_ERROR(("Error creating CQ for %s, errno says %s",
                             ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
            /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
            return OMPI_ERROR;
        }
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != config_qps(ep))) {
        IBOFFLOAD_ERROR(("Error configuring QPs for endpoint %p, errno says %s",
                         ep, strerror(errno)));
        return OMPI_ERROR;
    }

    /* Adding one more redirection in the critical path here. Need to think
     * about the best way to prevent it */

    IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, rem port - %d", ep,
                           ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].id));

    cpc = ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].local_cpc;
    ep->endpoint_cpc = cpc; /* caching a pointer to the cpc */

    if (NULL != cpc->cbm_endpoint_init) {
        ep->cpc_context = cpc->cbm_endpoint_init(
                ep->ibnet_proc->ompi_proc,
                &ep->qp_config,
                device->ib_pd,
                ep->iboffload_module->subnet_id,
                ep->iboffload_module->ibnet->group_id,
                ep->iboffload_module->lid,
                /* Remote lid of the target module */
                ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].lid,
                ep->index,   /* user context index */
                (void *) ep, /* user context */
                cpc,
                mca_bcol_iboffload_endpoint_cpc_complete,
                mca_bcol_iboffload_endpoint_invoke_error,
                mca_bcol_iboffload_endpoint_post_recvs);
        if (OPAL_UNLIKELY(NULL == ep->cpc_context)) {
            IBOFFLOAD_ERROR(("Endpoint - %p, failed to init context", ep));
            /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
            return OMPI_ERROR;
        }

        /* Updating the remote port info */
        num_qps = mca_bcol_iboffload_component.num_qps;

        ep->remote_info = &ep->cpc_context->remote_info;
        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
            ep->qps[qp_index].qp = &ep->cpc_context->qps[qp_index];
        }
    }

    return OMPI_SUCCESS;
}
@ -1,328 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_ENDPOINT_H
#define MCA_BCOL_IBOFFLOAD_ENDPOINT_H

#include "ompi_config.h"
#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"

#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"

#define BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) (ep)->ibnet_proc->use_port[(cgroup)->index]
#define BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep) (BCOL_IBOFFLOAD_ENDPOINT_PORT(cgroup, ep) - 1)

BEGIN_C_DECLS

struct mca_bcol_iboffload_endpoint_qp_t {
    struct ompi_common_ofacm_base_qp_t *qp;
    size_t ib_inline_max;
    int32_t sd_wqe;              /* Number of available send wqe entries */
    int32_t rd_wqe;              /* Number of available recv wqe entries */
    opal_list_t preposted_frags; /* List of preposted frags */
    /* opal_mutex_t lock; */     /* Do I need a lock here? */
};
typedef struct mca_bcol_iboffload_endpoint_qp_t mca_bcol_iboffload_endpoint_qp_t;

enum {
    IBOFFLOAD_CQ_SMALL_MESSAGES = 0,
    IBOFFLOAD_CQ_SYNC,
    IBOFFLOAD_CQ_LARGE_MESSAGES,
    IBOFFLOAD_CQ_LAST
};

/* Endpoint object */
struct mca_bcol_iboffload_endpoint_t {
    opal_list_item_t super;

    /** BCOL module that created this connection */
    mca_bcol_iboffload_module_t *iboffload_module;

    /** proc structure corresponding to this endpoint */
    mca_sbgp_ibnet_proc_t *ibnet_proc;

    /** lock for concurrent access to endpoint state */
    opal_mutex_t endpoint_lock;

    /** Pending frag list */
    opal_list_t pending_frags;

    /** QP information */
    mca_bcol_iboffload_endpoint_qp_t *qps;

    /** endpoint index in the array */
    int32_t index;

    /** CQs for the receive queues on this endpoint */
    struct ibv_cq *recv_cq[IBOFFLOAD_CQ_LAST];

    /** QP configuration information */
    ompi_common_ofacm_base_qp_config_t qp_config;

    /** cpc context */
    ompi_common_ofacm_base_local_connection_context_t *cpc_context;

    /** cached pointer to the remote info */
    ompi_common_ofacm_base_remote_connection_context_t *remote_info;

    /** cached pointer to the cpc */
    ompi_common_ofacm_base_module_t *endpoint_cpc;

    /** This struct is used for zero-size RDMA with immediate
        in some collectives, for example in barrier. */
    mca_bcol_iboffload_rdma_info_t remote_zero_rdma_addr;
    mca_bcol_iboffload_rem_rdma_block_t remote_rdma_block;

    /** Pointer to the device - in the destruction function the
        iboffload module may not exist any more, so cache the device */
    struct mca_bcol_iboffload_device_t *device;

    bool need_toset_remote_rdma_info;

    mca_bcol_iboffload_rdma_info_t remote_rdma_info[MAX_REMOTE_RDMA_INFO];
};
typedef struct mca_bcol_iboffload_endpoint_t mca_bcol_iboffload_endpoint_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_endpoint_t);

/* Function declarations */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep);

static inline __opal_attribute_always_inline__
int check_endpoint_state(mca_bcol_iboffload_endpoint_t *ep,
                         mca_bcol_base_descriptor_t *des,
                         opal_list_t *pending_list)
{
    int rc = OMPI_ERR_RESOURCE_BUSY;

    OPAL_THREAD_LOCK(&ep->cpc_context->context_lock);
    /* Adding one more redirection in the critical path here. Need to think
     * about the best way to prevent it */
    switch (ep->cpc_context->state) {
    case MCA_COMMON_OFACM_CLOSED:
        rc = ep->endpoint_cpc->cbm_start_connect(ep->cpc_context);
        if (OMPI_SUCCESS == rc) {
            rc = OMPI_ERR_RESOURCE_BUSY;
        }
        /*
         * As long as we expect a message from the peer (in order
         * to set up the connection), let the event engine poll the
         * OOB events. Note: we increment it once per peer active
         * connection.
         */
        opal_progress_event_users_increment();
        /* fall through */
    default:
        /* opal_list_append(pending_list, (opal_list_item_t *) des); */ /* Vasily: will be uncommented later */
        break;
    case MCA_COMMON_OFACM_FAILED:
        rc = OMPI_ERR_UNREACH;
        break;
    case MCA_COMMON_OFACM_CONNECTED:
        rc = OMPI_SUCCESS;
        break;
    }

    OPAL_THREAD_UNLOCK(&ep->cpc_context->context_lock);
    return rc;
}
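check_endpoint_state() deliberately returns OMPI_ERR_RESOURCE_BUSY while a connection is being brought up; the fan-in/fan-out first-call routines later in this diff therefore spin on it:

/* Minimal caller sketch, mirroring the first-call routines below;
 * "ep" is assumed to be a valid endpoint. */
while (OMPI_SUCCESS != check_endpoint_state(ep, NULL, NULL)) {
    opal_progress();  /* let the OOB connection state machine run */
}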
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
                                         mca_bcol_iboffload_module_t *module);

int mca_bcol_iboffload_endpoint_post_recvs(void *context);

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_recv(
        mca_bcol_iboffload_endpoint_t *endpoint,
        int qp_index, int num_to_prepost)
{
    mca_bcol_iboffload_prepost_qps_fn_t prepost_recv =
        mca_bcol_iboffload_component.qp_infos[qp_index].prepost_recv;
    if (NULL != prepost_recv) {
        return prepost_recv(endpoint, qp_index, num_to_prepost);
    }

    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_ml_scatter_recv_frag(
        int qp_index, uint32_t dest_rank,
        int nitems, struct iovec *buff_iovec,
        uint32_t lkey,
        struct ibv_sge *sg_entries,
        mca_bcol_iboffload_frag_t *frag,
        mca_bcol_iboffload_module_t *iboffload)
{
    int ret, start_wr_index;
    struct ibv_recv_wr *recv_wr, *recv_bad;
    int i;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];

    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
                           (void *) endpoint, qp_index));

    /* make sure that we do not overrun the number of rd_wqe */
    if (0 >= endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
                               endpoint->qps[qp_index].rd_wqe));

        return 0;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* Calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
                           "start index of WRs - %d", (void *) endpoint,
                           qp_index, start_wr_index));

    for (i = 0; i < nitems; i++) {
        sg_entries[i].length = buff_iovec[i].iov_len;
        sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
        sg_entries[i].lkey = lkey;

        IBOFFLOAD_VERBOSE(10, ("Recv SGE list item %d, length %d, address %p",
                               i, sg_entries[i].length, sg_entries[i].addr));

        IBOFFLOAD_VERBOSE(10, ("Recv SGE list item %d, iovec length %d",
                               i, buff_iovec[i].iov_len));
    }

    recv_wr->num_sge = nitems;
    recv_wr->sg_list = sg_entries;

    /* Set the tail */
    recv_wr->next = NULL;

    /* post the list of recvs */
    ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
    if (OPAL_UNLIKELY(0 != ret)) {
        IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                         "qp_index - %d.\n",
                         ibv_get_device_name(device->dev.ib_dev),
                         strerror(errno), ret, qp_index));

        /* release the lock on the error path as well */
        OPAL_THREAD_UNLOCK(&recv_wrs->lock);
        return -1;
    }

    /* decrease the number of free recv wqe */
    --endpoint->qps[qp_index].rd_wqe;

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Return success: "
                           "endpoint %p, qp_index %d, dest_rank %d",
                           endpoint, qp_index, dest_rank));

    return 1;
}

static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_prepost_ml_recv_frag(
        int qp_index, uint32_t dest_rank,
        mca_bcol_iboffload_frag_t *frag,
        mca_bcol_iboffload_module_t *iboffload)
{
    int ret, start_wr_index;
    struct ibv_recv_wr *recv_wr, *recv_bad;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[dest_rank];

    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, qp_index %d",
                           (void *) endpoint, qp_index));

    /* make sure that we do not overrun the number of rd_wqe */
    if (0 >= endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("There are no rd_wqe - %d",
                               endpoint->qps[qp_index].rd_wqe));

        return 0;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* Calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - 1;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, "
                           "start index of WRs - %d", (void *) endpoint,
                           qp_index, start_wr_index));

    recv_wr->sg_list = &frag->sg_entry;

    /* Set the tail */
    recv_wr->next = NULL;

    /* post the list of recvs */
    ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
    if (OPAL_UNLIKELY(0 != ret)) {
        IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                         "qp_index - %d.\n",
                         ibv_get_device_name(device->dev.ib_dev),
                         strerror(errno), ret, qp_index));

        /* release the lock on the error path as well */
        OPAL_THREAD_UNLOCK(&recv_wrs->lock);
        return -1;
    }

    /* decrease the number of free recv wqe */
    --endpoint->qps[qp_index].rd_wqe;

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Return success: "
                           "endpoint %p, qp_index %d, dest_rank %d",
                           endpoint, qp_index, dest_rank));

    return 1;
}

static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t *mca_bcol_iboffload_get_preposted_recv_frag(
        mca_bcol_iboffload_module_t *iboffload,
        int source, int qp_index)
{
    mca_bcol_iboffload_frag_t *frag;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];

    frag = mca_bcol_iboffload_component.qp_infos[qp_index].get_preposted_recv(endpoint, qp_index);

    /* do we want to run prepost? */
    if (OPAL_LIKELY(NULL != frag)) {
        /* only dereference the frag after the NULL check */
        IBOFFLOAD_VERBOSE(10, ("source - %d, qp_index - %d; "
                               "allocating preposted addr %p.\n",
                               source, qp_index, (void *) frag->sg_entry.addr));
        frag->next = NULL;
    }

    return frag;
}

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_ENDPOINT_H */
@ -1,350 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"

static int mca_bcol_iboffload_fanin_leader_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0, rank,
        sbgp_size = iboffload->ibnet->super.group_size;

    struct mqe_task *last_wait = NULL;

    mca_bcol_iboffload_task_t *wait_task = NULL;
    mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));
        goto out_of_resources;
    }

    for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
        /* post wait */
        preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                iboffload, rank, coll_request->qp_index);
        if (NULL == preposted_recv_frag) {
            IBOFFLOAD_VERBOSE(10, ("Failed to get a preposted recv frag.\n"));
            goto out_of_resources;
        }

        wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1,
                preposted_recv_frag, coll_request->qp_index, NULL);
        if (NULL == wait_task) {
            IBOFFLOAD_VERBOSE(10, ("Failed to get a wait task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
    }

    /* end of list */
    *mqe_ptr_to_set = NULL;

    last_wait->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding the collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
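The tail of the routine above is the signalling idiom every progress function in these two files repeats: terminate the chain, signal only the last task, and overload its wr_id with the collfrag pointer, presumably so the completion handler can map the completion back to the collfrag (the original id is saved in signal_task_wr_id). In isolation:

*mqe_ptr_to_set = NULL;                      /* terminate the task chain */
last_wait->flags |= MQE_WR_FLAG_SIGNAL;      /* only the tail signals */
coll_fragment->signal_task_wr_id = last_wait->wr_id;      /* keep the original id */
last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;  /* completion cookie */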
static int mca_bcol_iboffload_fanin_proxy_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0;

    struct mqe_task *last_send = NULL;
    mca_bcol_iboffload_task_t *send_task = NULL;
    mca_bcol_iboffload_frag_t *send_fragment = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));
        goto out_of_resources;
    }

    /* post send */
    send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
            leader_rank, coll_request->qp_index, 0,
            0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
    if (NULL == send_fragment) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get and pack a send frag.\n"));
        goto out_of_resources;
    }

    send_task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER,
            send_fragment, coll_fragment, INLINE);
    if (NULL == send_task) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get a send task.\n"));
        goto out_of_resources;
    }

    APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);

    /* end of list */
    *mqe_ptr_to_set = NULL;
    assert(NULL != last_send);

    last_send->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_send->wr_id;
    last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding the collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}

static int mca_bcol_iboffload_fanin_init(
        bcol_function_args_t *input_args,
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t **coll_request)
{
    ompi_free_list_item_t *item = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_fanin_init"));

    OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        IBOFFLOAD_VERBOSE(10, ("Waiting on the coll request free list failed.\n"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    (*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
    (*coll_request)->progress_fn = iboffload->fanin_algth;

    (*coll_request)->completion_cb_fn = NULL;
    (*coll_request)->order_info = &input_args->order_info;

    (*coll_request)->module = iboffload;
    (*coll_request)->ml_buffer_index = input_args->buffer_index;
    (*coll_request)->buffer_info[SBUF].offset = 0;
    (*coll_request)->buffer_info[RBUF].offset = 0;
    (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;

    input_args->bcol_opaque_data = (void *) (*coll_request);

    /* finish initializing the full message descriptor */
    (*coll_request)->n_fragments = 1;
    (*coll_request)->n_frags_sent = 1;

    (*coll_request)->n_frag_mpi_complete = 0;
    (*coll_request)->n_frag_net_complete = 0;

    (*coll_request)->user_handle_freed = false;

    /*
     * set up the collective work request
     */

    /* get a collective frag */
    coll_fragment = &(*coll_request)->first_collfrag;
    mca_bcol_iboffload_collfrag_init(coll_fragment);

    coll_fragment->alg = FANIN_ALG;
    coll_fragment->mq_index = COLL_MQ;

    /* Set the mq credits */
    coll_fragment->mq_credits = iboffload->alg_task_consump[FANIN_ALG];

    /* set the pointers for (coll frag) <-> (coll full request) */
    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);

    return OMPI_SUCCESS;
}

/************************************************************************
 ************************ New style Fan-In ******************************
 ***********************************************************************/
static int mca_bcol_iboffload_new_style_fanin_progress(
        bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
        (mca_bcol_iboffload_collreq_t *)
        input_args->bcol_opaque_data;

    if (BCOL_IS_COMPLETED(coll_request)) {
        coll_request->user_handle_freed = true;
        if (COLLREQ_IS_DONE(coll_request)) {
            IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
            RELEASE_COLLREQ(coll_request);
        }

        IBOFFLOAD_VERBOSE(10, ("Fan-In already done.\n"));
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}

int mca_bcol_iboffload_new_style_fanin_first_call(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int i = 0, leader_rank = 0, /* We always assume the lowest index is the leader */
        my_rank = iboffload->ibnet->super.my_index,
        sbgp_size = iboffload->ibnet->super.group_size;

    mca_bcol_iboffload_endpoint_t *ep = NULL;
    mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc;

    assert(NULL != my_ibnet_proc);

    if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) {
        iboffload->fanin_algth = mca_bcol_iboffload_fanin_leader_progress;
        iboffload->alg_task_consump[FANIN_ALG] += sbgp_size;

        for (i = leader_rank + 1; i < sbgp_size; ++i) {
            ep = iboffload->endpoints[i];
            while (OMPI_SUCCESS !=
                    check_endpoint_state(ep, NULL, NULL)) {
                opal_progress();
            }
        }
    } else {
        iboffload->fanin_algth = mca_bcol_iboffload_fanin_proxy_progress;
        iboffload->alg_task_consump[FANIN_ALG] += 1;

        ep = iboffload->endpoints[leader_rank];
        while (OMPI_SUCCESS !=
                check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }
    }

    return iboffload->fanin_algth(iboffload, coll_request);
}

static int mca_bcol_iboffload_new_style_fanin_intra(
        bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    int rc = OMPI_SUCCESS;

    struct mca_bcol_iboffload_collreq_t *coll_request = NULL;
    mca_bcol_iboffload_module_t *iboffload =
        (mca_bcol_iboffload_module_t *) const_args->bcol_module;

    assert(NULL != iboffload);

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);

    /* Init the Fan-In collective request */
    rc = mca_bcol_iboffload_fanin_init(input_args, iboffload, &coll_request);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n"));
        return BCOL_FN_NOT_STARTED;
    }

    rc = iboffload->fanin_algth(iboffload, coll_request);
    if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
        return BCOL_FN_NOT_STARTED;
    }

    return BCOL_FN_STARTED;
}

int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n"));

    comm_attribs.bcoll_type = BCOL_FANIN;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
        &comm_attribs, &inv_attribs,
        mca_bcol_iboffload_new_style_fanin_intra,
        mca_bcol_iboffload_new_style_fanin_progress);

    return OMPI_SUCCESS;
}
@ -1,349 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"
#include "bcol_iboffload_collfrag.h"
#include "bcol_iboffload_endpoint.h"

static int mca_bcol_iboffload_fanout_leader_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0, rank,
        sbgp_size = iboffload->ibnet->super.group_size;

    struct mqe_task *last_send = NULL;
    mca_bcol_iboffload_task_t *send_task = NULL;
    mca_bcol_iboffload_frag_t *send_fragment = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));
        goto out_of_resources;
    }

    for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
        /* post send */
        send_fragment = mca_bcol_iboffload_get_send_frag(coll_request,
                rank, coll_request->qp_index, 0,
                0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
        if (NULL == send_fragment) {
            IBOFFLOAD_VERBOSE(10, ("Failed to get and pack a send frag.\n"));
            goto out_of_resources;
        }

        send_task = mca_bcol_iboffload_get_send_task(iboffload, rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER,
                send_fragment, coll_fragment, INLINE);
        if (NULL == send_task) {
            IBOFFLOAD_VERBOSE(10, ("Failed to get a send task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task);
    }

    /* end of list */
    *mqe_ptr_to_set = NULL;
    assert(NULL != last_send);

    last_send->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_send->wr_id;
    last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-out, adding the collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}

static int mca_bcol_iboffload_fanout_proxy_progress(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0;

    struct mqe_task *last_wait = NULL;
    mca_bcol_iboffload_task_t *wait_task = NULL;
    mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on the MQ.\n"));
        goto out_of_resources;
    }

    /* post wait */
    preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
            iboffload, leader_rank, coll_request->qp_index);
    if (NULL == preposted_recv_frag) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get a preposted recv frag.\n"));
        goto out_of_resources;
    }

    wait_task = mca_bcol_iboffload_get_wait_task(iboffload, leader_rank, 1,
            preposted_recv_frag, coll_request->qp_index, NULL);
    if (NULL == wait_task) {
        IBOFFLOAD_VERBOSE(10, ("Failed to get a wait task.\n"));
        goto out_of_resources;
    }

    APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);

    /* end of list */
    *mqe_ptr_to_set = NULL;

    last_wait->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failed.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-out, adding the collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}

static int mca_bcol_iboffload_fanout_init(
        bcol_function_args_t *input_args,
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t **coll_request)
{
    ompi_free_list_item_t *item = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_fanout_init"));

    OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
    if (NULL == item) {
        IBOFFLOAD_VERBOSE(10, ("Waiting on the coll request free list failed.\n"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    (*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
    (*coll_request)->progress_fn = iboffload->fanout_algth;

    (*coll_request)->completion_cb_fn = NULL;
    (*coll_request)->order_info = &input_args->order_info;

    (*coll_request)->module = iboffload;
    (*coll_request)->ml_buffer_index = input_args->buffer_index;
    (*coll_request)->buffer_info[SBUF].offset = 0;
    (*coll_request)->buffer_info[RBUF].offset = 0;
    (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;

    /* finish initializing the full message descriptor */
    (*coll_request)->n_fragments = 1;
    (*coll_request)->n_frags_sent = 1;

    (*coll_request)->n_frag_mpi_complete = 0;
    (*coll_request)->n_frag_net_complete = 0;

    (*coll_request)->user_handle_freed = false;

    input_args->bcol_opaque_data = (void *) (*coll_request);

    /*
     * set up the collective work request
     */

    /* get a collective frag */
    coll_fragment = &(*coll_request)->first_collfrag;
    mca_bcol_iboffload_collfrag_init(coll_fragment);

    coll_fragment->alg = FANOUT_ALG;
    coll_fragment->mq_index = COLL_MQ;

    /* Set the mq credits */
    coll_fragment->mq_credits = iboffload->alg_task_consump[FANOUT_ALG];

    /* set the pointers for (coll frag) <-> (coll full request) */
    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);

    return OMPI_SUCCESS;
}

/************************************************************************
 ************************ New style Fan-Out *****************************
 ***********************************************************************/
static int mca_bcol_iboffload_new_style_fanout_progress(
        bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
        (mca_bcol_iboffload_collreq_t *)
        input_args->bcol_opaque_data;

    if (BCOL_IS_COMPLETED(coll_request)) {
        coll_request->user_handle_freed = true;
        if (COLLREQ_IS_DONE(coll_request)) {
            IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
            RELEASE_COLLREQ(coll_request);
        }

        IBOFFLOAD_VERBOSE(10, ("Fan-Out already done.\n"));
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}

int mca_bcol_iboffload_new_style_fanout_first_call(
        mca_bcol_iboffload_module_t *iboffload,
        struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int i = 0, leader_rank = 0, /* We always assume the lowest index is the leader */
        my_rank = iboffload->ibnet->super.my_index,
        sbgp_size = iboffload->ibnet->super.group_size;

    mca_bcol_iboffload_endpoint_t *ep = NULL;
    mca_sbgp_ibnet_proc_t *my_ibnet_proc = iboffload->endpoints[my_rank]->ibnet_proc;

    assert(NULL != my_ibnet_proc);

    if (MCA_SBGP_IBNET_NODE_LEADER == my_ibnet_proc->duty) {
        iboffload->fanout_algth = mca_bcol_iboffload_fanout_leader_progress;
        iboffload->alg_task_consump[FANOUT_ALG] += sbgp_size;

        for (i = leader_rank + 1; i < sbgp_size; ++i) {
            ep = iboffload->endpoints[i];
            while (OMPI_SUCCESS !=
                    check_endpoint_state(ep, NULL, NULL)) {
                opal_progress();
            }
        }
    } else {
        iboffload->fanout_algth = mca_bcol_iboffload_fanout_proxy_progress;
        iboffload->alg_task_consump[FANOUT_ALG] += 1;

        ep = iboffload->endpoints[leader_rank];
        while (OMPI_SUCCESS !=
                check_endpoint_state(ep, NULL, NULL)) {
            opal_progress();
        }
    }

    return iboffload->fanout_algth(iboffload, coll_request);
}

static int mca_bcol_iboffload_new_style_fanout_intra(
        bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    int rc = OMPI_SUCCESS;

    struct mca_bcol_iboffload_collreq_t *coll_request = NULL;
    mca_bcol_iboffload_module_t *iboffload =
        (mca_bcol_iboffload_module_t *) const_args->bcol_module;

    assert(NULL != iboffload);

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);

    /* Init the Fan-Out collective request */
    rc = mca_bcol_iboffload_fanout_init(input_args, iboffload, &coll_request);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanout_init.\n"));
        return BCOL_FN_NOT_STARTED;
    }

    rc = iboffload->fanout_algth(iboffload, coll_request);
    if (OPAL_UNLIKELY(OMPI_ERROR == rc)) {
        return BCOL_FN_NOT_STARTED;
    }

    return BCOL_FN_STARTED;
}

int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-Out.\n"));

    comm_attribs.bcoll_type = BCOL_FANOUT;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super,
        &comm_attribs, &inv_attribs,
        mca_bcol_iboffload_new_style_fanout_intra,
        mca_bcol_iboffload_new_style_fanout_progress);

    return OMPI_SUCCESS;
}
@ -1,272 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "opal/include/opal/types.h"
#include "opal/datatype/opal_convertor.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"

static void frag_constructor(mca_bcol_iboffload_frag_t *frag)
{
    mca_bcol_iboffload_reg_t *reg =
        (mca_bcol_iboffload_reg_t *) frag->super.registration;

    memset(&frag->sg_entry, 0, sizeof(struct ibv_sge));
    frag->sg_entry.addr = (uint64_t) (uintptr_t) frag->super.ptr;

    frag->registration = reg;

    if (NULL != reg) {
        frag->sg_entry.lkey = reg->mr->lkey;
    }

    frag->next = NULL;
    frag->type = MCA_BCOL_IBOFFLOAD_NONE_OWNER;
    frag->ref_counter = 0;
    frag->qp_index = -1;
}

OBJ_CLASS_INSTANCE(
        mca_bcol_iboffload_frag_t,
        ompi_free_list_item_t,
        frag_constructor,
        NULL);
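frag_constructor() runs once per free-list item, so a frag's SGE address and lkey are already valid by the time it is handed out. The get/return cycle used throughout this file looks like this (a sketch, assuming an initialized device):

/* Sketch: borrow a registered frag from a per-QP free list and
 * return it; the constructor has already filled in sg_entry. */
ompi_free_list_item_t *item;
mca_bcol_iboffload_frag_t *frag;

OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
if (NULL != item) {
    frag = (mca_bcol_iboffload_frag_t *) item;
    /* ... use frag->sg_entry.addr / frag->sg_entry.lkey ... */
    OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], item);
}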
||||
|
||||
|
||||
static mca_bcol_iboffload_frag_t*
|
||||
mca_bcol_iboffload_get_ml_frag_calc(mca_bcol_iboffload_module_t *iboffload,
|
||||
mca_bcol_iboffload_collreq_t *coll_request,
|
||||
size_t len, size_t src_offset)
|
||||
{
|
||||
int rc;
|
||||
|
||||
mca_bcol_iboffload_frag_t *fragment;
|
||||
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
|
||||
|
||||
uint64_t sbuff = (uint64_t) (uintptr_t) coll_request->buffer_info[SBUF].buf +
|
||||
src_offset;
|
||||
|
||||
/* The buffer was allocated on ML level,
|
||||
no need to allocate local buffer */
|
||||
rc = pack_data_for_calc(iboffload->device->dev.ib_dev_context,
|
||||
cm->map_ompi_to_ib_calcs[coll_request->op->op_type],
|
||||
cm->map_ompi_to_ib_dt[coll_request->dtype->id],
|
||||
false /* host order */,
|
||||
(void *) sbuff, 0,
|
||||
&coll_request->actual_ib_op,
|
||||
&coll_request->actual_ib_dtype,
|
||||
(void *) sbuff);
|
||||
if (OPAL_UNLIKELY(0 != rc)) {
|
||||
IBOFFLOAD_VERBOSE(10, ("pack_data_for_calc failed, op: %s, type: %s\n",
|
||||
coll_request->op->o_name, coll_request->dtype->name));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fragment = mca_bcol_iboffload_get_ml_frag(
|
||||
iboffload, coll_request->qp_index, len,
|
||||
coll_request->buffer_info[SBUF].lkey,
|
||||
sbuff);
|
||||
|
||||
return fragment;
|
||||
}
|
||||
|
||||
static mca_bcol_iboffload_frag_t *
|
||||
mca_bcol_iboffload_get_packed_frag(mca_bcol_iboffload_module_t *iboffload,
|
||||
uint32_t destination, int qp_index, size_t len,
|
||||
struct opal_convertor_t *convertor)
|
||||
{
|
||||
/* local variables */
|
||||
int rc;
|
||||
uint32_t out_size;
|
||||
size_t max_size = 0;
|
||||
|
||||
struct iovec payload_iovec;
|
||||
|
||||
ompi_free_list_item_t *item;
|
||||
mca_bcol_iboffload_frag_t *frag;
|
||||
|
||||
mca_bcol_iboffload_device_t *device = iboffload->device;
|
||||
|
||||
/* Get frag from free list */
|
||||
OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag = (mca_bcol_iboffload_frag_t *) item;
|
||||
|
||||
/* Pack data into the buffer */
|
||||
out_size = 1;
|
||||
payload_iovec.iov_len = len;
|
||||
|
||||
payload_iovec.iov_base = (void *) (uintptr_t) frag->sg_entry.addr;
|
||||
|
||||
rc = opal_convertor_pack(convertor, &(payload_iovec),
|
||||
&out_size, &max_size);
|
||||
if (OPAL_UNLIKELY(rc < 0)) {
|
||||
/* Error: put the fragment back */
|
||||
OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], item);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return frag;
|
||||
}
|
||||
|
||||
static mca_bcol_iboffload_frag_t *
|
||||
mca_bcol_iboffload_get_calc_frag(mca_bcol_iboffload_module_t *iboffload, int qp_index,
|
||||
struct mca_bcol_iboffload_collreq_t *coll_request)
|
||||
{
|
||||
int rc;
|
||||
|
||||
ompi_free_list_item_t *item;
|
||||
mca_bcol_iboffload_frag_t *frag;
|
||||
|
||||
mca_bcol_iboffload_device_t *device = iboffload->device;
|
||||
mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
|
||||
|
||||
IBOFFLOAD_VERBOSE(10, ("Start to pack frag.\n"));
|
||||
|
||||
/* Get frag from free list */
|
||||
OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag = (mca_bcol_iboffload_frag_t *) item;
|
||||
|
||||
/* Pack data into the buffer */
|
||||
rc = pack_data_for_calc(device->dev.ib_dev_context,
|
||||
cm->map_ompi_to_ib_calcs[coll_request->op->op_type],
|
||||
cm->map_ompi_to_ib_dt[coll_request->dtype->id], false,
|
||||
coll_request->buffer_info[SBUF].buf, 0,
|
||||
&coll_request->actual_ib_op,
|
||||
&coll_request->actual_ib_dtype,
|
||||
(void *) (uintptr_t) frag->sg_entry.addr);
    if (OPAL_UNLIKELY(0 != rc)) {
        IBOFFLOAD_ERROR(("pack_data_for_calc failed, op: %s, type: %s\n",
                         coll_request->op->o_name, coll_request->dtype->name));
        /* Put the fragment back on the free list before bailing out,
         * mirroring the error path in mca_bcol_iboffload_get_packed_frag() */
        OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index], item);
        return NULL;
    }

    return frag;
}
mca_bcol_iboffload_frag_t*
mca_bcol_iboffload_get_send_frag(mca_bcol_iboffload_collreq_t *coll_request,
                                 uint32_t destination, int qp_index, size_t len,
                                 size_t src_offset, int buf_index, int send_frag_type)
{
    /* local variables */
    mca_bcol_iboffload_frag_t *frag;
    mca_bcol_iboffload_module_t *iboffload = coll_request->module;

    mca_bcol_iboffload_endpoint_t *endpoint =
                                   iboffload->endpoints[destination];

    IBOFFLOAD_VERBOSE(10, ("Calling mca_bcol_iboffload_get_send_frag qp_index %d",
                           qp_index));

    if ((endpoint->qps[qp_index].sd_wqe) <= 0) {
        IBOFFLOAD_VERBOSE(10, ("No send wqe %d",
                               endpoint->qps[qp_index].sd_wqe));
        return NULL;
    }

    --endpoint->qps[qp_index].sd_wqe;

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p: qp_index %d, destination %d, sd_wqe %d",
                           endpoint, qp_index, destination, endpoint->qps[qp_index].sd_wqe));

    switch (send_frag_type) {
    case MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY:
        IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY"));
        assert(NULL != &iboffload->device->dummy_frags[qp_index]);
        return &iboffload->device->dummy_frags[qp_index];

    case MCA_BCOL_IBOFFLOAD_SEND_FRAG:
        {
            ompi_free_list_item_t *item;
            IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG"));

            /* Get frag from free list */
            OMPI_FREE_LIST_GET_MT(&iboffload->device->frags_free[qp_index], item);

            frag = (mca_bcol_iboffload_frag_t *) item;
        }

        break;
    case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT:
        IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT"));
        frag = mca_bcol_iboffload_get_packed_frag(iboffload, destination,
                                  qp_index, len, &coll_request->send_convertor);

        break;
    case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC:
        IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC"));
        frag = mca_bcol_iboffload_get_calc_frag(iboffload, qp_index, coll_request);

        break;
    case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML:
        IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML"));
        frag = mca_bcol_iboffload_get_ml_frag(
                iboffload, qp_index, len, coll_request->buffer_info[buf_index].lkey,
                (uint64_t)(uintptr_t) coll_request->buffer_info[buf_index].buf + src_offset);

        break;
    case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC:
        IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC"));
        frag = mca_bcol_iboffload_get_ml_frag_calc(iboffload, coll_request, len, src_offset);

        break;
    default:
        IBOFFLOAD_VERBOSE(10, ("Getting default"));
        frag = NULL;
        IBOFFLOAD_ERROR(("Unknown send frag type %d for QP index %d",
                         send_frag_type, qp_index));
    }
    if (OPAL_UNLIKELY(NULL == frag)) {
        IBOFFLOAD_VERBOSE(10, ("Getting NULL"));
        /* Give back the send WQE credit taken above,
         * since no fragment was actually consumed */
        ++endpoint->qps[qp_index].sd_wqe;
        return NULL;
    }

    frag->sg_entry.length = len;
    frag->next = NULL;

    return frag;
}
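
/*
 * A minimal sketch of a call site for the dispatcher above, assuming a
 * fully initialized coll_request whose SBUF buffer_info is already set
 * up.  The example_* name and the surrounding control flow are
 * illustrative assumptions, not part of the original file, so the block
 * is guarded out of compilation.
 */
#if 0
static int example_send_one_ml_frag(mca_bcol_iboffload_collreq_t *coll_request,
                                    uint32_t dest, int qp_index, size_t len)
{
    /* Take one send fragment backed by the ML-level buffer */
    mca_bcol_iboffload_frag_t *frag =
        mca_bcol_iboffload_get_send_frag(coll_request, dest, qp_index, len,
                                         0 /* src_offset */, SBUF,
                                         MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML);
    if (NULL == frag) {
        /* No send WQE credit or no free fragment - retry later */
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    /* ... wrap the fragment in a send task and post it ... */
    return OMPI_SUCCESS;
}
#endif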

void
mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    int qp_index = *(int *) ctx;
    mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item;

    frag->qp_index = qp_index;
    frag->type = MCA_BCOL_IBOFFLOAD_BCOL_OWNER;
}

void
mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_bcol_iboffload_frag_t *frag = (mca_bcol_iboffload_frag_t *) item;

    frag->qp_index = -1;
    frag->type = MCA_BCOL_IBOFFLOAD_ML_OWNER;
}
@ -1,154 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_FRAG_H
#define MCA_BCOL_IBOFFLOAD_FRAG_H

#include "ompi_config.h"

#include <infiniband/verbs.h>

#include "opal/datatype/opal_convertor.h"

#include "opal/mca/mpool/mpool.h"
#include "opal/class/ompi_free_list.h"

#include "bcol_iboffload.h"

BEGIN_C_DECLS

/* forward declarations */
struct mca_bcol_iboffload_collreq_t;

struct mca_bcol_iboffload_reg_t {
    mca_mpool_base_registration_t base;
    struct ibv_mr *mr;
};
typedef struct mca_bcol_iboffload_reg_t mca_bcol_iboffload_reg_t;

typedef enum {
    MCA_BCOL_IBOFFLOAD_NONE_OWNER = -1,
    MCA_BCOL_IBOFFLOAD_DUMMY_OWNER,
    MCA_BCOL_IBOFFLOAD_BCOL_OWNER,
    MCA_BCOL_IBOFFLOAD_ML_OWNER
} frag_type;

typedef enum {
    MCA_BCOL_IBOFFLOAD_SEND_FRAG,
    MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML,
    MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC,
    MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT,
    MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC,
    MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY
} send_frag_type;

struct mca_bcol_iboffload_frag_t {
    ompi_free_list_item_t super;

    struct mca_bcol_iboffload_frag_t *next;
    struct mca_bcol_iboffload_reg_t *registration;

    struct ibv_sge sg_entry;

    frag_type type;

    int ref_counter;
    int qp_index;
};
typedef struct mca_bcol_iboffload_frag_t mca_bcol_iboffload_frag_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_frag_t);

/* The same fragment may be shared by multiple tasks.
 * To manage the release and allocation flow correctly,
 * each fragment carries a reference counter, and the following
 * wrapper allocation and release macros hide the counter. */

#define IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(fragment, task)           \
    do {                                                            \
        ++((fragment)->ref_counter);                                \
        (task)->frag = (fragment);                                  \
    } while(0)

#define IBOFFLOAD_SET_FRAGS_ON_TASK(fragment, task)                 \
    do {                                                            \
        struct mca_bcol_iboffload_frag_t *temp_frag = fragment;     \
        while (NULL != temp_frag) {                                 \
            ++(temp_frag->ref_counter);                             \
            temp_frag = temp_frag->next;                            \
        }                                                           \
        (task)->frag = fragment;                                    \
    } while(0)
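
/*
 * A minimal sketch (not part of the original header) showing how the
 * wrappers above pair with a release: attaching bumps ref_counter, and
 * the owner decrements it on completion before handing the fragment to
 * mca_bcol_iboffload_return_frag_tolist() (bcol_iboffload_task.h), which
 * only frees once the counter is back at zero.  The example_* name is
 * hypothetical, so the block is guarded out of compilation.
 */
#if 0
static void example_attach_and_release(struct mca_bcol_iboffload_task_t *task,
                                       mca_bcol_iboffload_frag_t *fragment,
                                       ompi_free_list_t *frag_lists)
{
    /* One task, one fragment: ref_counter goes from 0 to 1 */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(fragment, task);

    /* ... post the task and wait for its completion ... */

    /* Completion path: drop our reference, then try to return it */
    --(fragment->ref_counter);
    mca_bcol_iboffload_return_frag_tolist(fragment, frag_lists);
}
#endif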

/* function declarations */
mca_bcol_iboffload_frag_t *
mca_bcol_iboffload_get_send_frag(struct mca_bcol_iboffload_collreq_t *coll_request,
                                 uint32_t destination, int qp_index, size_t len,
                                 size_t src_offset, int buff_index, int send_frag_type);

void
mca_bcol_iboffload_frag_init(ompi_free_list_item_t* item, void* ctx);

void
mca_bcol_iboffload_ml_frag_init(ompi_free_list_item_t* item, void* ctx);

static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_empty_frag(
                                mca_bcol_iboffload_module_t *iboffload,
                                int qp_index)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_frag_t *frag;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    /* Get frag from free list */
    OMPI_FREE_LIST_GET_MT(&cm->ml_frags_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        return NULL;
    }

    frag = (mca_bcol_iboffload_frag_t *) item;

    frag->qp_index = qp_index;
    frag->next = NULL;

    return frag;
}

static inline __opal_attribute_always_inline__
mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_frag(
                                mca_bcol_iboffload_module_t *iboffload,
                                int qp_index, size_t len, uint32_t lkey, uint64_t addr)
{
    /* local variables */
    mca_bcol_iboffload_frag_t *frag;

    IBOFFLOAD_VERBOSE(10, ("Call for get ML frag - addr 0x%x", addr));

    frag = mca_bcol_iboffload_get_ml_empty_frag(iboffload, qp_index);
    if (OPAL_UNLIKELY(NULL == frag)) {
        /* The ML free list is exhausted; callers handle NULL */
        return NULL;
    }

    frag->sg_entry.addr = addr;
    frag->sg_entry.lkey = lkey;
    frag->sg_entry.length = len;

    IBOFFLOAD_VERBOSE(10, ("Setting ml frag lkey %u, "
                           "addr %p, qp_index %d, send value - %lf",
                           frag->sg_entry.lkey, frag->sg_entry.addr,
                           qp_index, *(double *) frag->sg_entry.addr));

    return frag;
}

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_FRAG_H */
@ -1,451 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_mca.h"

#include "ompi/constants.h"
#include "ompi/mca/common/ofacm/base.h"
#include "ompi/communicator/communicator.h"

#include "opal/util/show_help.h"

/*
 * Local flags
 */
enum {
    REGINT_NEG_ONE_OK = 0x01,
    REGINT_GE_ZERO = 0x02,
    REGINT_GE_ONE = 0x04,
    REGINT_NONZERO = 0x08,
    REGINT_MAX = 0x88
};

enum {
    REGSTR_EMPTY_OK = 0x01,
    REGSTR_MAX = 0x88
};
mca_base_var_enum_value_t mtu_values[] = {
    {IBV_MTU_256, "256B"},
    {IBV_MTU_512, "512B"},
    {IBV_MTU_1024, "1k"},
    {IBV_MTU_2048, "2k"},  /* 2048 is advertised in the bcol_iboffload_mtu help string below */
    {IBV_MTU_4096, "4k"},
    {0, NULL}
};

/*
 * utility routine for string parameter registration
 */
static int reg_string(const char* param_name,
                      const char* deprecated_param_name,
                      const char* param_desc,
                      const char* default_value, char **storage,
                      int flags)
{
    int index;

    /* the MCA variable system will not attempt to modify this value */
    *storage = (char *) default_value;
    index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
                                            param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
                                            NULL, 0, 0, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
                                             MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    if (0 != (flags & REGSTR_EMPTY_OK) && 0 == strlen(*storage)) {
        opal_output(0, "Bad parameter value for parameter \"%s\"",
                    param_name);
        return OMPI_ERR_BAD_PARAM;
    }

    return OMPI_SUCCESS;
}

/*
 * utility routine for integer parameter registration
 */
static int reg_int(const char* param_name,
                   const char* deprecated_param_name,
                   const char* param_desc,
                   int default_value, int *storage, int flags)
{
    int index;

    *storage = default_value;
    index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
                                            param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
                                            NULL, 0, 0, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
                                             MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
        return OMPI_SUCCESS;
    }

    if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
        (0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
        (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
        opal_output(0, "Bad parameter value for parameter \"%s\"",
                    param_name);
        return OMPI_ERR_BAD_PARAM;
    }

    return OMPI_SUCCESS;
}

/*
 * utility routine for boolean parameter registration
 */
static int reg_bool(const char* param_name,
                    const char* deprecated_param_name,
                    const char* param_desc,
                    bool default_value, bool *storage)
{
    int index;

    *storage = default_value;
    index = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
                                            param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
                                            NULL, 0, 0, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "iboffload", deprecated_param_name,
                                             MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    return OMPI_SUCCESS;
}

int mca_bcol_iboffload_verify_params(void)
{
    if (mca_bcol_iboffload_component.min_rnr_timer > 31) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_min_rnr_timer > 31",
                       "bcol_iboffload_ib_min_rnr_timer reset to 31");
        mca_bcol_iboffload_component.min_rnr_timer = 31;
    } else if (mca_bcol_iboffload_component.min_rnr_timer < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_min_rnr_timer < 0",
                       "bcol_iboffload_ib_min_rnr_timer reset to 0");
        mca_bcol_iboffload_component.min_rnr_timer = 0;
    }

    if (mca_bcol_iboffload_component.timeout > 31) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_timeout > 31",
                       "bcol_iboffload_ib_timeout reset to 31");
        mca_bcol_iboffload_component.timeout = 31;
    } else if (mca_bcol_iboffload_component.timeout < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_timeout < 0",
                       "bcol_iboffload_ib_timeout reset to 0");
        mca_bcol_iboffload_component.timeout = 0;
    }

    if (mca_bcol_iboffload_component.retry_count > 7) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_retry_count > 7",
                       "bcol_iboffload_ib_retry_count reset to 7");
        mca_bcol_iboffload_component.retry_count = 7;
    } else if (mca_bcol_iboffload_component.retry_count < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_retry_count < 0",
                       "bcol_iboffload_ib_retry_count reset to 0");
        mca_bcol_iboffload_component.retry_count = 0;
    }

    if (mca_bcol_iboffload_component.max_rdma_dst_ops > 7) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_rnr_retry > 7",
                       "bcol_iboffload_ib_rnr_retry reset to 7");
        mca_bcol_iboffload_component.max_rdma_dst_ops = 7;
    } else if (mca_bcol_iboffload_component.max_rdma_dst_ops < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_rnr_retry < 0",
                       "bcol_iboffload_ib_rnr_retry reset to 0");
        mca_bcol_iboffload_component.max_rdma_dst_ops = 0;
    }

    if (mca_bcol_iboffload_component.service_level > 15) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_service_level > 15",
                       "bcol_iboffload_ib_service_level reset to 15");
        mca_bcol_iboffload_component.service_level = 15;
    } else if (mca_bcol_iboffload_component.service_level < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_service_level < 0",
                       "bcol_iboffload_ib_service_level reset to 0");
        mca_bcol_iboffload_component.service_level = 0;
    }

    if (mca_bcol_iboffload_component.buffer_alignment <= 1 ||
        (mca_bcol_iboffload_component.buffer_alignment & (mca_bcol_iboffload_component.buffer_alignment - 1))) {
        opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
                       true, mca_bcol_iboffload_component.buffer_alignment, ompi_process_info.nodename, 64);
        mca_bcol_iboffload_component.buffer_alignment = 64;
    }

    return OMPI_SUCCESS;
}

int mca_bcol_iboffload_register_params(void)
{
    mca_base_var_enum_t *new_enum;
    char *msg;
    int ret = OMPI_SUCCESS, tmp;

#define CHECK(expr) do {                       \
        tmp = (expr);                          \
        if (OMPI_SUCCESS != tmp) ret = tmp;    \
    } while (0)

    /* register openib component parameters */
    CHECK(reg_int("k_nomial_radix", NULL,
                  "The radix of the K-nomial tree for scatter-gather type algorithms "
                  "(starts from 2)", 2, &mca_bcol_iboffload_component.k_nomial_radix,
                  REGINT_GE_ONE));

    CHECK(reg_int("priority", NULL,
                  "IB offload component priority "
                  "(from 0 (low) to 90 (high))", 90,
                  &mca_bcol_iboffload_component.super.priority, 0));

    CHECK(reg_int("verbose", NULL,
                  "Output some verbose IB offload BTL information "
                  "(0 = no output, nonzero = output)", 0,
                  &mca_bcol_iboffload_component.verbose, 0));

    CHECK(reg_bool("warn_default_gid_prefix", NULL,
                   "Warn when there is more than one active port and at least one of them is connected to a network with only the default GID prefix configured (0 = do not warn; any other value = warn)",
                   true, &mca_bcol_iboffload_component.warn_default_gid_prefix));

    CHECK(reg_bool("warn_nonexistent_if", NULL,
                   "Warn if non-existent devices and/or ports are specified in the bcol_iboffload_if_[in|ex]clude MCA parameters (0 = do not warn; any other value = warn)",
                   true, &mca_bcol_iboffload_component.warn_nonexistent_if));

    CHECK(reg_int("max_pipeline_depth", NULL,
                  "The maximal number of fragments of the same collective request that can be transferred in parallel", 3,
                  (int *) &mca_bcol_iboffload_component.max_pipeline_depth, 0));

    CHECK(reg_int("max_mqe_tasks", NULL,
                  "Maximum number of MQEs for each iboffload module",
                  1024, &mca_bcol_iboffload_component.max_mqe_tasks, 0));
    CHECK(reg_int("max_mq_size", NULL,
                  "Maximum size of each MQ for each iboffload module",
                  1024, &mca_bcol_iboffload_component.max_mq_size, 0));
    CHECK(reg_int("free_list_num", NULL,
                  "Initial size of free lists (must be >= 1)",
                  256, &mca_bcol_iboffload_component.free_list_num,
                  REGINT_GE_ONE));
    CHECK(reg_int("free_list_max", NULL,
                  "Maximum size of free lists "
                  "(-1 = infinite, otherwise must be >= 1)",
                  -1, &mca_bcol_iboffload_component.free_list_max,
                  REGINT_NEG_ONE_OK | REGINT_GE_ONE));
    CHECK(reg_int("free_list_inc", NULL,
                  "Increment size of free lists (must be >= 1)",
                  32, &mca_bcol_iboffload_component.free_list_inc,
                  REGINT_GE_ONE));
    /* the rdma mpool no longer exists - we must use the grdma mpool component;
     * this should resolve errors in mtt testing
     */
    /*
    CHECK(reg_string("mpool", NULL,
                     "Name of the memory pool to be used (it is unlikely that you will ever want to change this)",
                     "rdma", &mca_bcol_iboffload_component.mpool_name,
                     0));
    */
    CHECK(reg_string("mpool", NULL,
                     "Name of the memory pool to be used (it is unlikely that you will ever want to change this)",
                     "grdma", &mca_bcol_iboffload_component.mpool_name,
                     0));
    CHECK(reg_int("cq_size", "cq_size",
                  "Size of the OpenFabrics completion "
                  "queue (will automatically be set to a minimum of "
                  "(2 * number_of_peers * bcol_iboffload_rd_num))",
                  1024, &mca_bcol_iboffload_component.cq_size, REGINT_GE_ONE));

    CHECK(reg_int("exchange_tree_order", NULL,
                  "The order of the exchange tree. "
                  "Must be a power of two.",
                  2, &mca_bcol_iboffload_component.exchange_tree_order, REGINT_GE_ONE));

    CHECK(reg_int("knomial_tree_order", NULL,
                  "The order of the k-nomial exchange tree.",
                  3, &mca_bcol_iboffload_component.knomial_tree_order, REGINT_GE_ONE));

    CHECK(reg_int("max_inline_data", "max_inline_data",
                  "Maximum size of inline data segment "
                  "(-1 = run-time probe to discover max value, "
                  "otherwise must be >= 0). "
                  "If not explicitly set, use max_inline_data from "
                  "the INI file containing device-specific parameters",
                  128, (int *) &mca_bcol_iboffload_component.max_inline_data,
                  REGINT_NEG_ONE_OK | REGINT_GE_ZERO));

#if 0
    CHECK(reg_string("pkey", "ib_pkey_val",
                     "OpenFabrics partition key (pkey) value. "
                     "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB partition key value (0x7fff)",
                     "0", &pkey, 0));
    /* Pasha
    mca_bcol_iboffload_component.pkey_val =
        ompi_btl_openib_ini_intify(pkey) & MCA_BTL_IB_PKEY_MASK;
    free(pkey);
    */
#endif

    CHECK(reg_string("receive_queues", NULL,
                     "Colon-delimited list of comma-delimited receive queue specifications, e.g. P,4096,8,6,4:P,32768,8,6,4",
                     "P,512,256,192,128", &mca_bcol_iboffload_component.receive_queues,
                     0));

    CHECK(reg_int("qp_ous_rd_atom", NULL,
                  "InfiniBand outstanding atomic reads (must be >= 0)", 4,
                  (int *) &mca_bcol_iboffload_component.qp_ous_rd_atom, REGINT_GE_ZERO));

    asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes",
             IBV_MTU_256,
             IBV_MTU_512,
             IBV_MTU_1024,
             IBV_MTU_2048,
             IBV_MTU_4096);
    if (NULL == msg) {
        /* Don't try to recover from this */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    CHECK(mca_base_var_enum_create("infiniband mtu", mtu_values, &new_enum));
    mca_bcol_iboffload_component.mtu = IBV_MTU_1024;
    tmp = mca_base_component_var_register(&mca_bcol_iboffload_component.super.bcol_version,
                                          "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                          OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                          &mca_bcol_iboffload_component.mtu);
    OBJ_RELEASE(new_enum);
    free(msg);

    if (0 > tmp) ret = tmp;

    tmp = mca_base_var_register_synonym(tmp, "ompi", "bcol", "iboffload", "ib_mtu",
                                        MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    if (0 > tmp) ret = tmp;

    CHECK(reg_int("ib_min_rnr_timer", NULL, "InfiniBand minimum "
                  "\"receiver not ready\" timer "
                  "(must be >= 0 and <= 31)",
                  1, &mca_bcol_iboffload_component.min_rnr_timer, 0));

    CHECK(reg_int("ib_timeout", NULL, "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * "
                  "(2^bcol_iboffload_ib_timeout) (must be >= 0 and <= 31)",
                  20, &mca_bcol_iboffload_component.timeout, 0));

    CHECK(reg_int("ib_retry_count", NULL, "InfiniBand transmit retry count "
                  "(must be >= 0 and <= 7)",
                  7, &mca_bcol_iboffload_component.retry_count, 0));

    CHECK(reg_int("ib_rnr_retry", NULL, "InfiniBand \"receiver not ready\" "
                  "retry count; applies *only* to SRQ/XRC queues. PP queues "
                  "use RNR retry values of 0 because Open MPI performs "
                  "software flow control to guarantee that RNRs never occur "
                  "(must be >= 0 and <= 7; 7 = \"infinite\")",
                  7, &mca_bcol_iboffload_component.rnr_retry, 0));

    CHECK(reg_int("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA "
                  "destination operations "
                  "(must be >= 0)",
                  4, &mca_bcol_iboffload_component.max_rdma_dst_ops, REGINT_GE_ZERO));

    CHECK(reg_int("ib_service_level", NULL, "InfiniBand service level "
                  "(must be >= 0 and <= 15)",
                  0, &mca_bcol_iboffload_component.service_level, 0));

    CHECK(reg_int("buffer_alignment", NULL,
                  "Preferred communication buffer alignment, in bytes "
                  "(must be > 0 and a power of two)",
                  64, &mca_bcol_iboffload_component.buffer_alignment, REGINT_GE_ZERO));

    /* register parameters controlling message fragmentation */
    CHECK(reg_int("min_frag_size", NULL,
                  "Minimum fragment size",
                  getpagesize(), &mca_bcol_iboffload_component.super.min_frag_size,
                  REGINT_GE_ONE));

    CHECK(reg_int("max_frag_size", NULL,
                  "Maximum fragment size",
                  FRAG_SIZE_NO_LIMIT, &mca_bcol_iboffload_component.super.max_frag_size,
                  REGINT_NONZERO));

    CHECK(reg_bool("can_use_user_buffers", NULL,
                   "User memory can be used by the collective algorithms",
                   true, &mca_bcol_iboffload_component.super.can_use_user_buffers));

    CHECK(reg_int("barrier_mode", NULL,
                  "Barrier mode: 0 - Recursive doubling; 1 - Recursive K-ing",
                  0, &mca_bcol_iboffload_component.barrier_mode, REGINT_GE_ZERO));

    CHECK(reg_int("max_progress_pull", NULL,
                  "Max number of progress pull checks",
                  8, &mca_bcol_iboffload_component.max_progress_pull, REGINT_GE_ZERO));

    CHECK(reg_int("use_brucks_smsg_alltoall_rdma", NULL,
                  "Use Bruck's algorithm for small-message alltoall with RDMA semantics: "
                  "1 = alg with no temp buffer recycling (faster), 2 = alg with temp buffer recycling (slower)",
                  0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_rdma, 0));

    CHECK(reg_int("use_brucks_smsg_alltoall_sr", NULL,
                  "Use Bruck's algorithm for small-message alltoall with Send/Recv semantics: "
                  "1 = alg with RTR (faster), 2 = alg with RNR (slower)",
                  0, &mca_bcol_iboffload_component.use_brucks_smsg_alltoall_sr, 0));

    CHECK(reg_int("alltoall_bruck_radix", NULL,
                  "Radix for Bruck's algorithm for small-message alltoall",
                  3, &mca_bcol_iboffload_component.k_alltoall_bruck_radix, 0));

    CHECK(reg_int("tmp_buf_alignment", NULL,
                  "Temp buffer alignment for Bruck's algorithm for small-message alltoall",
                  64, &mca_bcol_iboffload_component.tmp_buf_alignment, 0));

    /*
    CHECK(reg_string("if_include", NULL,
                     "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with bcol_iboffload_if_exclude.",
                     NULL, &mca_bcol_iboffload_component.if_include,
                     0));

    CHECK(reg_string("if_exclude", NULL,
                     "Comma-delimited list of devices/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with bcol_iboffload_if_include.",
                     NULL, &mca_bcol_iboffload_component.if_exclude,
                     0));
    */

    CHECK(mca_bcol_iboffload_verify_params());

    /* Register any MCA params for the connect pseudo-components */
    if (OMPI_SUCCESS == ret) {
        ret = ompi_common_ofacm_base_register(&mca_bcol_iboffload_component.super.bcol_version);
    }

    return ret;
}
@ -1,20 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#ifndef MCA_BCOL_IBOFFLOAD_MCA_H
#define MCA_BCOL_IBOFFLOAD_MCA_H

#include "ompi_config.h"

int mca_bcol_iboffload_register_params(void);
int mca_bcol_iboffload_verify_params(void);

#endif
File diff suppressed because it is too large
@ -1,452 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>

#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_qp_info.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"

static int mca_bcol_iboffload_dummy_frag_qp_prepost(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    struct ibv_recv_wr *recv_wr, *recv_bad;
    int ret, num_preposted = 0, start_wr_index;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
                           (void *) endpoint, num_to_prepost));

    if (OPAL_UNLIKELY(0 == num_to_prepost)) {
        IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, returning immediately"));
        return OMPI_SUCCESS;
    }

    /* make sure that we do not overrun the number of rd_wqe */
    if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
                               num_to_prepost, endpoint->qps[qp_index].rd_wqe));

        num_to_prepost = endpoint->qps[qp_index].rd_wqe;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_prepost %d, "
                           "start index of WRs - %d, rd_wqe - %d",
                           (void *) endpoint, qp_index, num_to_prepost,
                           start_wr_index, endpoint->qps[qp_index].rd_wqe));

    while (num_preposted < num_to_prepost) {
        /* prepost the special barrier frag to the recv queue */
        struct ibv_sge *dummy_sg_entry =
            &endpoint->iboffload_module->device->dummy_frags[qp_index].sg_entry;

        recv_wr[num_preposted].sg_list = dummy_sg_entry;
        ++num_preposted;
    }

    if (OPAL_LIKELY(num_preposted > 0)) {
        /* Set the tail */
        recv_wr[num_preposted - 1].next = NULL;

        /* post the list of recvs */
        ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
        if (OPAL_UNLIKELY(0 != ret)) {
            IBOFFLOAD_ERROR(("ibv_post_recv failed, error: %s [%d], "
                             "qp_index - %d.\n", strerror(errno), ret, qp_index));

            /* drop the lock taken above before bailing out */
            OPAL_THREAD_UNLOCK(&recv_wrs->lock);
            return OMPI_ERROR;
        }

        /* recover the last recv_wr if needed */
        if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
            recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
        }

        /* decrease the number of free recv WQEs */
        endpoint->qps[qp_index].rd_wqe -= num_preposted;
    }

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_prepost %d, num preposted - %d, qp_index - %d",
                           (void *) endpoint, num_to_prepost, num_preposted, qp_index));

    return OMPI_SUCCESS;
}

/*
 * Receive prepost:
 * return values:
 * OMPI_SUCCESS - the prepost succeeded (or there was nothing to prepost)
 * OMPI_ERROR   - fatal error during prepost
 */
static int mca_bcol_iboffload_frag_reg_qp_prepost(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_frag_t *frag;

    struct ibv_recv_wr *recv_wr, *recv_bad;
    int i, ret, num_preposted = 0, start_wr_index;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags);
    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
                           (void *) endpoint, num_to_prepost));

    if (OPAL_UNLIKELY(0 == num_to_prepost)) {
        IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, returning immediately"));
        return OMPI_SUCCESS;
    }

    /* make sure that we do not overrun the number of rd_wqe */
    if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
                               num_to_prepost, endpoint->qps[qp_index].rd_wqe));

        num_to_prepost = endpoint->qps[qp_index].rd_wqe;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* calculate the start index in the array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_prepost %d, "
                           "start index of WRs - %d, rd_wqe - %d",
                           (void *) endpoint, qp_index, num_to_prepost,
                           start_wr_index, endpoint->qps[qp_index].rd_wqe));

    while (num_preposted < num_to_prepost) {
        /* put the item on the list of preposted */
        OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
        if (OPAL_UNLIKELY(NULL == item)) {
            break;
        }

        frag = (mca_bcol_iboffload_frag_t *) item;
        opal_list_append(preposted, (opal_list_item_t *) item);

        recv_wr[num_preposted].sg_list = &frag->sg_entry;
        /* TODO Pasha - fix it later */ /* Vasily: Is this the right place to take the size value? */
        frag->sg_entry.length = cm->qp_infos[qp_index].size;
        ++num_preposted;
    }

    if (OPAL_LIKELY(num_preposted > 0)) {
        /* Set the tail */
        recv_wr[num_preposted - 1].next = NULL;

        /* post the list of recvs */
        ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
        if (OPAL_UNLIKELY(0 != ret)) {
            IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                             "qp_index - %d.\n",
                             ibv_get_device_name(device->dev.ib_dev),
                             strerror(errno), ret, qp_index));

            /* Return the allocated frags */
            for (i = 0; i < num_preposted; i++) {
                OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index],
                                         (ompi_free_list_item_t *)
                                                opal_list_remove_last(preposted));
            }

            /* drop the lock taken above before bailing out */
            OPAL_THREAD_UNLOCK(&recv_wrs->lock);
            return OMPI_ERROR;
        }

        /* recover the last recv_wr if needed */
        if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
            recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
        }

        /* decrease the number of free recv WQEs */
        endpoint->qps[qp_index].rd_wqe -= num_preposted;
    }

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_prepost %d, num preposted - %d",
                           (void *) endpoint, num_to_prepost, num_preposted));

    return OMPI_SUCCESS;
}

static void mca_bcol_iboffload_fillin_qp_attr(int qp_index,
                                              mca_bcol_iboffload_endpoint_t *ep,
                                              ompi_common_ofacm_base_qp_config_t *qp_config)
{
    uint32_t max_sge, *init_attr_mask =
                      &qp_config->init_attr_mask[qp_index];

    struct ibv_qp_attr *attr = &qp_config->attr[qp_index];
    struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    /* Set the special init attributes mask */
    *init_attr_mask = IBV_M_QP_EXT_CLASS_1 |
                      IBV_M_QP_EXT_CLASS_2 |
                      IBV_M_QP_EXT_IGNORE_RQ_OVERFLOW;

    /* Set init attributes */
    init_attr->qp_type = IBV_QPT_RC;

    /* Vasily: ??????
    init_attr->cap.max_inline_data =
                max_inline_size(qp, iboffload_module->device);
    */
    /* Pasha: we cannot leave max_inline empty!
       Todo: copy max_inline_size() from ofacm to
       the common area.
    */
    init_attr->cap.max_inline_data = (int32_t) cm->max_inline_data;

    /* We allocate an SG list for some algorithms (Bruck's alltoall) */
    max_sge = ep->iboffload_module->group_size / 2 +
              ep->iboffload_module->group_size % 2;

    /* max send sge should not exceed the device maximums */
    if (max_sge > (uint32_t)
                  ep->iboffload_module->device->ib_dev_attr.max_sge) {
        max_sge = (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge;
    }

    init_attr->cap.max_send_sge = max_sge;
    init_attr->cap.max_recv_sge = max_sge;
    /* Vasily: the value will be changed later */
    /* TODO Pasha: this is real crap */
    init_attr->cap.max_recv_wr = (uint32_t) cm->cq_size;
    init_attr->cap.max_send_wr = (uint32_t) cm->cq_size;

    /* Set attributes */

    /* attr->pkey_index = 0; */ /* Vasily: ????? */

    attr->port_num = ep->iboffload_module->port;
    /* Vasily: the value will be changed later */
    attr->path_mtu = (uint32_t)cm->mtu;

    attr->max_dest_rd_atomic = cm->max_rdma_dst_ops;
    attr->min_rnr_timer = (uint32_t)cm->min_rnr_timer;

    attr->ah_attr.is_global = 0;
    attr->ah_attr.sl = (uint32_t)cm->service_level;
    /* Vasily: from struct mca_bcol_iboffload_port_t ????? */
    /*
    attr->ah_attr.src_path_bits = iboffload_module->src_path_bits;
    */
    attr->ah_attr.port_num = ep->iboffload_module->port;
    /* JMS to be filled in later dynamically */
    attr->ah_attr.static_rate = 0;
    /* RTS params */
    attr->timeout = (uint32_t)cm->timeout;
    attr->retry_cnt = (uint32_t)cm->retry_count;
    attr->rnr_retry = (uint32_t)cm->rnr_retry;
    attr->max_rd_atomic = (uint32_t)cm->max_rdma_dst_ops;

    /* Init for the local mca_bcol_iboffload_endpoint_qp_t qps structure
     * that caches the qp information on the endpoint */
    OBJ_CONSTRUCT(&ep->qps[qp_index].preposted_frags, opal_list_t);

    /* Pasha: Need to add a function that will */
    ep->qps[qp_index].ib_inline_max = cm->max_inline_data;
    /* TODO Pasha - this is crap too... we do not have info for service qps. Fix it later */

    ep->qps[qp_index].sd_wqe = cm->qp_infos[qp_index].rd_num;
    ep->qps[qp_index].rd_wqe = cm->qp_infos[qp_index].rd_num;

    IBOFFLOAD_VERBOSE(10, ("ep - %p, qp index - %d, num of rd_wqe - %d.",
                           ep, qp_index, ep->qps[qp_index].rd_wqe));
}

static int mca_bcol_iboffload_alloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device)
{
    int length;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    ompi_free_list_t *frags_free = &device->frags_free[qp_index];

    OBJ_CONSTRUCT(frags_free, ompi_free_list_t);
    length = cm->qp_infos[qp_index].size;

    IBOFFLOAD_VERBOSE(10, ("free list len %d\n", length));
    if (OMPI_SUCCESS != ompi_free_list_init_ex_new(frags_free,
                sizeof(mca_bcol_iboffload_frag_t), MCA_IBOFFLOAD_CACHE_LINE_SIZE,
                OBJ_CLASS(mca_bcol_iboffload_frag_t),
                length, cm->buffer_alignment,
                cm->free_list_num,
                cm->free_list_max,
                cm->free_list_inc,
                device->mpool,
                mca_bcol_iboffload_frag_init,
                (void *) &cm->qp_infos[qp_index].qp_index)) {
        IBOFFLOAD_ERROR(("Failed to allocate frags_free"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_dealloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device)
{
    OBJ_DESTRUCT(&device->frags_free[qp_index]);

    return OMPI_SUCCESS;
}

static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_get_dummy_frag(
                mca_bcol_iboffload_endpoint_t *ep, int qp_index)
{
    return &ep->iboffload_module->device->dummy_frags[qp_index];
}

static mca_bcol_iboffload_frag_t *mca_bcol_iboffload_endpoint_get_preposted_frag(
                mca_bcol_iboffload_endpoint_t *ep, int qp_index)
{
    return (mca_bcol_iboffload_frag_t *)
                opal_list_remove_first(&ep->qps[qp_index].preposted_frags);
}

static void mca_bcol_iboffload_regular_qp_attr(int qp_index,
                                               mca_bcol_iboffload_endpoint_t *ep,
                                               ompi_common_ofacm_base_qp_config_t *qp_config)
{
    struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];

    mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);

    init_attr->send_cq = ep->iboffload_module->device->ib_cq;
    init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SMALL_MESSAGES];
}

static void mca_bcol_iboffload_large_buff_qp_attr(int qp_index,
                                                  mca_bcol_iboffload_endpoint_t *ep,
                                                  ompi_common_ofacm_base_qp_config_t *qp_config)
{
    struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];

    mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);

    init_attr->send_cq = ep->iboffload_module->device->ib_cq;
    init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_LARGE_MESSAGES];
}

static void mca_bcol_iboffload_sync_qp_attr(int qp_index,
                                            mca_bcol_iboffload_endpoint_t *ep,
                                            ompi_common_ofacm_base_qp_config_t *qp_config)
{
    struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];

    mca_bcol_iboffload_fillin_qp_attr(qp_index, ep, qp_config);

    init_attr->send_cq = ep->iboffload_module->device->ib_cq;
    init_attr->recv_cq = ep->recv_cq[IBOFFLOAD_CQ_SYNC];
}

static int mca_bcol_iboffload_setup_barrier_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
    qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr;
    qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;

    qp_info->alloc_resource = NULL;
    qp_info->dealloc_resource = NULL;

    qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_setup_regular_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
    qp_info->config_qp = mca_bcol_iboffload_regular_qp_attr;
    qp_info->prepost_recv = mca_bcol_iboffload_frag_reg_qp_prepost;

    qp_info->alloc_resource = mca_bcol_iboffload_alloc_reg_qp_resource;
    qp_info->dealloc_resource = mca_bcol_iboffload_dealloc_reg_qp_resource;

    qp_info->get_preposted_recv = mca_bcol_iboffload_endpoint_get_preposted_frag;

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_setup_large_buff_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
    qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr;

    qp_info->prepost_recv = NULL; /* We use "manual" ML frag preposting for this QP */
    qp_info->alloc_resource = NULL;
    qp_info->dealloc_resource = NULL;
    qp_info->get_preposted_recv = NULL;

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_setup_credit_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
    qp_info->config_qp = mca_bcol_iboffload_large_buff_qp_attr;
    qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;

    qp_info->alloc_resource = NULL;
    qp_info->dealloc_resource = NULL;

    qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;

    return OMPI_SUCCESS;
}

static int mca_bcol_iboffload_setup_sync_qp(mca_bcol_iboffload_qp_info_t* qp_info)
{
    qp_info->config_qp = mca_bcol_iboffload_sync_qp_attr;
    qp_info->prepost_recv = mca_bcol_iboffload_dummy_frag_qp_prepost;

    qp_info->alloc_resource = NULL;
    qp_info->dealloc_resource = NULL;

    qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;

    return OMPI_SUCCESS;
}

mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST] = {
    mca_bcol_iboffload_setup_barrier_qp,    /* MCA_BCOL_IBOFFLOAD_QP_BARRIER */
    mca_bcol_iboffload_setup_regular_qp,    /* MCA_BCOL_IBOFFLOAD_QP_REGULAR */
    mca_bcol_iboffload_setup_sync_qp,       /* MCA_BCOL_IBOFFLOAD_QP_SYNC */
    mca_bcol_iboffload_setup_credit_qp,     /* MCA_BCOL_IBOFFLOAD_QP_CREDIT */
    mca_bcol_iboffload_setup_large_buff_qp, /* MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF */
    /* MCA_BCOL_IBOFFLOAD_QP_LAST */
};
@ -1,127 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/*
 * In order to add a new QP you need to take the following steps:
 *
 * 1) Add a new index to the enum listing all of the QPs,
 *    e.g. MCA_BCOL_IBOFFLOAD_QP_NEW_QP.
 *
 * 2) In the setup_qps_fn array, init the MCA_BCOL_IBOFFLOAD_QP_NEW_QP
 *    index with your init func for this QP.
 *
 * 3) In the init func you added, init the following func pointers
 *    (a sketch follows this comment):
 *    a) config_qp - in this func you fill in the ibv_qp_init_attr
 *       structure that will be used for this QP's creation.
 *
 *    b) prepost_recv - specify this pointer if you want preposting
 *       to your new QP to be executed automatically.
 *
 *    c) alloc_resource - called during device activation; if you need
 *       any device resource (a list of frags, for example) for your
 *       new QP, this is the right place to allocate it.
 *
 *    d) dealloc_resource - if any resource was allocated dynamically
 *       by the alloc_resource func, destruct it in this func.
 *
 *    e) get_preposted_recv - the function returns a preposted receive for a 'wait task'.
 *
 *    f) If you don't need any of these funcs, init the corresponding pointer to NULL.
 */
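
/*
 * A minimal sketch of step 3 above for a hypothetical
 * MCA_BCOL_IBOFFLOAD_QP_NEW_QP that needs no private resources and
 * reuses the dummy-frag helpers (it mirrors the existing barrier QP
 * setup).  The referenced helpers are static in bcol_iboffload_qp_info.c,
 * so the block is guarded out of compilation and serves only to
 * illustrate the contract described above.
 */
#if 0
static int mca_bcol_iboffload_setup_new_qp(struct mca_bcol_iboffload_qp_info_t *qp_info)
{
    qp_info->config_qp          = mca_bcol_iboffload_regular_qp_attr;       /* (a) */
    qp_info->prepost_recv       = mca_bcol_iboffload_dummy_frag_qp_prepost; /* (b) */
    qp_info->alloc_resource     = NULL;                                     /* (c): nothing to allocate */
    qp_info->dealloc_resource   = NULL;                                     /* (d): nothing to destruct */
    qp_info->get_preposted_recv = mca_bcol_iboffload_get_dummy_frag;        /* (e) */

    return OMPI_SUCCESS;
}
#endif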

#ifndef MCA_BCOL_IBOFFLOAD_QP_INFO_H
#define MCA_BCOL_IBOFFLOAD_QP_INFO_H

#include "ompi_config.h"

BEGIN_C_DECLS

/* forward declarations */
struct mca_bcol_iboffload_device_t;
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_qp_info_t;
struct mca_bcol_iboffload_endpoint_t;

/* The list of all the required QPs */
enum {
    MCA_BCOL_IBOFFLOAD_QP_BARRIER,
    MCA_BCOL_IBOFFLOAD_QP_REGULAR,
    MCA_BCOL_IBOFFLOAD_QP_SYNC,
    MCA_BCOL_IBOFFLOAD_QP_CREDIT,
    MCA_BCOL_IBOFFLOAD_QP_LARGE_BUFF,
    MCA_BCOL_IBOFFLOAD_QP_LAST
};

typedef enum {
    MCA_BCOL_IBOFFLOAD_PP_QP,
    MCA_BCOL_IBOFFLOAD_SRQ_QP,
    MCA_BCOL_IBOFFLOAD_XRC_QP
} mca_bcol_iboffload_qp_type_t;

struct mca_bcol_iboffload_pp_qp_info_t {
    int32_t rd_win;
    int32_t rd_rsv;
};
typedef struct mca_bcol_iboffload_pp_qp_info_t mca_bcol_iboffload_pp_qp_info_t;

struct mca_bcol_iboffload_srq_qp_info_t {
    int32_t sd_max;
};
typedef struct mca_bcol_iboffload_srq_qp_info_t mca_bcol_iboffload_srq_qp_info_t;

typedef int (*mca_bcol_iboffload_setup_qps_fn_t) (struct mca_bcol_iboffload_qp_info_t*);
typedef int (*mca_bcol_iboffload_prepost_qps_fn_t)
                (struct mca_bcol_iboffload_endpoint_t *endpoint,
                 int qp_index, int num_to_prepost);

typedef void (*mca_bcol_iboffload_config_qps_fn_t)
                (int qp_index,
                 struct mca_bcol_iboffload_endpoint_t *ep,
                 ompi_common_ofacm_base_qp_config_t *qp_config);

typedef int (*mca_bcol_iboffload_alloc_qps_resource_fn_t)
                (int qp_index,
                 struct mca_bcol_iboffload_device_t *device);

typedef int (*mca_bcol_iboffload_dealloc_qps_resource_fn_t)
                (int qp_index,
                 struct mca_bcol_iboffload_device_t *device);

typedef struct mca_bcol_iboffload_frag_t* (*mca_bcol_iboffload_get_preposted_recv_fn_t)
                (struct mca_bcol_iboffload_endpoint_t *ep, int qp_index);

struct mca_bcol_iboffload_qp_info_t {
    size_t size;

    int32_t rd_num;
    int32_t rd_low;
    int32_t rd_pp_win; /* prepost window = rd_num - rd_low */
    int qp_index;

    mca_bcol_iboffload_qp_type_t type;

    mca_bcol_iboffload_config_qps_fn_t config_qp;
    mca_bcol_iboffload_prepost_qps_fn_t prepost_recv;

    mca_bcol_iboffload_alloc_qps_resource_fn_t alloc_resource;
    mca_bcol_iboffload_dealloc_qps_resource_fn_t dealloc_resource;

    mca_bcol_iboffload_get_preposted_recv_fn_t get_preposted_recv;

    union {
        mca_bcol_iboffload_pp_qp_info_t pp_qp;
        mca_bcol_iboffload_srq_qp_info_t srq_qp;
    } u;
};
typedef struct mca_bcol_iboffload_qp_info_t mca_bcol_iboffload_qp_info_t;

extern mca_bcol_iboffload_setup_qps_fn_t setup_qps_fn[MCA_BCOL_IBOFFLOAD_QP_LAST];

END_C_DECLS

#endif /* MCA_BCOL_IBOFFLOAD_QP_INFO_H */
@ -1,81 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_task.h"

static void task_constructor(mca_bcol_iboffload_task_t *task)
{
    task->frag = NULL;
    task->collfrag = NULL;
    task->endpoint = NULL;
    task->next_task = NULL;

    task->sg_entries = NULL;
    task->sg_entries_num = 0;

    task->task_list = NULL;

    memset(&task->wr, 0, sizeof(task->wr));

    memset(&task->element, 0, sizeof(struct mqe_task));
    memset(&task->task_mqe_qp_entry, 0, sizeof(struct mqe_qp_entry));
}

static void task_destructor(mca_bcol_iboffload_task_t *task)
{
    if (NULL != task->sg_entries) {
        free(task->sg_entries);
    }
}

OBJ_CLASS_INSTANCE(
        mca_bcol_iboffload_task_t,
        ompi_free_list_item_t,
        task_constructor,
        task_destructor);

void
mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_bcol_iboffload_task_t *calc_task =
                        (mca_bcol_iboffload_task_t *) item;

    calc_task->task_list = (ompi_free_list_t *) ctx;

    calc_task->sg_entries_num = 2;
    calc_task->sg_entries = (struct ibv_sge *) malloc(2 * sizeof(struct ibv_sge));
}

void
mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_bcol_iboffload_task_t *iovec_task =
                        (mca_bcol_iboffload_task_t *) item;

    mca_bcol_iboffload_module_t *iboffload_module =
                        (mca_bcol_iboffload_module_t *) ctx;

    int nitems, group_size = iboffload_module->group_size;

    nitems = group_size / 2 + group_size % 2;
    if (nitems > iboffload_module->device->ib_dev_attr.max_sge) {
        nitems = iboffload_module->device->ib_dev_attr.max_sge;
    }

    iovec_task->sg_entries_num = nitems;
    iovec_task->task_list = &iboffload_module->iovec_tasks_free;

    iovec_task->sg_entries = (struct ibv_sge *)
                        malloc(nitems * sizeof(struct ibv_sge));
}
@ -1,613 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_IBOFFLOAD_TASK_H
#define MCA_BCOL_IBOFFLOAD_TASK_H

#include "ompi_config.h"

#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include <infiniband/mqe.h>

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_collreq.h"
#include "bcol_iboffload_endpoint.h"
#include "bcol_iboffload_collfrag.h"

#define SENDWR(task) ((task)->element.post.send_wr)

BEGIN_C_DECLS

/* the mca_bcol_ibv_mwr_task_t name was replaced with mca_bcol_iboffload_task_t */
struct mca_bcol_iboffload_task_t {
    ompi_free_list_item_t super;

    /* pointer to the memory descriptor associated with the task */
    mca_bcol_iboffload_frag_t *frag;

    /* pointer to the bcol descriptor;
     * we need it for send tasks only, because we complete them in an async manner
     */
    mca_bcol_iboffload_collfrag_t *collfrag;

    /* task to be posted */
    struct mqe_task element;

    /* allocated ibv_sge structs array - in a CALC case,
     * for example, it will have two entries.
     */
    struct ibv_sge *sg_entries;

    /* sg_entries array length */
    int sg_entries_num;

    /* Each task is a member of some free list;
       if the pointer is NULL => we assume the task
       is a member of the common task list (tasks_free) */
    ompi_free_list_t *task_list;

    /* Pointer to the next task */
    struct mca_bcol_iboffload_task_t *next_task;

    /* pasha - this is a crappy workaround for the driver interface;
     * the send_wr and recv_wr should be part of mqe_task, not pointers!
     */
    union {
        struct ibv_m_send_wr send_wr;
        struct ibv_recv_wr recv_wr;
    } wr;

    /* If we'll decide to post a task to a different qp */
    struct mqe_qp_entry task_mqe_qp_entry;

    /* Pointer to the endpoint for this task */
    mca_bcol_iboffload_endpoint_t *endpoint;
};
typedef struct mca_bcol_iboffload_task_t mca_bcol_iboffload_task_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_task_t);


/* calc_tasks_free free list init function */
void
mca_bcol_iboffload_calc_task_init(ompi_free_list_item_t* item, void* ctx);

/* iovec_tasks_free free list init function */
void
mca_bcol_iboffload_iovec_task_init(ompi_free_list_item_t* item, void* ctx);

static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_frag_tolist(
                mca_bcol_iboffload_frag_t *frag,
                ompi_free_list_t *list)
{
    if (NULL != frag) {
        mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
        assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != frag->type);

        if (MCA_BCOL_IBOFFLOAD_DUMMY_OWNER != frag->type &&
                0 == frag->ref_counter) {
            if (MCA_BCOL_IBOFFLOAD_BCOL_OWNER == frag->type) {
                OMPI_FREE_LIST_RETURN_MT((&(list[frag->qp_index])),
                                         (ompi_free_list_item_t*) frag);
            } else if (MCA_BCOL_IBOFFLOAD_ML_OWNER == frag->type) {
                OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
                                         (ompi_free_list_item_t*) frag);
            }
        }
    }
}

static inline __opal_attribute_always_inline__ void
mca_bcol_iboffload_return_recv_frags_toendpoint(
                mca_bcol_iboffload_frag_t *frags,
                mca_bcol_iboffload_endpoint_t *ep,
                int qp_index)
{
    mca_bcol_iboffload_frag_t *recv_frag = frags;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    while (NULL != recv_frag) {
        assert(MCA_BCOL_IBOFFLOAD_NONE_OWNER != recv_frag->type);
        if (MCA_BCOL_IBOFFLOAD_ML_OWNER != recv_frag->type) {
            opal_list_prepend(&ep->qps[qp_index].preposted_frags,
                              (opal_list_item_t *) recv_frag);
        } else {
            OMPI_FREE_LIST_RETURN_MT((&(cm->ml_frags_free)),
                                     (ompi_free_list_item_t*) recv_frag);
        }

        recv_frag = recv_frag->next;
    }
}

/* Wait task allocation and initialization */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_wait_task(mca_bcol_iboffload_module_t *iboffload,
                                 uint32_t source, int num_waits,
                                 mca_bcol_iboffload_frag_t *frags,
                                 int qp_index, struct ibv_qp *qp)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_task_t *task;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[source];

    /* blocking allocation for send fragment */
    OMPI_FREE_LIST_GET_MT(&cm->tasks_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        mca_bcol_iboffload_return_recv_frags_toendpoint(frags, endpoint, qp_index);
        return NULL;
    }

    task = (mca_bcol_iboffload_task_t *) item;
    /* set pointer to the corresponding recv fragment */
    IBOFFLOAD_SET_FRAGS_ON_TASK(frags, task);

    task->next_task = NULL;
    task->endpoint = endpoint;

    /* set opcode */
    task->element.opcode = MQE_WR_CQE_WAIT;
    task->element.flags = 0; /* Any flag may go here; the driver ignores it anyway */
    /* set task id */
    task->element.wr_id = (uint64_t) (uintptr_t) task;
    /* set CQ */
    task->element.wait.cq = endpoint->qp_config.init_attr[qp_index].recv_cq;

    /* set the number of completions to wait for */
    task->element.wait.count = num_waits;
    /* set pointer to QP */

    if (NULL == qp) { /* NULL means use MQ's QP */
        task->element.wait.mqe_qp = NULL;
    } else { /* Post the wait to the SQ of this QP */
        task->task_mqe_qp_entry.next = NULL;
        task->task_mqe_qp_entry.qp = qp;

        task->element.wait.mqe_qp = &task->task_mqe_qp_entry;
    }

    IBOFFLOAD_VERBOSE(10, ("Allocating task %p, cq: %p, num waits: %d, qp_index - %d, "
                           "destination %d for comm rank: %d.\n",
                           (void *) task, (void *) task->element.wait.cq,
                           task->element.wait.count, qp_index, source,
                           endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));
    return task;
}
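
/*
 * A minimal usage sketch (illustrative, not from the original file): the
 * point of the MQE task chain is that a CQE-wait task can gate a send
 * task, so the HCA forwards data only after `num_waits' receive
 * completions, with no CPU involvement.  The helper below is hypothetical;
 * the final post of the chain to the module's management queue (MQ) is
 * driver-specific and elided here.
 */
#if 0 /* illustration only */
static void example_wait_then_forward(mca_bcol_iboffload_module_t *iboffload,
                                      mca_bcol_iboffload_frag_t *recv_frags,
                                      mca_bcol_iboffload_frag_t *send_frag,
                                      mca_bcol_iboffload_collfrag_t *collfrag,
                                      uint32_t parent, uint32_t child,
                                      int qp_index)
{
    /* 1. wait for one completion on the recv CQ of (parent, qp_index) */
    mca_bcol_iboffload_task_t *wait =
        mca_bcol_iboffload_get_wait_task(iboffload, parent, 1,
                                         recv_frags, qp_index, NULL);

    /* 2. forward the received data to the child (helper defined below) */
    mca_bcol_iboffload_task_t *send =
        mca_bcol_iboffload_get_send_task(iboffload, child, qp_index,
                                         send_frag, collfrag, true);

    /* 3. chain them; the MQ executes the list in order */
    if (NULL != wait && NULL != send) {
        wait->next_task = send;
    }

    /* 4. post the chain to the MQ (driver call elided) */
}
#endif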

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_prepare_send_task(
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, ompi_free_list_t *task_list,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_task_t *task;

    IBOFFLOAD_VERBOSE(10, ("Destination rank - %d, QP index - %d, "
                           "for comm rank - %d\n", endpoint->index, qp_index,
                           endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));

    /* get an item from the free list */
    OMPI_FREE_LIST_GET_MT(task_list, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        return NULL;
    }

    task = (mca_bcol_iboffload_task_t*) item;
    task->endpoint = endpoint;

    ++(collfrag->n_sends);
    task->collfrag = collfrag;

    task->next_task = NULL;
    task->element.wr_id = (uint64_t) (uintptr_t) task;

    task->element.post.qp = endpoint->qps[qp_index].qp->lcl_qp;

    task->element.opcode = MQE_WR_SEND;

    /* define the send work request */
    SENDWR(task) = &(task->wr.send_wr);

    SENDWR(task)->next = NULL;

    SENDWR(task)->wr_id = (uint64_t) (uintptr_t) collfrag;
    IBOFFLOAD_VERBOSE(10, ("coll_frag - %p.\n", collfrag));

    /* Always send IMM on sends! */
    task->element.flags = MQE_WR_FLAG_IMM_EXE;

    /* Always signal completion */
    SENDWR(task)->send_flags = IBV_SEND_SIGNALED;

    return task;
}

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_task(
                mca_bcol_iboffload_module_t *iboffload,
                uint32_t destination, int qp_index,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_collfrag_t *collfrag,
                bool enable_inline)
{
    mca_bcol_iboffload_task_t *task;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];

    IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n",
                           qp_index));

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    /* We cannot do a 0-byte send, but we can do a 0-byte RDMA with immediate */
    if (0 == frag->sg_entry.length) {
        SENDWR(task)->imm_data = 0;
        SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;

        SENDWR(task)->wr.rdma.rkey = endpoint->remote_zero_rdma_addr.rkey;
        SENDWR(task)->wr.rdma.remote_addr = endpoint->remote_zero_rdma_addr.addr;
    } else {
        SENDWR(task)->opcode = IBV_WR_SEND;
    }

    /* single sge */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = &(frag->sg_entry);

    /* Use an inline send when it is possible */
    if (enable_inline &&
            frag->sg_entry.length < cm->max_inline_data) {
        IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
        SENDWR(task)->send_flags |= IBV_SEND_INLINE;
    }

    return task;
}

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_send_vec_task(
                mca_bcol_iboffload_module_t *iboffload,
                uint32_t destination, int qp_index,
                size_t nitems,
                struct iovec *buff_iovec,
                uint32_t lkey,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_collfrag_t *collfrag,
                bool enable_inline)
{
    mca_bcol_iboffload_task_t *task;
    int i;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = iboffload->endpoints[destination];

    IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_get_send_task qp_index %d\n",
                           qp_index));

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &iboffload->iovec_tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    /* We cannot do a 0-byte send, but we can do a 0-byte RDMA with immediate */
    SENDWR(task)->opcode = IBV_WR_SEND;

    assert (task->sg_entries != NULL);

    for (i = 0; (size_t) i < nitems; ++i){
        task->sg_entries[i].length = buff_iovec[i].iov_len;
        task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
        task->sg_entries[i].lkey = lkey;
    }

    /* multiple sge */
    SENDWR(task)->num_sge = nitems;
    SENDWR(task)->sg_list = (task->sg_entries);

    /* Use an inline send when it is possible */
    if (enable_inline &&
            frag->sg_entry.length < cm->max_inline_data) {
        IBOFFLOAD_VERBOSE(10, ("Setting inline for len %d\n", frag->sg_entry.length));
        SENDWR(task)->send_flags |= IBV_SEND_INLINE;
    }

    return task;
}

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_vec_task(
                uint32_t destination, size_t offset, size_t nitems,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload,
                struct iovec *buff_iovec, uint32_t lkey,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    int i;
    mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
        iboffload->endpoints[destination];

    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
                                                coll_request->qp_index,
                                                &iboffload->iovec_tasks_free,
                                                collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    SENDWR(task)->imm_data = 0;
    SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;

    SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);

    for (i = 0; (size_t) i < nitems; ++i){
        task->sg_entries[i].length = buff_iovec[i].iov_len;
        task->sg_entries[i].addr = (uint64_t) buff_iovec[i].iov_base;
        task->sg_entries[i].lkey = lkey;
    }

    /* multiple sge */
    SENDWR(task)->num_sge = nitems;
    SENDWR(task)->sg_list = (task->sg_entries);

    IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
    return task;
}

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_task(
                uint32_t destination, size_t offset,
                mca_bcol_iboffload_frag_t *frag,
                mca_bcol_iboffload_module_t *iboffload,
                mca_bcol_iboffload_collfrag_t *collfrag)
{
    mca_bcol_iboffload_collreq_t *coll_request = collfrag->coll_full_req;

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
        iboffload->endpoints[destination];

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint,
                                                coll_request->qp_index,
                                                &cm->tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    /* no support for multiple frags */
    IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);

    SENDWR(task)->imm_data = 0;
    SENDWR(task)->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
    /* Pasha: I am really not happy with the way we calculate remote addresses.
       Why don't we use rbuf + offset? */
    SENDWR(task)->wr.rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
    /* single sge */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = &(frag->sg_entry);

    IBOFFLOAD_VERBOSE(10, ("The remote offset %ld \n", offset));
    return task;
}

/* Pasha: hacked-up version of the calc operation */
static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_calc_task(mca_bcol_iboffload_module_t *iboffload,
        uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
        struct ibv_sge *l_operand, struct ibv_sge *r_operand,
        mca_bcol_iboffload_collreq_t *coll_request,
        bool enable_inline)
/* Some specifications for this function:
 * 1) We assume that the lengths of the two operands (ibv_sge structs) are the same.
 * 2) Possibly we use the results (ibv_sge structs) from previous
 *    calc operations => the frag pointer may be NULL.
 */
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
        iboffload->endpoints[destination];

    mca_bcol_iboffload_collfrag_t *collfrag =
        (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->calc_tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    if (NULL != frag) {
        IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
    } else {
        task->frag = NULL;
    }

    task->sg_entries[0] = *l_operand;
    task->sg_entries[1] = *r_operand;

    SENDWR(task)->num_sge = 2;
    SENDWR(task)->sg_list = task->sg_entries;

    SENDWR(task)->opcode = MCA_BCOL_IBOFFLOAD_SEND_CALC;
#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
    SENDWR(task)->wr.calc_send.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc_send.calc_op = coll_request->actual_ib_op;
#else
    SENDWR(task)->wr.calc.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc.calc_op = coll_request->actual_ib_op;
#endif

    return task;
}
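
/*
 * Summary note (added for clarity, not text from the original file): CALC
 * tasks are what makes the offloaded reduction possible.  The two sg
 * entries set above are the left and right operands; the HCA applies
 * coll_request->actual_ib_op on coll_request->actual_ib_dtype to them
 * while sending, so partial reduction results are computed on the NIC
 * instead of the host CPU.  A worked step of a two-rank allreduce:
 *
 *   l_operand -> my local buffer,  r_operand -> the peer's arrived data;
 *   the CALC send ships op(l, r) to the peer in a single work request.
 */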

static inline __opal_attribute_always_inline__ mca_bcol_iboffload_task_t*
mca_bcol_iboffload_get_rdma_calc_task(mca_bcol_iboffload_module_t *iboffload,
        uint32_t destination, int qp_index, mca_bcol_iboffload_frag_t *frag,
        struct ibv_sge *l_operand, struct ibv_sge *r_operand,
        mca_bcol_iboffload_collreq_t *coll_request,
        size_t offset)
/* Some specifications for this function:
 * 1) We assume that the lengths of the two operands (ibv_sge structs) are the same.
 * 2) Possibly we use the results (ibv_sge structs) from previous
 *    calc operations => the frag pointer may be NULL.
 */
{
    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_endpoint_t *endpoint =
        iboffload->endpoints[destination];

    mca_bcol_iboffload_collfrag_t *collfrag =
        (mca_bcol_iboffload_collfrag_t *)
        opal_list_get_last(&coll_request->work_requests);

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    task = mca_bcol_iboffload_prepare_send_task(iboffload, endpoint, qp_index,
                                                &cm->calc_tasks_free, collfrag);
    if (OPAL_UNLIKELY(NULL == task)) {
        mca_bcol_iboffload_return_frag_tolist(frag, iboffload->device->frags_free);
        return NULL;
    }

    if (NULL != frag) {
        IBOFFLOAD_SET_SINGLE_FRAG_ON_TASK(frag, task);
    } else {
        task->frag = NULL;
    }

    task->sg_entries[0] = *l_operand;

    /* Hack - we don't really use it.
    task->sg_entries[1] = *r_operand;
    */
    /* We use only a single entry
    SENDWR(task)->num_sge = 2;
    */
    SENDWR(task)->num_sge = 1;
    SENDWR(task)->sg_list = task->sg_entries;

#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
    SENDWR(task)->opcode = IBV_M_WR_CALC_RDMA_WRITE_WITH_IMM;
    SENDWR(task)->wr.calc_rdma.data_type = coll_request->actual_ib_dtype;
    SENDWR(task)->wr.calc_rdma.calc_op = coll_request->actual_ib_op;
    SENDWR(task)->wr.calc_rdma.rkey = endpoint->remote_rdma_block.ib_info.rkey;
    SENDWR(task)->wr.calc_rdma.remote_addr = (uint64_t) (uintptr_t)
        ((unsigned char *) endpoint->remote_rdma_block.rdma_desc[coll_request->ml_buffer_index].data_addr + offset);
#else
    IBOFFLOAD_ERROR(("Fatal error: RDMA CALC was called, but the driver does not support this operation"));
    return NULL;
#endif

    return task;
}

static inline __opal_attribute_always_inline__
int release_frags_on_task(mca_bcol_iboffload_task_t *task,
                          ompi_free_list_t *list)
{
    int rc, qp_index;

    mca_bcol_iboffload_frag_t *temp_frag = task->frag;
    mca_bcol_iboffload_endpoint_t *endpoint = task->endpoint;

    mca_bcol_iboffload_component_t *cm =
        &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("\nCalling release_frags_on_task"));

    while (NULL != temp_frag) {
        qp_index = temp_frag->qp_index;

        --(temp_frag->ref_counter);

        /* Return credits */
        if (MQE_WR_CQE_WAIT == task->element.opcode) {
            ++(endpoint->qps[qp_index].rd_wqe);

            IBOFFLOAD_VERBOSE(10, ("Return rd_wqe %d pp_win %d",
                                   endpoint->qps[qp_index].rd_wqe,
                                   cm->qp_infos[qp_index].rd_pp_win));

            /* Call for recv prepost */
            if (endpoint->qps[qp_index].rd_wqe >=
                    cm->qp_infos[qp_index].rd_pp_win) {
                IBOFFLOAD_VERBOSE(10, ("Prepost to endpoint->index - %d, qp_index - %d", endpoint->index, qp_index));
                rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
                        endpoint->qps[qp_index].rd_wqe);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                    IBOFFLOAD_ERROR(("QP %d: failed to prepost.\n", qp_index));
                    return OMPI_ERROR;
                }
                /* What happens if we cannot prepost? */
            }
        } else if (MQE_WR_SEND == task->element.opcode) {
            ++(endpoint->qps[qp_index].sd_wqe);

            assert(endpoint->qps[qp_index].sd_wqe <= cm->qp_infos[qp_index].rd_num);

            IBOFFLOAD_VERBOSE(10, ("Return sd_wqe %d, qp_index - %d, endpoint - %p",
                                   endpoint->qps[qp_index].sd_wqe, qp_index, endpoint));
        } else {
            /* We should not arrive at this case */
            IBOFFLOAD_ERROR(("Unsupported operation"));

            return OMPI_ERROR;
        }

        mca_bcol_iboffload_return_frag_tolist(temp_frag, list);
        temp_frag = temp_frag->next;
    }

    return OMPI_SUCCESS;
}
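
/*
 * Worked example of the credit flow above (the window value is
 * hypothetical, not from the original file): suppose rd_pp_win for some
 * qp_index is 8.  Each completed wait task returns one receive-WQE credit
 * (rd_wqe); once at least 8 credits have accumulated,
 * mca_bcol_iboffload_prepost_recv() reposts that many receive buffers in a
 * single batch, amortizing the posting cost over the window.
 */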

END_C_DECLS

#endif
@ -1,40 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2015      Research Organization for Information Science
#                         and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# MCA_ompi_bcol_iboffload_CONFIG([should_build])
# ------------------------------------------
# AC_DEFUN([MCA_ompi_bcol_iboffload_POST_CONFIG], [
# ])


# MCA_ompi_bcol_iboffload_CONFIG([action-if-can-compile],
#                                [action-if-cant-compile])
# ------------------------------------------------
AC_DEFUN([MCA_ompi_bcol_iboffload_CONFIG],[
    AC_CONFIG_FILES([ompi/mca/bcol/iboffload/Makefile])
    bcol_ofa_happy="no"
    bcol_mlnx_ofed_happy="no"

    OPAL_CHECK_OPENFABRICS([bcol_iboffload], [bcol_ofa_happy="yes"])
    OPAL_CHECK_MLNX_OPENFABRICS([bcol_iboffload], [bcol_mlnx_ofed_happy="yes"])

    AS_IF([test "$bcol_ofa_happy" = "yes" && test "$bcol_mlnx_ofed_happy" = "yes"],
          [$1],
          [$2])

    # substitute in the things needed to build iboffload
    AC_SUBST([bcol_iboffload_CFLAGS])
    AC_SUBST([bcol_iboffload_CPPFLAGS])
    AC_SUBST([bcol_iboffload_LDFLAGS])
    AC_SUBST([bcol_iboffload_LIBS])
])dnl
@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained
@ -1,57 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2013 Mellanox Technologies. All rights reserved.
# Copyright (c) 2013      Los Alamos National Security, LLC. All rights
#                         reserved.
# Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

sources = \
    bcol_ptpcoll.h \
    bcol_ptpcoll_utils.h \
    bcol_ptpcoll_utils.c \
    bcol_ptpcoll_mca.h \
    bcol_ptpcoll_mca.c \
    bcol_ptpcoll_barrier.c \
    bcol_ptpcoll_bcast.c \
    bcol_ptpcoll_bcast.h \
    bcol_ptpcoll_component.c \
    bcol_ptpcoll_fanin.c \
    bcol_ptpcoll_fanout.c \
    bcol_ptpcoll_module.c \
    bcol_ptpcoll_allreduce.h \
    bcol_ptpcoll_allreduce.c \
    bcol_ptpcoll_reduce.h \
    bcol_ptpcoll_reduce.c \
    bcol_ptpcoll_allgather.c


# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

component_noinst =
component_install =
if MCA_BUILD_ompi_bcol_ptpcoll_DSO
component_install += mca_bcol_ptpcoll.la
else
component_noinst += libmca_bcol_ptpcoll.la
endif

# See ompi/mca/btl/sm/Makefile.am for an explanation of
# libmca_common_sm.la.

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_bcol_ptpcoll_la_SOURCES = $(sources)
mca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version
mca_bcol_ptpcoll_la_LIBADD =

noinst_LTLIBRARIES = $(component_noinst)
libmca_bcol_ptpcoll_la_SOURCES = $(sources)
libmca_bcol_ptpcoll_la_LDFLAGS = -module -avoid-version
@ -1,474 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_EXPORT_H
#define MCA_BCOL_PTPCOLL_EXPORT_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/patterns/net/netpatterns.h"

BEGIN_C_DECLS

#ifdef HAVE_SCHED_YIELD
# include <sched.h>
# define SPIN sched_yield()
#else /* no switch available */
# define SPIN
#endif

/**
 * Structure to hold the basic ptpcoll component.  First it holds the
 * base coll component, and then holds a bunch of
 * ptpcoll-component-specific stuff (e.g., current MCA param
 * values).
 */
struct mca_bcol_ptpcoll_component_t {
    /** Base coll component */
    mca_bcol_base_component_2_0_0_t super;
    /** Verbosity level, used only in debug-enabled builds */
    int verbose;
    /** The radix of the K-nomial tree, initialized by an MCA parameter */
    int k_nomial_radix;
    /** The radix of the narray tree, initialized by an MCA parameter */
    int narray_radix;
    /** The radix used for narray scatter and knomial gather for
        large-message bcast **/
    int narray_knomial_radix;
    /** Number of times to poll for a specific tag/src */
    int num_to_probe;
    /*
     * bcast small-messages algorithm
     * 1 - Knomial bcast
     * 2 - Narray bcast
     */
    int bcast_small_messages_known_root_alg;
    /*
     * bcast large-messages algorithm
     * 1 - binomial scatter-gather
     * 2 - Narray scatter, knomial gather
     */
    int bcast_large_messages_known_root_alg;
    /*
     * barrier algorithm
     * 1 - recursive doubling
     * 2 - recursive K-ing
     */
    int barrier_alg;

    int use_brucks_smsg_alltoall_rdma;
};

struct mca_bcol_ptpcoll_collreq_t {
    opal_free_list_item_t super;

    int tag;
    int num_reqs;
    int exchange;

    int need_toserv_extra;
    int extra_partner_rank;

    ompi_request_t **requests;
};
typedef struct mca_bcol_ptpcoll_collreq_t mca_bcol_ptpcoll_collreq_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_collreq_t);

/**
 * Convenience typedef
 */
typedef struct mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component_t;

/* Bcast small messages,
   known-root algorithm */
enum {
    PTPCOLL_KNOMIAL = 1,
    PTPCOLL_NARRAY
};

/* Bcast large messages,
   known-root algorithm */
enum {
    PTPCOLL_BINOMIAL_SG = 1,    /* Binomial scatter-gather */
    PTPCOLL_NARRAY_KNOMIAL_SG   /* Narray-Knomial scatter-gather */
};
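
/*
 * Added context (a standard analysis, not text from this file): for an
 * m-byte broadcast over n ranks, a plain binomial tree makes the root send
 * m * log2(n) bytes, while the scatter-gather schemes above move only
 * about 2 * m * (n - 1) / n bytes per rank -- far less for large m, which
 * is why they are used for the large-message path.
 */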

/*
 * Implemented function index list
 */

/* barrier */
enum{
    FANIN_FAN_OUT_BARRIER_FN,
    RECURSIVE_DOUBLING_BARRIER_FN,
    N_BARRIER_FNS
};

/* reduce */
enum{
    FANIN_REDUCE_FN,
    REDUCE_SCATTER_GATHER_FN,
    N_REDUCE_FNS
};
enum{
    SHORT_DATA_FN_REDUCE,
    LONG_DATA_FN_REDUCE,
    N_REDUCE_FNS_USED
};

/* all-reduce */
enum{
    FANIN_FANOUT_ALLREDUCE_FN,
    REDUCE_SCATTER_ALLGATHER_FN,
    N_ALLREDUCE_FNS
};
enum{
    SHORT_DATA_FN_ALLREDUCE,
    LONG_DATA_FN_ALLREDUCE,
    N_ALLREDUCE_FNS_USED
};


/*
 * N-order tree node description
 */
struct tree_node_t {
    /* my rank within the group */
    int my_rank;
    /* my node type - root, leaf, or interior */
    int my_node_type;
    /* number of nodes in the tree */
    int tree_size;
    /* number of parents (0/1) */
    int n_parents;
    /* number of children */
    int n_children;
    /* parent rank within the group */
    int parent_rank;
    /* children ranks within the group */
    int *children_ranks;
};
typedef struct tree_node_t tree_node_t;

struct pair_exchange_node_t {

    /* number of nodes this node will exchange data with */
    int n_exchanges;

    /* ranks of nodes involved in the data exchange */
    int *rank_exchanges;

    /* number of extra sources of data - outside the largest power of 2 in
     * this group */
    int n_extra_sources;

    /* rank of the extra source */
    int rank_extra_source;

    /* number of tags needed per stripe */
    int n_tags;

    /* log 2 of the largest full power of 2 for this node set */
    int log_2;

    /* largest power of 2 that fits in this group */
    int n_largest_pow_2;

    /* node type */
    int node_type;

};
typedef struct pair_exchange_node_t pair_exchange_node_t;
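
/*
 * Worked example for the fields above (added for clarity): in a group of
 * n = 9 ranks the largest power of two that fits is 8, so
 *
 *   n_largest_pow_2 = 8;   log_2 = 3;   n_extra_sources = 9 - 8 = 1;
 *
 * the one "extra" rank hands its data to a proxy inside the power-of-two
 * set before the 3 pairwise exchange steps and receives the final result
 * back afterwards.
 */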

/*
 * Barrier request objects
 */

/* enum for the phase the nb barrier is in */
enum{
    NB_BARRIER_INACTIVE,
    NB_BARRIER_FAN_IN,
    NB_BARRIER_FAN_OUT,
    /* done and not started are the same for all practical
     * purposes, as the init function always sets this flag
     */
    NB_BARRIER_DONE
};

typedef enum {
    PTPCOLL_NOT_STARTED        = 1,
    PTPCOLL_WAITING_FOR_DATA   = 1 << 1,
    PTPCOLL_SCATTER_STARTED    = 1 << 2,
    PTPCOLL_GATHER_STARTED     = 1 << 3,
    PTPCOLL_EXTRA_SEND_STARTED = 1 << 4,
    PTPCOLL_ROOT_SEND_STARTED  = 1 << 5
} ptpcoll_op_status;

struct mca_bcol_ptpcoll_ml_buffer_desc_t {
    void *data_addr;           /* buffer address */
    uint64_t bank_index;       /* my bank */
    uint64_t buffer_index;     /* my buff index */
    int active_requests;       /* keep number of active requests */
    ompi_request_t **requests; /* caching pointers to requests */
    int data_src;              /* used for bcast to cache internal data */
    int radix_mask;            /* used for bcast to cache internal data */
    int radix_mask_pow;        /* used for bcast to cache internal data */
    int iteration;             /* buffer iteration in knomial, binomial, etc. algorithms */
    int tag;                   /* tag number that is attached to this operation */
    int status;                /* operation status */
    /* Fixme: Probably we can get rid of these fields by redesigning
     * the reduce implementation
     */
    int reduction_status;      /* used for reduction to cache the internal
                                  reduction status */
    bool reduce_init_called;
};
typedef struct mca_bcol_ptpcoll_ml_buffer_desc_t mca_bcol_ptpcoll_ml_buffer_desc_t;

/*
 * Information that we need to keep in order to access and
 * track the local ML memory that is used as source and destination
 * for collective operations
 */
struct mca_bcol_ptpcoll_local_mlmem_desc_t {
    /* Bank index to release */
    uint32_t bank_index_for_release;
    /* number of memory banks */
    uint32_t num_banks;
    /* number of buffers per bank */
    uint32_t num_buffers_per_bank;
    /* size of a payload buffer */
    uint32_t size_buffer;
    /* pointer to the initialized buffer descriptors */
    mca_bcol_ptpcoll_ml_buffer_desc_t *ml_buf_desc;
};
typedef struct mca_bcol_ptpcoll_local_mlmem_desc_t mca_bcol_ptpcoll_local_mlmem_desc_t;
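
/*
 * A minimal sketch (added; the exact scheme lives in the ML layer, so the
 * flat layout assumed here is an illustration, not a guarantee): with
 * num_banks banks of num_buffers_per_bank buffers each, a descriptor is
 * typically addressed as
 *
 *   desc = &ml_mem->ml_buf_desc[bank * ml_mem->num_buffers_per_bank + buf];
 */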

typedef enum {
    PTPCOLL_PROXY       = 1,
    PTPCOLL_IN_GROUP    = 1 << 1,
    PTPCOLL_EXTRA       = 1 << 2,
    PTPCOLL_KN_PROXY    = 1 << 3,
    PTPCOLL_KN_IN_GROUP = 1 << 4,
    PTPCOLL_KN_EXTRA    = 1 << 5
} node_type_pow2;
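
/*
 * Worked example for the role flags above (added): in a 9-rank group with
 * pow_2num = 8, the single rank outside the power-of-two set is marked
 * PTPCOLL_EXTRA and its partner inside the set is PTPCOLL_PROXY; everyone
 * else is PTPCOLL_IN_GROUP.  The KN_* variants play the same roles for the
 * radix-k (k-nomial) algorithms, where one proxy can serve up to k - 1
 * extra ranks.
 */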

struct mca_bcol_ptpcoll_module_t {
    /* base structure */
    mca_bcol_base_module_t super;

    /* size */
    int group_size;

    /* size of each memory segment */
    size_t segment_size;

    /* k_nomial radix */
    int k_nomial_radix;
    /* caching the power of K, for K-nomial operations */
    int pow_k;
    /* caching the power-of-K number that is smaller than or equal to the group size */
    int pow_knum;
    /* caching the power of 2; it is a special case for some algorithms */
    int pow_2;
    /* caching the power-of-2 number that is closest to the group size */
    int pow_2num;
    /* type of this node in a power-of-2 group */
    int pow_2type;
    /* type of this node in a K-nomial tree group */
    int pow_ktype;
    /* type of this node in a narray tree group */
    int narray_type;
    /* size of the full narray tree */
    int full_narray_tree_size;
    /* num leafs on the last level */
    int full_narray_tree_num_leafs;

    /* Nary tree info */
    netpatterns_tree_node_t *narray_node;

    /* if the rank is in the group, it keeps the extra peer;
       if the rank is extra, it keeps the proxy peer.
     */
    int proxy_extra_index;     /* pow2 algorithm */
    int *kn_proxy_extra_index; /* K-nomial algorithm */
    int kn_proxy_extra_num;    /* number of extra peers, maximum k - 1 */

    /* collective tag */
    long long collective_tag;

    /* tag mask - the pml has a limit on tag size, so we need
     * to wrap around
     */
    uint64_t tag_mask;

    /* Caching information about local ml memory.
     * Since ptpcoll does not support RDMA operations over the pml,
     * we don't need to keep any information about remote buffers
     */
    mca_bcol_ptpcoll_local_mlmem_desc_t ml_mem;


    /* Narray-Knomial scatter-gather */

    /* list of extra indexes */
    int *narray_knomial_proxy_extra_index;
    /* number of extra peers, maximum k - 1 */
    int narray_knomial_proxy_num;
    /* Narray-Knomial node information array */
    netpatterns_narray_knomial_tree_node_t *narray_knomial_node;
    /* Knomial exchange tree */
    netpatterns_k_exchange_node_t knomial_exchange_tree;
    /* knomial allgather tree --- Do not disable; we need both:
       different algorithms define recursive k-ing differently
     */
    netpatterns_k_exchange_node_t knomial_allgather_tree;

    /* Knomial allgather offsets */
    int **allgather_offsets;

    /* Free lists of outstanding collective operations */
    opal_free_list_t collreqs_free;

    int log_group_size;
    struct iovec *alltoall_iovec;
};

typedef struct mca_bcol_ptpcoll_module_t mca_bcol_ptpcoll_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_module_t);


/**
 * Global component instance
 */
OMPI_MODULE_DECLSPEC extern mca_bcol_ptpcoll_component_t
mca_bcol_ptpcoll_component;


/*
 * coll module functions
 */

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_bcol_ptpcoll_init_query(bool enable_progress_threads,
        bool enable_mpi_threads);

/* query to see if the module is available for use on the given
 * communicator, and if so, what its priority is.
 */
mca_bcol_base_module_t **
mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);

/* interface function to set up the recursive k-ing tree */
int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super);

/* barrier routines */
int bcol_ptpcoll_barrier_recurs_dbl(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_barrier_recurs_knomial(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super);
int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super);
void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment,
        struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment,
        struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_deregister_memory( void * in_ptr,
        struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_free_memory(void *ptr,
        struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_fanin( bcol_function_args_t *input_args,
        struct mca_bcol_base_module_t *module);
int bcol_ptpcoll_fanout( bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);


/* allgather routine */
int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);

/* allgather progress */
int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args);
/* allgather register */
int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super);

static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_for_match(ompi_request_t **request, int *rc)
{
    int matched = 0;
    int i;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    *rc = OMPI_SUCCESS;

    for (i = 0; i < cm->num_to_probe &&
            0 == matched && OMPI_SUCCESS == *rc; i++) {
        *rc = ompi_request_test(request, &matched, MPI_STATUS_IGNORE);
    }

    return matched;
}

static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_all_for_match(int *n_requests, ompi_request_t **requests, int *rc)
{
    int matched = 0;
    int i;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    *rc = OMPI_SUCCESS;

    assert(*n_requests >= 0);

    if (0 == *n_requests) {
        return 1;
    }

    for (i = 0; i < cm->num_to_probe &&
            0 == matched && OMPI_SUCCESS == *rc; i++) {
        *rc = ompi_request_test_all
            (*n_requests, requests, &matched, MPI_STATUS_IGNORE);
    }

    if (matched) {
        *n_requests = 0;
    }

    return matched;
}

/* Some negative tags are already used by OMPI, so make sure that we take a safe offset */
#define PTPCOLL_TAG_OFFSET 100
#define PTPCOLL_TAG_FACTOR 2
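
/*
 * How these constants are used by the collective routines (added note,
 * matching the tag computation in the allgather code): the per-operation
 * tag is derived from the collective sequence number, masked down to what
 * the pml can carry, and then negated so it cannot collide with
 * user-level (non-negative) tags:
 *
 *   tag = (PTPCOLL_TAG_OFFSET + sequence_num * PTPCOLL_TAG_FACTOR)
 *         & ptpcoll_module->tag_mask;
 *   tag = -tag;
 */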

static inline int lognum(int n){
    int count = 1, lognum = 0;

    while (count < n) {
        count = count << 1;
        lognum++;
    }
    return lognum;
}
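
/*
 * Added note: lognum() returns the ceiling of log2(n), i.e. the number of
 * doublings needed to reach at least n.  Worked examples:
 *
 *   lognum(8) == 3   (1 -> 2 -> 4 -> 8)
 *   lognum(9) == 4   (1 -> 2 -> 4 -> 8 -> 16)
 */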

END_C_DECLS

#endif /* MCA_BCOL_PTPCOLL_EXPORT_H */
@ -1,605 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_allreduce.h"
/*
 * Recursive K-ing allgather
 */

/*
 *
 * Recursive k-ing algorithm
 * Example: k=3, n=9
 *
 *
 * Number of exchange steps = log_k(n)
 * Number of exchanges within each step = k - 1 (one per partner)
 *
 */
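
/*
 * Worked numbers for the example above (added): with k = 3 and n = 9 there
 * are log_3(9) = 2 exchange steps; in each step a rank exchanges with
 * k - 1 = 2 partners, so the data each rank holds grows by a factor of k
 * per step: 1 block -> 3 blocks -> 9 blocks.
 */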

int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    /* local variables */

    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int group_size = ptpcoll_module->group_size;
    int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */

    int tag;
    int i, j;
    int knt;
    int comm_src, comm_dst, src, dst;
    int recv_offset, recv_len;
    int send_offset, send_len;

    uint32_t buffer_index = input_args->buffer_index;
    int pow_k, tree_order;
    int rc = OMPI_SUCCESS;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int completed = 0; /* initialized */
    void *data_buffer = (void*)(
            (unsigned char *) input_args->sbuf +
            (size_t) input_args->sbuf_offset);
    int pack_len = input_args->count * input_args->dtype->super.size;

#if 0
    fprintf(stderr,"entering p2p allgather pack_len %d. exchange node: %p\n",pack_len, exchange_node);
#endif
    /* initialize the iteration counter */
    int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
    *iteration = 0;

    /* reset the active-request counter */
    *active_requests = 0;

    /* keep the tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflicts with user-level tags */
    tag = -tag;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;


    /* let's begin the collective, starting with the extra ranks and their
     * respective proxies
     */
    if( EXTRA_NODE == exchange_node->node_type ) {

        /* then I will send to my proxy rank */
        dst = exchange_node->rank_extra_sources_array[0];
        /* find the rank in the communicator */
        comm_dst = group_list[dst];
        /* now I need to calculate my own offset */
        knt = 0;
        for (i = 0 ; i < my_group_index; i++){
            knt += list_connected[i];
        }

        /* send the data to my proxy */
        rc = MCA_PML_CALL(isend((void *) ( (unsigned char *) data_buffer +
                        knt*pack_len),
                    pack_len * list_connected[my_group_index],
                    MPI_BYTE,
                    comm_dst, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[*active_requests])));

        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10,("Failed to isend data"));
            return OMPI_ERROR;
        }
        ++(*active_requests);

        /* now I go ahead and post the receive from my proxy */
        comm_src = comm_dst;
        knt = 0;
        for( i = 0; i < group_size; i++){
            knt += list_connected[i];
        }
        rc = MCA_PML_CALL(irecv(data_buffer,
                    knt * pack_len,
                    MPI_BYTE,
                    comm_src,
                    tag, comm, &(requests[*active_requests])));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to post ireceive"));
            return OMPI_ERROR;
        }

        ++(*active_requests);
        /* poll for completion */
        /* this polls internally */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if(completed){
            /* go to buffer release */
            goto FINISHED;
        }else{
            /* save state and hop out;
             * nothing to save here
             */
            return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
        }
    }else if ( 0 < exchange_node->n_extra_sources ) {

        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        /* find the rank in the communicator */
        comm_src = group_list[src];
        knt = 0;
        for(i = 0; i < src; i++){
            knt += list_connected[i];
        }
        /* post the receive */
        rc = MCA_PML_CALL(irecv((void *) ( (unsigned char *) data_buffer
                        + knt*pack_len),
                    pack_len * list_connected[src],
                    MPI_BYTE,
                    comm_src,
                    tag, comm, &(requests[*active_requests])));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to post ireceive"));
            return OMPI_ERROR;
        }

        ++(*active_requests);
        /* poll for completion */
        /* this routine polls internally */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if(!completed){
            /* save state and hop out.
             * We really do need to block here, so set
             * the iteration to -1, indicating that we need to
             * finish this part first
             */
            *iteration = -1;
            return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
        }

    }

    /* we start the recursive k-ing phase */
    /* fprintf(stderr,"tree order %d pow_k %d \n",tree_order,pow_k);*/
    for( i = 0; i < pow_k; i++) {
        for(j = 0; j < (tree_order - 1); j++) {

            /* send phase */
            dst = exchange_node->rank_exchanges[i][j];
            if( dst < 0 ){
                continue;
            }
            comm_dst = group_list[dst];
            send_offset = exchange_node->payload_info[i][j].s_offset * pack_len;
            send_len = exchange_node->payload_info[i][j].s_len * pack_len;
            /* debug print */
            /* fprintf(stderr,"sending %d bytes to rank %d at offset %d\n",send_len, */
            /*         comm_dst,send_offset); */
            rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer +
                            send_offset),
                        send_len,
                        MPI_BYTE,
                        comm_dst, tag,
                        MCA_PML_BASE_SEND_STANDARD, comm,
                        &(requests[*active_requests])));

            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10,("Failed to isend data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);

            /* sends are posted */
        }

        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {

            /* recv phase */
            src = exchange_node->rank_exchanges[i][j];
            if( src < 0 ) {
                continue;
            }
            comm_src = group_list[src];
            recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
            recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
            /* debug print */
            /* fprintf(stderr,"recving %d bytes from rank %d at offset %d\n",recv_len, */
            /*         comm_src,recv_offset); */
            /* post the receive */
            rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer +
                            recv_offset),
                        recv_len,
                        MPI_BYTE,
                        comm_src,
                        tag, comm, &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to post ireceive"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }
        /* finished all send/recv's; now poll for completion before
         * continuing to the next iteration
         */
        completed = 0;
        /* polling internally on 2*(k - 1) requests */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);

        if(!completed){
            /* save state and hop out;
             * only the iteration needs to be tracked
             */
            *iteration = i; /* need to pick up here */

            return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
        }
    }

    /* finish off the last piece: send the data back to the extra rank */
    if( 0 < exchange_node->n_extra_sources ) {
        dst = exchange_node->rank_extra_sources_array[0];
        comm_dst = group_list[dst];
        knt = 0;
        for( i = 0; i < group_size; i++){
            knt += list_connected[i];
        }
        /* debug print */
        /*
        fprintf(stderr,"sending %d bytes to extra %d \n",pack_len*knt,comm_dst);
        */
        rc = MCA_PML_CALL(isend(data_buffer,
                    pack_len * knt,
                    MPI_BYTE,
                    comm_dst, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[*active_requests])));

        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10,("Failed to isend data"));
            return OMPI_ERROR;
        }
        ++(*active_requests);

        /* probe for send completion */
        completed = 0;
        /* polling internally */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if(!completed){
            /* save state and hop out.
             * We really do need to block here, so set
             * the iteration to pow_k + 1, indicating that we need to
             * finish progressing the last part
             */
            *iteration = pow_k + 1;

            return (OMPI_SUCCESS != rc ? OMPI_ERROR : BCOL_FN_STARTED);
        }
    }

FINISHED:
    /* recycle the buffer if need be */
    return BCOL_FN_COMPLETE;
}
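
/*
 * Added note on the resume protocol shared by the init function above and
 * the progress function below: whenever a completion poll comes back
 * empty, the routine stores its position in the per-buffer iteration field
 * and returns BCOL_FN_STARTED.  The saved value encodes where to re-enter:
 *
 *   -1         : proxy still waiting for the extra rank's contribution
 *   0 .. pow_k : next recursive k-ing step to execute
 *   pow_k + 1  : final send back to the extra rank still in flight
 */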
|
||||
|
||||
/* allgather progress function */
|
||||
|
||||
int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args,
|
||||
struct mca_bcol_base_function_t *const_args)
|
||||
{
|
||||
|
||||
|
||||
/* local variables */
|
||||
|
||||
mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
|
||||
int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
|
||||
netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree;
|
||||
int group_size = ptpcoll_module->group_size;
|
||||
int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */
|
||||
|
||||
|
||||
int tag;
|
||||
int i, j;
|
||||
int knt;
|
||||
int comm_src, comm_dst, src, dst;
|
||||
int recv_offset, recv_len;
|
||||
int send_offset, send_len;
|
||||
uint32_t buffer_index = input_args->buffer_index;
|
||||
|
||||
int pow_k, tree_order;
|
||||
int rc = OMPI_SUCCESS;
|
||||
ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
|
||||
ompi_request_t **requests =
|
||||
ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
|
||||
int *active_requests =
|
||||
&(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
|
||||
int completed = 0; /* initialized */
|
||||
void *data_buffer = (void*)(
|
||||
(unsigned char *) input_args->sbuf +
|
||||
(size_t) input_args->sbuf_offset);
|
||||
int pack_len = input_args->count * input_args->dtype->super.size;
|
||||
/* initialize the counter */
|
||||
int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
|
||||
|
||||
|
||||
#if 0
|
||||
fprintf(stderr,"%d: entering p2p allgather progress AR: %d iter: %d\n",my_group_index,*active_requests,
|
||||
*iteration);
|
||||
#endif
|
||||
/* keep tag within the limit supported by the pml */
|
||||
tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
|
||||
/* mark this as a collective tag, to avoid conflict with user-level flags */
|
||||
tag = -tag;
|
||||
|
||||
/* k-nomial tree parameters */
|
||||
tree_order = exchange_node->tree_order;
|
||||
pow_k = exchange_node->log_tree_order;
|
||||
|
||||
/* let's begin the collective, starting with extra ranks and their
|
||||
* respective proxies
|
||||
*/
|
||||
if( EXTRA_NODE == exchange_node->node_type ) {
|
||||
|
||||
/* debug print */
|
||||
/*fprintf(stderr,"666 \n");*/
|
||||
/* simply poll for completion */
|
||||
completed = 0;
|
||||
/* polling internally */
|
||||
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
|
||||
if(completed){
|
||||
/* go to buffer release */
|
||||
goto FINISHED;
|
||||
}else{
|
||||
/* save state and hop out
|
||||
* nothing to save here
|
||||
*/
|
||||
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
|
||||
}
|
||||
}else if ( 0 < exchange_node->n_extra_sources && (-1 == *iteration)) {
|
||||
|
||||
/* I am a proxy for someone */
|
||||
/* Simply poll for completion */
|
||||
completed = 0;
|
||||
/* polling internally */
|
||||
assert( 1 == *active_requests);
|
||||
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
|
||||
if(!completed){
|
||||
/* save state and hop out
|
||||
* We really do need to block here so set
|
||||
* the iteration to -1 indicating we need to
|
||||
* finish this part first
|
||||
*/
|
||||
(*iteration) = -1;
|
||||
return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
|
||||
}
|
||||
/* I may now proceed to the recursive k - ing phase */
|
||||
*iteration = 0;
|
||||
}
|
||||
|
||||
|
||||
/* the ordering here between the extra rank and progress active requests
|
||||
* is critical
|
||||
*/
|
||||
/* extra rank */
|
||||
if( (pow_k + 1) == *iteration ){
|
||||
/* finish off the last one */
|
||||
goto PROGRESS_EXTRA;
|
||||
}
|
||||
|
||||
/* active requests must be completed before continuing on to
|
||||
* recursive k -ing step
|
||||
* CAREFUL HERE, IT THIS REALLY WHAT YOU WANT??
|
||||
*/
|
||||
if( 0 < (*active_requests) ) {
|
||||
/* then we have something to progress from last step */
|
||||
/* debug print */
|
||||
/*
|
||||
fprintf(stderr,"%d: entering progress AR: %d iter: %d\n",my_group_index,*active_requests,
|
||||
*iteration);
|
||||
*/
|
||||
completed = 0;
|
||||
completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
|
||||
if(!completed){
|
||||
/* save state and hop out
|
||||
* state hasn't changed
|
||||
*/
|
||||
|
||||
return ((MPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
|
||||
}
|
||||
++(*iteration);
|
||||
}
|
||||
|
||||
|
||||
|
||||
    /* we start the recursive k-ing phase */
    for (i = *iteration; i < pow_k; i++) {
        /* nothing changes here */
        for (j = 0; j < (tree_order - 1); j++) {

            /* send phase */
            dst = exchange_node->rank_exchanges[i][j];
            if (dst < 0) {
                continue;
            }
            comm_dst = group_list[dst];
            send_offset = exchange_node->payload_info[i][j].s_offset * pack_len;
            send_len = exchange_node->payload_info[i][j].s_len * pack_len;
            rc = MCA_PML_CALL(isend((void *) ((unsigned char *) data_buffer + send_offset),
                                    send_len, MPI_BYTE,
                                    comm_dst, tag,
                                    MCA_PML_BASE_SEND_STANDARD, comm,
                                    &(requests[*active_requests])));

            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Failed to isend data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);

            /* sends are posted */
        }

        /* Now post the recv's */
        for (j = 0; j < (tree_order - 1); j++) {

            /* recv phase */
            src = exchange_node->rank_exchanges[i][j];
            if (src < 0) {
                continue;
            }
            comm_src = group_list[src];
            recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len;
            recv_len = exchange_node->payload_info[i][j].r_len * pack_len;
            /* post the receive */
            rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer + recv_offset),
                                    recv_len, MPI_BYTE,
                                    comm_src,
                                    tag, comm, &(requests[*active_requests])));
            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Failed to post irecv"));
                return OMPI_ERROR;
            }

            ++(*active_requests);
        }
        /* finished all send/recv's, now poll for completion before
         * continuing to the next iteration
         */
        completed = 0;
        /* make this non-blocking */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (!completed) {
            /* save state and hop out:
             * we really do need to block here, so record the
             * iteration that must be finished first
             */
            *iteration = i; /* need to pick up here */

            return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
        }
    }

    /* finish off the last piece, send the data back to the extra */
    if (0 < exchange_node->n_extra_sources) {
        dst = exchange_node->rank_extra_sources_array[0];
        comm_dst = group_list[dst];
        knt = 0;
        for (i = 0; i < group_size; i++) {
            knt += list_connected[i];
        }
        rc = MCA_PML_CALL(isend(data_buffer,
                                pack_len * knt, MPI_BYTE,
                                comm_dst, tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[*active_requests])));

        if (OMPI_SUCCESS != rc) {
            PTPCOLL_VERBOSE(10, ("Failed to isend data"));
            return OMPI_ERROR;
        }
        ++(*active_requests);

        /* probe for send completion */
        completed = 0;
        /* make this non-blocking */
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (!completed) {
            /* save state and hop out:
             * we really do need to block here, so set
             * the iteration to pow_k + 1, indicating we need to
             * finish progressing the last part
             */
            *iteration = pow_k + 1;

            return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
        }
    }
    /* everyone needs to skip this unless they really are the proxy
     * reentering with the intent of progressing the final send
     */
    goto FINISHED;

PROGRESS_EXTRA:

    /* probe for send completion */
    completed = 0;
    /* make this non-blocking */
    completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
    if (!completed) {
        /* save state and hop out; we still need to
         * finish progressing this last part
         */

        return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED);
    }

FINISHED:
    /* recycle the buffer if need be */
    return BCOL_FN_COMPLETE;
}
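
The loop above consumes a precomputed schedule (exchange_node->rank_exchanges): at each of the pow_k levels a rank exchanges with tree_order - 1 partners. A hypothetical, self-contained sketch of such a schedule (not the netpatterns code, which also handles non-power-of-k sizes): with radix k, step s pairs each rank with the k - 1 ranks that differ from it only in the s-th base-k digit.

#include <stdio.h>

int main(void)
{
    int k = 3;          /* tree order (radix) */
    int group_size = 9; /* assumed to be an exact power of k */
    int pow_k = 2;      /* log_k(group_size) */

    for (int rank = 0; rank < group_size; ++rank) {
        printf("rank %d:", rank);
        for (int step = 0, radix = 1; step < pow_k; ++step, radix *= k) {
            int digit = (rank / radix) % k; /* this rank's base-k digit at this step */
            for (int j = 1; j < k; ++j) {
                /* the k-1 ranks that differ only in that digit */
                printf(" %d", rank + ((digit + j) % k - digit) * radix);
            }
        }
        printf("\n");
    }
    return 0;
}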

/*
 * Register allgather functions to the BCOL function table,
 * so they can be selected
 */
int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = BCOL_ALLGATHER;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_ptpcoll_k_nomial_allgather_init,
                                 bcol_ptpcoll_k_nomial_allgather_progress);

    comm_attribs.data_src = DATA_SRC_KNOWN;
    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760; /* range 4 */

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_ptpcoll_k_nomial_allgather_init,
                                 bcol_ptpcoll_k_nomial_allgather_progress);

    return OMPI_SUCCESS;
}
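
The two registrations above differ only in the message-size range they claim. A hedged sketch of the idea (illustrative names, not the OMPI API): the base layer keeps per-collective tables of [msg_min, msg_max] ranges and dispatches to the function pair whose range brackets the message length.

#include <stdio.h>

struct fn_range { int msg_min, msg_max; const char *fn_name; };

int main(void)
{
    struct fn_range table[] = {
        {        0,    20000, "k_nomial_allgather" }, /* range 1 */
        { 10000000, 10485760, "k_nomial_allgather" }, /* range 4 */
    };
    int msg_len = 4096; /* example message size in bytes */

    for (size_t i = 0; i < sizeof table / sizeof *table; ++i) {
        if (msg_len >= table[i].msg_min && msg_len <= table[i].msg_max) {
            printf("picked %s for %d bytes\n", table[i].fn_name, msg_len);
            break;
        }
    }
    return 0;
}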

The diff for this file is not shown because of its size.
@ -1,95 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_ALLREDUCE_H
#define MCA_BCOL_PTPCOLL_ALLREDUCE_H

#include "ompi_config.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

enum {
    BLOCK_OFFSET = 0,
    LOCAL_REDUCE_SEG_OFFSET,
    BLOCK_COUNT,
    SEG_SIZE,
    NOFFSETS
};

BEGIN_C_DECLS

int bcol_ptpcoll_allreduce_narraying(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                     const int buffer_index, void *data_buffer,
                                     struct ompi_op_t *op,
                                     const int count, struct ompi_datatype_t *dtype,
                                     const int buffer_size, const int relative_group_index);

int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args,
                                          struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                     const int buffer_index, void *sbuf,
                                                     void *rbuf,
                                                     struct ompi_op_t *op,
                                                     const int count, struct ompi_datatype_t *dtype,
                                                     const int relative_group_index,
                                                     const int padded_start_byte);

int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                             const int buffer_index,
                                             void *sbuf, void *rbuf, int count,
                                             struct ompi_datatype_t *dtype,
                                             const int relative_group_index,
                                             const int padded_start_byte);

int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args,
                                                                    struct mca_bcol_base_function_t *const_args);

int compute_knomial_allgather_offsets(int group_index, int count,
                                      struct ompi_datatype_t *dtype, int k_radix,
                                      int n_exchanges, int **offsets);

int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                           int buffer_index,
                                                           void *sbuf,
                                                           void *rbuf,
                                                           struct ompi_op_t *op,
                                                           const int count, struct ompi_datatype_t *dtype);

int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                                                   int buffer_index,
                                                   void *sbuf,
                                                   void *rbuf,
                                                   const int count, struct ompi_datatype_t *dtype);

int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args,
                                                                          struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super);

#if 0
int knomial_reduce_scatter_offsets(int group_index, int count, struct ompi_datatype_t *dtype, int k_radix,
                                   int n_exchanges, int nth_exchange, size_t *recv_offset,
                                   size_t *block_offset, size_t *block_count, size_t *block_size,
                                   size_t *seg_size);

int allgather_offsets(int group_index, int count, struct ompi_datatype_t *dtype, int k_radix,
                      int n_exchanges, int nth_exchange, size_t *send_offset,
                      size_t *block_offset, size_t *block_count, size_t *block_size,
                      size_t *seg_size);
#endif

END_C_DECLS

#endif /* MCA_BCOL_PTPCOLL_ALLREDUCE_H */
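
A hedged sketch of the block layout behind the reduce-scatter/allgather pair declared above: with radix k and n exchanges, the buffer is cut into k^n blocks, the reduce-scatter leaves each rank owning one reduced block, and the k-nomial allgather recombines them. This is illustrative only; the real offsets come from compute_knomial_allgather_offsets and also handle padding and non-divisible counts.

#include <stdio.h>

int main(void)
{
    int k_radix = 2, n_exchanges = 3, count = 24; /* example values */
    int n_blocks = 1;

    for (int i = 0; i < n_exchanges; ++i) {
        n_blocks *= k_radix;
    }
    /* assume count divides evenly for this sketch */
    int block = count / n_blocks;
    for (int rank = 0; rank < n_blocks; ++rank) {
        printf("rank %d owns elements [%d, %d)\n",
               rank, rank * block, (rank + 1) * block);
    }
    return 0;
}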
@ -1,933 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/include/ompi/constants.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

/*
 * Fanin routines - no user data
 */

/********************************************* New Barrier *********************************************/
/*******************************************************************************************************/
/*******************************************************************************************************/

/*************************************** K-nomial ***************************************/
/*****************************************************************************************/
static int bcol_ptpcoll_barrier_recurs_knomial_new(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    uint64_t sequence_number;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
        &ptpcoll_module->knomial_exchange_tree;

    int rc, k, pair_comm_rank, exchange, completed,
        tree_order = my_exchange_node->tree_order, tag,
        n_extra_sources = my_exchange_node->n_extra_sources,
        n_exchange = my_exchange_node->n_exchanges, num_reqs;

    ompi_communicator_t *comm =
        ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array = NULL,
        **rank_exchanges = my_exchange_node->rank_exchanges;

    ompi_request_t **requests;
    opal_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    item = opal_free_list_wait (&ptpcoll_module->collreqs_free);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG calculation */
    sequence_number = input_args->sequence_num;

    /* Keep the tag within the limit supported by the PML */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);

    /* Mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    if (0 < n_extra_sources) { /* EXCHANGE_NODE case */
        collreq->need_toserv_extra = 1;
        extra_sources_array = my_exchange_node->rank_extra_sources_array;

        /* I will participate in the exchange (of the algorithm) -
         * wait for the signal from the extra process */
        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    } else {
        collreq->need_toserv_extra = 0;
    }

    /* loop over exchange send/recv pairs */
    for (exchange = 0; exchange < n_exchange; ++exchange) {
        for (k = 0; k < tree_order - 1; ++k) {
            /* rank of the exchange partner within the group */
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];

            assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));

            /* send to partner - we will wait for completion, as send
             * completion is at the MPI level, and will not
             * incur network level completion costs
             */
            rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[k * 2 + 1])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
                                 pair_comm_rank, rank_exchanges[exchange][k]));

            /* receive from partner */
            rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                comm, &(requests[k * 2])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
                                 pair_comm_rank, rank_exchanges[exchange][k]));
        }

        num_reqs = 2 * (tree_order - 1);

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;

            return BCOL_FN_STARTED;
        }
    }

    /* If the group size is not a power of k, we may need to send a message
     * to the "extra" proc */
    if (0 < n_extra_sources) { /* EXCHANGE_NODE case */
        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;

            collreq->exchange = n_exchange;
            collreq->need_toserv_extra = 0;

            return BCOL_FN_STARTED;
        }
    }

    opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
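
The tag arithmetic above repays a closer look. A minimal sketch with hypothetical constants: the collective's sequence number is folded into the tag space the PML supports and then negated, since negative tags are reserved for internal collectives and cannot collide with user-level tags.

#include <stdio.h>
#include <stdint.h>

#define TAG_OFFSET 100 /* hypothetical stand-in for PTPCOLL_TAG_OFFSET */
#define TAG_FACTOR 2   /* hypothetical stand-in for PTPCOLL_TAG_FACTOR */

static int make_coll_tag(uint64_t sequence_number, int tag_mask)
{
    int tag = (int) ((TAG_OFFSET + sequence_number * TAG_FACTOR) & (uint64_t) tag_mask);
    return -tag; /* negative: marked as a collective tag */
}

int main(void)
{
    int tag_mask = 0xffff; /* hypothetical PML tag-space mask */

    for (uint64_t seq = 0; seq < 4; ++seq) {
        printf("seq %llu -> tag %d\n", (unsigned long long) seq,
               make_coll_tag(seq, tag_mask));
    }
    return 0;
}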

static int bcol_ptpcoll_barrier_recurs_knomial_new_progress(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
        &ptpcoll_module->knomial_exchange_tree;

    int rc, k, tag, pair_comm_rank, exchange,
        tree_order = my_exchange_node->tree_order, num_reqs,
        n_exchange = my_exchange_node->n_exchanges, completed,
        n_extra_sources = my_exchange_node->n_extra_sources;

    ompi_communicator_t *comm =
        ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array,
        **rank_exchanges = my_exchange_node->rank_exchanges;

    mca_bcol_ptpcoll_collreq_t *collreq =
        (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    ompi_request_t **requests = collreq->requests;

    num_reqs = collreq->num_reqs;

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    /* Continue the loop over exchange send/recv pairs */
    tag = collreq->tag;

    for (exchange = collreq->exchange; exchange < n_exchange; ++exchange) {
        for (k = 0; k < tree_order - 1; ++k) {
            /* rank of the exchange partner within the group */
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];

            assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));

            /* send to partner - we will wait for completion, as send
             * completion is at the MPI level, and will not
             * incur network level completion costs
             */
            rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[k * 2 + 1])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
                                 pair_comm_rank, rank_exchanges[exchange][k]));

            /* receive from partner */
            rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                comm, &(requests[k * 2])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
                                 pair_comm_rank, rank_exchanges[exchange][k]));
        }

        num_reqs = 2 * (tree_order - 1);

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;

            return BCOL_FN_STARTED;
        }
    }

    /* If the group size is not a power of k, we may need to send a message
     * to the "extra" proc */
    if (collreq->need_toserv_extra) { /* EXCHANGE_NODE case */
        extra_sources_array = my_exchange_node->rank_extra_sources_array;

        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = num_reqs;
            collreq->exchange = n_exchange;
            collreq->need_toserv_extra = 0;

            return BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

/****************************************** Extra node Barrier ******************************************/

static int bcol_ptpcoll_barrier_recurs_knomial_extra_new(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    uint64_t sequence_number;
    int rc, tag, pair_comm_rank,
        completed, num_reqs = 2;

    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
        &ptpcoll_module->knomial_exchange_tree;

    ompi_communicator_t *comm =
        ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array = my_exchange_node->rank_extra_sources_array;

    ompi_request_t **requests;
    opal_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    item = opal_free_list_wait (&ptpcoll_module->collreqs_free);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG calculation */
    sequence_number = input_args->sequence_num;

    /* Keep the tag within the limit supported by the PML */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);

    /* Mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    pair_comm_rank =
        ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[0]];

    rc = MCA_PML_CALL(isend(
        NULL, 0, MPI_INT,
        pair_comm_rank, tag,
        MCA_PML_BASE_SEND_STANDARD,
        comm, &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("ISend failed."));
        return rc;
    }

    rc = MCA_PML_CALL(irecv(
        NULL, 0, MPI_INT,
        pair_comm_rank, tag,
        comm, &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    opal_free_list_return (&ptpcoll_module->collreqs_free, (opal_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
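
For readers who do not want to decode the MCA_PML_CALL macros, the extra-rank side of this barrier reduces to a pair of zero-byte messages. A minimal sketch in plain MPI (a blocking wait stands in for the test-and-return-BCOL_FN_STARTED logic used above):

#include <mpi.h>

/* An extra rank signals its proxy that it has arrived, then waits for
 * the release message sent once the core group's barrier is done. */
static void extra_rank_barrier(int proxy_rank, int tag, MPI_Comm comm)
{
    MPI_Request reqs[2];

    MPI_Isend(NULL, 0, MPI_INT, proxy_rank, tag, comm, &reqs[0]);
    MPI_Irecv(NULL, 0, MPI_INT, proxy_rank, tag, comm, &reqs[1]);
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}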

/*************************************** Recursive-Doubling ***************************************/
/**************************************************************************************************/

static int bcol_ptpcoll_barrier_recurs_dbl_new(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    uint64_t sequence_number;
    mca_bcol_ptpcoll_module_t *ptp_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, my_extra_partner_comm_rank = 0, exchange, completed,
        pair_comm_rank, pair_rank, delta, tag, num_reqs = 0,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    opal_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    item = opal_free_list_wait (&ptp_module->collreqs_free);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    requests = collreq->requests;

    /* TAG calculation */
    sequence_number = input_args->sequence_num;

    /* keep the tag within the limit supported by the PML */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* I will participate in the exchange - wait for the signal from the
         * extra process */
        /*
         * recv from the extra rank - my_extra_partner_comm_rank;
         * a blocking recv could be used, as no other communications
         * need to take place.
         */
        my_extra_partner_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

        collreq->need_toserv_extra = 1;
        collreq->extra_partner_rank = my_extra_partner_comm_rank;

        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                                my_extra_partner_comm_rank, tag, comm,
                                &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for irecv failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    } else {
        collreq->need_toserv_extra = 0;
    }

    /* Loop over exchange send/recv pairs */
    delta = 1;
    for (exchange = 0; exchange < n_exchange; ++exchange) {

        /* rank of the exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         * completion is at the MPI level, and will not
         * incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                                pair_comm_rank, tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                                pair_comm_rank, tag, comm,
                                &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                            exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;

            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                                my_extra_partner_comm_rank, tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;

            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
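
The pairing rule that drives the loop above is pure bit arithmetic: at exchange e, each rank talks to rank ^ (1 << e). A self-contained demonstration for a power-of-two group:

#include <stdio.h>

int main(void)
{
    int group_size = 8; /* must be a power of two */
    int n_exchange = 3; /* log2(group_size) */

    for (int rank = 0; rank < group_size; ++rank) {
        printf("rank %d partners:", rank);
        for (int e = 0, delta = 1; e < n_exchange; ++e, delta <<= 1) {
            printf(" %d", rank ^ delta); /* partner at exchange e */
        }
        printf("\n");
    }
    return 0;
}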

static int bcol_ptpcoll_barrier_recurs_dbl_new_progress(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    mca_bcol_ptpcoll_module_t *ptp_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, exchange, pair_comm_rank, tag,
        pair_rank, delta, num_reqs, completed,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    mca_bcol_ptpcoll_collreq_t *collreq =
        (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    num_reqs = collreq->num_reqs;
    requests = collreq->requests;

    /* test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    /* Continue the loop over exchange send/recv pairs */
    num_reqs = 0;
    tag = collreq->tag;

    exchange = collreq->exchange;
    assert(exchange >= 0);

    delta = 1 << exchange;
    for (; exchange < n_exchange; ++exchange) {

        /* rank of the exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         * completion is at the MPI level, and will not
         * incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                                pair_comm_rank, tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                                pair_comm_rank, tag, comm,
                                &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                            exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    /* if the group size is not a power of 2, we may need to send a message
     * to the "extra" proc */
    if (collreq->need_toserv_extra) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                                collreq->extra_partner_rank, tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = 1;
            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

/****************************************** Extra node Barrier ******************************************/

static int bcol_ptpcoll_barrier_recurs_dbl_extra_new(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    uint64_t sequence_number;
    int rc, completed, num_reqs = 2,
        tag, my_extra_partner_comm_rank;

    ompi_request_t **requests;
    opal_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    mca_bcol_ptpcoll_module_t *ptp_module =
        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    item = opal_free_list_wait (&ptp_module->collreqs_free);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG calculation */
    sequence_number = input_args->sequence_num;

    /* Keep the tag within the limit supported by the PML */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    /* I will not participate in the exchange - just "register" here and
     * signal my proxy that I have arrived */

    my_extra_partner_comm_rank =
        ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

    rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                            my_extra_partner_comm_rank, tag,
                            MCA_PML_BASE_SEND_STANDARD, comm,
                            &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Send failed."));
        return rc;
    }

    /* Recv the signal that the rest are done - from my_extra_partner_comm_rank */
    rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                            my_extra_partner_comm_rank, tag, comm,
                            &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    opal_free_list_return (&ptp_module->collreqs_free, (opal_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}

/* We have the same progress function for both cases (R-D and K-nomial) */
static int bcol_ptpcoll_barrier_extra_node_progress(
    bcol_function_args_t *input_args,
    struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    ompi_request_t **requests;
    int rc, completed, num_reqs = 2;

    mca_bcol_ptpcoll_collreq_t *collreq =
        (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    requests = collreq->requests;

    /* test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}

static int mca_bcol_ptpcoll_barrier_setup(mca_bcol_base_module_t *super, int bcoll_type)
{
    netpatterns_k_exchange_node_t *my_exchange_node;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *) super;

    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    comm_attribs.bcoll_type = bcoll_type;

    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    switch (mca_bcol_ptpcoll_component.barrier_alg) {
    case 1:
        if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) {
            mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                         bcol_ptpcoll_barrier_recurs_dbl_extra_new,
                                         bcol_ptpcoll_barrier_extra_node_progress);
            break;
        }

        mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                     bcol_ptpcoll_barrier_recurs_dbl_new,
                                     bcol_ptpcoll_barrier_recurs_dbl_new_progress);
        break;
    case 2:
        my_exchange_node = &ptpcoll_module->knomial_exchange_tree;
        if (my_exchange_node->n_extra_sources > 0 &&
            EXTRA_NODE == my_exchange_node->node_type) {
            mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                         bcol_ptpcoll_barrier_recurs_knomial_extra_new,
                                         bcol_ptpcoll_barrier_extra_node_progress);
            break;
        }

        mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                     bcol_ptpcoll_barrier_recurs_knomial_new,
                                     bcol_ptpcoll_barrier_recurs_knomial_new_progress);
        break;
    default:
        PTPCOLL_ERROR(("Wrong barrier_alg flag value."));
    }

    return OMPI_SUCCESS;
}

int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super)
{
    return mca_bcol_ptpcoll_barrier_setup(super, BCOL_SYNC);
}

int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super)
{
    return mca_bcol_ptpcoll_barrier_setup(super, BCOL_BARRIER);
}

The diff for this file is not shown because of its size.
@ -1,868 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_BCAST_H
#define MCA_BCOL_PTPCOLL_BCAST_H

#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

BEGIN_C_DECLS

int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super);

int bcol_ptpcoll_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
                                        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args,
                                                 struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args,
                                           struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args,
                                                    struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args,
                                                        struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args,
                                                                 struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args,
                                                              struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args,
                                                                       struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args,
                                                           struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args,
                                                                    struct mca_bcol_base_function_t *const_args);

int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args,
                                                                 struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args,
                                                                          struct mca_bcol_base_function_t *const_args);

/* macros */
#define K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(                        \
        radix_mask_pow,                                                 \
        my_group_index, group_size, group_list,                         \
        data_buffer, segment_size, count, tag,                          \
        comm, send_requests, num_pending_sends)                         \
do {                                                                    \
    int rc = OMPI_SUCCESS;                                              \
    int dst;                                                            \
    int comm_dst;                                                       \
    int send_size;                                                      \
    int send_offset;                                                    \
    int delta;                                                          \
    int dst_boundary_rank;                                              \
    int radix_mask = radix_mask_pow >= 0 ? 1 << radix_mask_pow : 0;     \
                                                                        \
    while (radix_mask_pow >= 0) {                                       \
        /* For each level of the tree, do sends */                      \
        dst = my_group_index ^ radix_mask;                              \
        comm_dst = group_list[dst];                                     \
                                                                        \
        dst_boundary_rank = dst & ((~(int)0) << (radix_mask_pow));      \
                                                                        \
        send_offset = segment_size * dst_boundary_rank;                 \
        /* Pasha: make sure that we handle the corner cases */          \
        delta = count - send_offset;                                    \
        if (delta <= 0) {                                               \
            send_size = 0; /* we have to send something, otherwise the peer will hang */ \
        } else {                                                        \
            /* the tail case */                                         \
            send_size = (int)                                           \
                (delta - (int)segment_size * radix_mask) < 0 ? delta :  \
                (int)segment_size * radix_mask;                         \
        }                                                               \
                                                                        \
        /* Non-blocking send .... */                                    \
        PTPCOLL_VERBOSE(9,                                              \
            ("Bcast p2s, Isend to %d[%d],count %d,tag %d,addr %p [%p] send_size %d,send_offset %d, radix %d %d", \
             dst, comm_dst, count, tag,                                 \
             data_buffer, (void *)((unsigned char *)data_buffer + (size_t)send_offset), \
             send_size,                                                 \
             send_offset,                                               \
             radix_mask,                                                \
             radix_mask_pow                                             \
            ));                                                         \
        rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + (size_t)send_offset), \
                                send_size, MPI_BYTE,                    \
                                comm_dst, tag,                          \
                                MCA_PML_BASE_SEND_STANDARD, comm,       \
                                &(send_requests[*num_pending_sends]))); \
        PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends])); \
        if (OMPI_SUCCESS != rc) {                                       \
            PTPCOLL_VERBOSE(10, ("Failed to isend data"));              \
            return OMPI_ERROR;                                          \
        }                                                               \
        ++(*num_pending_sends);                                         \
        radix_mask >>= 1;                                               \
        radix_mask_pow--;                                               \
    }                                                                   \
} while(0)
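
The send-size arithmetic inside that macro is easy to miss: each level normally sends segment_size * radix_mask bytes, but a block that runs past the end of the user buffer is clipped, possibly to zero (an empty message must still be sent or the peer hangs waiting on its receive). A self-contained version of that computation:

#include <stdio.h>

static int scatter_send_size(int count, int send_offset,
                             int segment_size, int radix_mask)
{
    int delta = count - send_offset;
    if (delta <= 0) {
        return 0;                        /* nothing left: send an empty message */
    }
    int full = segment_size * radix_mask;
    return delta < full ? delta : full;  /* clip the tail block */
}

int main(void)
{
    /* hypothetical numbers: 10-byte segments, 2-rank subtree */
    printf("%d\n", scatter_send_size(35, 20, 10, 2)); /* tail: clipped to 15 */
    printf("%d\n", scatter_send_size(35, 40, 10, 2)); /* past the end: 0 */
    return 0;
}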

#define NARRAY_SCATTER_NB(narray_node, process_shift, group_size,       \
        data_buffer, base_block_size, count, tag, comm, send_requests,  \
        num_pending_sends)                                              \
do {                                                                    \
    int n, rc = OMPI_SUCCESS;                                           \
    int dst;                                                            \
    int comm_dst;                                                       \
    int offset;                                                         \
    int size_count = count;                                             \
                                                                        \
    /* Send out data to all relevant children */                        \
    for (n = 0; n < narray_node->n_children && size_count > 0; n++) {   \
                                                                        \
        dst = narray_node->children_ranks[n] + process_shift;           \
        if (dst >= group_size) {                                        \
            dst -= group_size;                                          \
        }                                                               \
                                                                        \
        comm_dst = group_list[dst];                                     \
        offset = n * base_block_size;                                   \
        size_count -= base_block_size;                                  \
        if (OPAL_UNLIKELY(size_count < 0)) {                            \
            count = base_block_size + size_count;                       \
        } else {                                                        \
            count = base_block_size;                                    \
        }                                                               \
                                                                        \
        /* Non-blocking send .... */                                    \
        PTPCOLL_VERBOSE(9, ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \
                            dst, comm_dst, count, tag,                  \
                            data_buffer));                              \
        rc = MCA_PML_CALL(isend((void *)((char *)data_buffer + (size_t)offset), count, MPI_BYTE, \
                                comm_dst, tag,                          \
                                MCA_PML_BASE_SEND_STANDARD, comm,       \
                                &(send_requests[*num_pending_sends]))); \
        if (OMPI_SUCCESS != rc) {                                       \
            PTPCOLL_VERBOSE(10, ("Failed to isend data"));              \
            return OMPI_ERROR;                                          \
        }                                                               \
        ++(*num_pending_sends);                                         \
    }                                                                   \
} while(0)
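
The block layout produced by NARRAY_SCATTER_NB is straightforward: child n gets base_block_size bytes at offset n * base_block_size, and the last child is clipped to whatever remains of count. A minimal demonstration of that partitioning (example numbers only):

#include <stdio.h>

int main(void)
{
    int count = 25, base_block_size = 10, n_children = 3;
    int remaining = count;

    for (int n = 0; n < n_children && remaining > 0; ++n) {
        int offset = n * base_block_size;
        remaining -= base_block_size;
        /* clip the final block, mirroring the size_count logic above */
        int send = remaining < 0 ? base_block_size + remaining : base_block_size;
        printf("child %d: offset %d, len %d\n", n, offset, send);
    }
    return 0;
}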

#define NARRAY_SCATTER_B(narray_node, process_shift, group_size,        \
        data_buffer, base_block_size, count, tag, comm, send_requests,  \
        num_pending_sends, completed)                                   \
do {                                                                    \
    NARRAY_SCATTER_NB(narray_node, process_shift, group_size,           \
        data_buffer, base_block_size, count, tag, comm, send_requests,  \
        num_pending_sends);                                             \
    if (*num_pending_sends > 0) {                                       \
        completed = mca_bcol_ptpcoll_test_all_for_match(num_pending_sends, send_requests, &rc); \
        if (OMPI_SUCCESS != rc) {                                       \
            return OMPI_ERROR;                                          \
        }                                                               \
    } else {                                                            \
        completed = 1;                                                  \
    }                                                                   \
} while (0)

#define CHECK_IF_ROOT_OR_VROOT(module, i)                               \
    (module->pow_2 == module->ml_mem.ml_buf_desc[i].radix_mask_pow)

/* inline functions */
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(
    mca_bcol_ptpcoll_module_t *ptpcoll_module,
    void *data_buffer, int count, int tag,
    int extra_peer, ompi_communicator_t *comm,
    int *active_requests, ompi_request_t **requests)
{
    int rc = OMPI_SUCCESS;
    int completed = 0;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    /* the tag is -1 already */
    /* send all of the data to the extra peer */
    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra to %d tag %d",
                         extra_peer, tag));
    rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                            group_list[extra_peer], tag,
                            MCA_PML_BASE_SEND_STANDARD, comm,
                            &(requests[*active_requests])));
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to send data"));
        return OMPI_ERROR;
    }

    ++(*active_requests);

    completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
    if (0 == completed) {
        PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
        /* we have to store the iteration number somewhere */
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}

static inline __opal_attribute_always_inline__
int bcol_ptpcoll_send_n_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module,
                              void *data_buffer, int count, int tag,
                              int *extra_peers, int num_peers, int skip,
                              ompi_communicator_t *comm,
                              int *active_requests, ompi_request_t **requests)
{
    int rc = OMPI_SUCCESS;
    int completed = 0;
    int i;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    /* send all of the data to the extra peers */
    for (i = 0; i < num_peers; i++) {
        PTPCOLL_VERBOSE(10, ("send_n_extra to %d tag %d",
                             extra_peers[i], tag));
        if (extra_peers[i] == skip) {
            PTPCOLL_VERBOSE(10, ("SKIP"));
            continue;
        }

        rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                                group_list[extra_peers[i]], tag,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(requests[*active_requests])));
        if (OMPI_SUCCESS != rc) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            return OMPI_ERROR;
        }

        ++(*active_requests);
    }

    completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
    if (0 == completed) {
        PTPCOLL_VERBOSE(10, ("PR Extra send was not completed"));
        /* we have to store the iteration number somewhere */
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}

static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_gather_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
    int buffer_index, void *data_buffer, int count, int base_block_size)
{
    int rc;
    int completed = 0; /* not completed */
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int i;
    int *iteration =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
    ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    void *curr_data_sbuffer = NULL,
         *curr_data_rbuffer = NULL;
    int radix_mask_pow = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow;
    int delta;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_gather_anyroot %d %d %d",
                         ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
                         ptpcoll_module->pow_2,
                         1 << ptpcoll_module->pow_2));

    /* we assume iteration #iteration was already completed with a probe */
    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
         i < ptpcoll_module->pow_2; i++) {
        int pow2 = 1 << i;
        int peer_index = my_group_index ^ pow2;
        int comm_rank = group_list[peer_index];
        int slen, rlen,
            send_offset,
            recv_offset;

        if (i > radix_mask_pow) {
            /* *active_requests = 0; */
            /* send to - and receive data from - the peer */
            slen = rlen = pow2 * base_block_size;
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
            curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
            curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);

            delta = count - recv_offset;
            if (delta > 0) {
                if (delta < rlen) {
                    /* recv the tail */
                    rlen = delta;
                }
                PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
                                     pow2,
                                     1 << ptpcoll_module->pow_2,
                                     curr_data_rbuffer,
                                     recv_offset,
                                     rlen,
                                     comm_rank));
                rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
                                        comm_rank, tag, comm, &requests[*active_requests]));
                if (OMPI_SUCCESS != rc) {
                    PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                    return OMPI_ERROR;
                }
                ++(*active_requests);
            }

            delta = count - send_offset;
            if (delta > 0) {
                if (delta < slen) {
                    /* send the tail */
                    slen = delta;
                }
                PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
                                     pow2,
                                     1 << ptpcoll_module->pow_2,
                                     curr_data_sbuffer,
                                     send_offset,
                                     slen,
                                     comm_rank));
                rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
                                        comm_rank, tag,
                                        MCA_PML_BASE_SEND_STANDARD, comm,
                                        &(requests[*active_requests])));
                if (OMPI_SUCCESS != rc) {
                    PTPCOLL_VERBOSE(10, ("Failed to send data"));
                    return OMPI_ERROR;
                }
                ++(*active_requests);
            }

            if (*active_requests > 0) {
                completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
                if (0 == completed) {
                    *iteration = i;
                    /* we have to store the iteration number somewhere */
                    return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
                }
            }
        } else if (i == radix_mask_pow) {
            /* only receive data */
            rlen = pow2 * base_block_size;
            recv_offset = base_block_size * ((peer_index) & ((~(int)0) << i));
            curr_data_rbuffer = (void *)((unsigned char *)data_buffer + recv_offset);
            delta = count - recv_offset;
            if (0 >= delta) {
                /* we have nothing to receive, skip the iteration */
                continue;
            }
            if (delta < rlen) {
                /* recv the tail */
                rlen = delta;
            }
            /* receive data from the peer */
            PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] recv data %p (offset %d) , len %d , dest %d",
                                 pow2,
                                 1 << ptpcoll_module->pow_2,
                                 curr_data_rbuffer,
                                 recv_offset,
                                 rlen,
                                 comm_rank));
            rc = MCA_PML_CALL(irecv(curr_data_rbuffer, rlen, MPI_BYTE,
                                    comm_rank, tag, comm, &(requests[*active_requests])));
            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                *iteration = i;
                PTPCOLL_VERBOSE(10, ("Recv was not completed"));
                /* we have to store the iteration number somewhere */
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
            PTPCOLL_VERBOSE(10, ("Recv was completed"));
        } else if (i < radix_mask_pow) {
            /* only send data */
            slen = pow2 * base_block_size;
            send_offset = base_block_size * ((my_group_index) & ((~(int)0) << i));
            curr_data_sbuffer = (void *)((unsigned char *)data_buffer + send_offset);
            delta = count - send_offset;
            if (0 >= delta) {
                /* we have nothing to send, skip the iteration */
                continue;
            }
            if (delta < slen) {
                slen = delta;
            }
            PTPCOLL_VERBOSE(10, ("[ pow2 %d, radix %d ] sending data %p (offset %d) , len %d , dest %d",
                                 pow2,
                                 1 << ptpcoll_module->pow_2,
                                 curr_data_sbuffer,
                                 send_offset,
                                 slen,
                                 comm_rank));
            rc = MCA_PML_CALL(isend(curr_data_sbuffer, slen, MPI_BYTE,
                                    comm_rank, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                                    &(requests[*active_requests])));
            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                *iteration = i;
                /* we have to store the iteration number somewhere */
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
        }
    }

    return BCOL_FN_COMPLETE;
}
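
The gather's offset arithmetic follows the standard binomial ownership rule: at iteration i a rank owns the 2^i-sized block whose left boundary is rank & (~0 << i), so the send and receive offsets differ only in which rank is masked. A self-contained demonstration (hypothetical block size; the mask is written in a form without the signed-shift pitfall):

#include <stdio.h>

int main(void)
{
    int base_block_size = 10; /* bytes per rank-block (hypothetical) */
    int my_rank = 5;

    for (int i = 0; i < 3; ++i) {
        int peer = my_rank ^ (1 << i);
        int mask = ~((1 << i) - 1); /* equivalent to (~0) << i */
        int send_offset = base_block_size * (my_rank & mask);
        int recv_offset = base_block_size * (peer & mask);
        printf("iter %d: peer %d, send_offset %d, recv_offset %d, block %d bytes\n",
               i, peer, send_offset, recv_offset, (1 << i) * base_block_size);
    }
    return 0;
}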

static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(mca_bcol_ptpcoll_module_t *ptpcoll_module,
    int buffer_index, void *data_buffer, int count, int base_block_size)
{
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc;
    int completed = 0; /* not completed */
    int comm_root;
    int i;
    int *radix_mask_pow =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_status_public_t status;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int pow2_group_size = ptpcoll_module->pow_2num;
    int pow2_distance;
    int my_left_boundary_rank;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int group_root_index = 0;
    void *curr_data_buffer = NULL;
    int tag =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int recv_count = 0;
    int *coll_status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    assert(0 == *active_requests);

    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot"));
    for (i = 0; i < cm->num_to_probe &&
         0 == completed; i++) {
        MCA_PML_CALL(iprobe(MPI_ANY_SOURCE, tag,
                            comm, &completed, &status));
        PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d",
                             tag));
    }

    /* the function always returns OMPI_SUCCESS, so we don't check the return code */
    if (0 == completed) {
        PTPCOLL_VERBOSE(10, ("IPROBE was not matched"));
        /* No data was received, return a no-match error */
        return BCOL_FN_NOT_STARTED;
    }

    comm_root = status.MPI_SOURCE;

    PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on the communicator is %d", comm_root));

    /* For a proxy we have to check whether we got something from the extra node */
    if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) {
        if (group_list[ptpcoll_module->proxy_extra_index] == comm_root) {
            PTPCOLL_VERBOSE(9, ("IPROBE was matched, root of the data on the communicator is the extra node %d",
                                comm_root));
            /* scatter the data among the other peers in the pow2 group */
            *radix_mask_pow = ptpcoll_module->pow_2;

            pow2_distance = ptpcoll_module->pow_2 - 1;
            curr_data_buffer = data_buffer;
            recv_count = count;
            goto PR_SCATTHER;
        }
    }

    /* Find the group index for the communicator root of the data */
    group_root_index = get_group_index_and_distance_for_binomial
        (my_group_index, comm_root, pow2_group_size, group_list, &pow2_distance);
    if (OPAL_UNLIKELY(group_root_index < 0)) {
        PTPCOLL_ERROR(("Fatal error, no group root index found, my id %d, pow2_g_size %d comm_root %d",
                       my_group_index, pow2_group_size, comm_root));
        return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("Group root index is %d, distance is %d",
                         group_root_index, pow2_distance));

    /* Use group_root_index to calculate the receive offset */

    /* Post the receive that will fetch the data */
    /* Pasha: Who is packing the data?
       Should I assume that we get a contiguous buffer?
       Or should I pack it myself?
       ===================================================================================================
       === At this stage I assume that data is contiguous, so I use MPI_BYTE datatype and COUNT = size ===
       ===================================================================================================
     */

    recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */

    my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance);

    curr_data_buffer = (void *)((unsigned char *)data_buffer +
                                (size_t) base_block_size * my_left_boundary_rank);

    *radix_mask_pow = pow2_distance;

    pow2_distance--;

PR_SCATTHER:
    PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], "
                         "recv_count %d, tag %d, addr %p, offset %d, pow2_distance %d",
                         comm_root, group_root_index, recv_count,
                         tag, curr_data_buffer,
                         my_group_index * base_block_size, pow2_distance));

    rc = MCA_PML_CALL(recv(curr_data_buffer, recv_count, MPI_BYTE,
                           comm_root, tag, comm, MPI_STATUS_IGNORE));
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to receive data"));
        return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, data was received"));

    /* Send the data forward down the binomial scatter tree */
    *coll_status = PTPCOLL_SCATTER_STARTED;
    K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
        pow2_distance,
        my_group_index, group_size, group_list,
        data_buffer, base_block_size,
        count, tag, comm, requests,
        active_requests);

    /* Since the next step (gather) does not really require
       completion on the scatter, we may return complete */
    return BCOL_FN_COMPLETE;
}
|
||||
|
||||
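/*
 * Offset math, hand-checked (illustrative values, not from the original
 * sources): my_left_boundary_rank clears the low pow2_distance bits of
 * my_group_index, e.g. for my_group_index = 6 (0b110) and
 * pow2_distance = 2 it is 6 & ~3 = 4, so this rank receives the run of
 * 1 << 2 = 4 base blocks that starts at rank 4's offset and then
 * scatters that run further down the binomial tree.
 */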
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_binomial_root_to_src(int group_root, int my_rank,
        int pow2_size, int group_size, int *distance)
{
    int root, relative_rank, src,
        pow2_distance = 0, i;

    if (group_root < pow2_size) {
        root = group_root;
    } else {
        /* the source of the data is an extra node;
           the real root is represented by some rank from
           the pow2 group */
        root = group_root - pow2_size;
        /* shortcut for the case when my rank is the root of the group */
        if (my_rank == root) {
            *distance = -1;
            return group_root;
        }
    }

    relative_rank = (my_rank - root) < 0 ? my_rank - root + pow2_size :
        my_rank - root;

    for (i = 1; i < pow2_size; i<<=1, pow2_distance++) {
        if (relative_rank & i) {
            src = my_rank ^ i;
            if (src >= pow2_size)
                src -= pow2_size;

            *distance = pow2_distance;
            return src;
        }
    }

    /* error case */
    *distance = -1;
    return -1;
}

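/*
 * Hand-checked trace of bcol_ptpcoll_binomial_root_to_src (added for
 * illustration; the numbers below are hypothetical): with
 * group_root = 0 and pow2_size = 8:
 *   my_rank = 5 (0b101): i = 1 matches the lowest set bit,
 *     so src = 5 ^ 1 = 4 and *distance = 0;
 *   my_rank = 4 (0b100): i = 4 is the first match,
 *     so src = 4 ^ 4 = 0 (the root) and *distance = 2.
 * The returned src is the peer this rank receives its block from, and
 * *distance is the level of that exchange in the binomial tree.
 */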
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int base_block_size)
{
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int tmp_radix_mask_pow =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow - 1;
    int tag =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root"));

    if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                requests, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, data was received"));

    /* Send the data forward over the binomial tree */
    *status = PTPCOLL_SCATTER_STARTED;
    K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(
            tmp_radix_mask_pow,
            my_group_index, group_size, group_list,
            data_buffer, base_block_size,
            count, tag, comm, requests,
            active_requests);

    return BCOL_FN_COMPLETE;
}

#define NARRAY_BLOCK_SIZE(size, module, level_size)           \
    ((size + (module)->full_narray_tree_num_leafs - 1) /      \
     (module)->full_narray_tree_num_leafs) *                  \
    ((module)->full_narray_tree_num_leafs /                   \
     ((0 == level_size) ?                                     \
      mca_bcol_ptpcoll_component.narray_knomial_radix :       \
      level_size))

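/*
 * Worked example of NARRAY_BLOCK_SIZE (added for illustration; the
 * numbers are hypothetical): with count = 1024 and
 * full_narray_tree_num_leafs = 8, the per-leaf block is
 * (1024 + 7) / 8 = 128 bytes.  At the tree root (level_size == 0) with
 * narray_knomial_radix = 2 the macro yields 128 * (8 / 2) = 512, and at
 * a level of size 4 it yields 128 * (8 / 4) = 256, i.e. the block
 * halves at each level down the radix-2 tree.  Note that `size` and
 * `level_size` are substituted unparenthesized, so callers should pass
 * simple expressions.
 */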
static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        int buffer_index, void *data_buffer, int count, int process_shift,
        int relative_group_index)
{
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int rc;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;
    int scatter_count = 0;
    int offset = 0;
    int base_block_size = 0;
    void *curr_data_buffer = NULL;

    PTPCOLL_VERBOSE(10, ("Running bcol_ptpcoll_bcast_narray_test_and_scatter_known_root"));

    if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                requests, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    /* Send the data forward over the N-array tree */
    *status = PTPCOLL_SCATTER_STARTED;
    if(0 == relative_group_index) {
        scatter_count = count;
    } else {
        scatter_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size);
    }

    offset = scatter_count *
        ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level;

    /* make sure that we do not overrun memory */
    if (OPAL_UNLIKELY(offset + scatter_count > count)) {
        scatter_count = count - offset;
    }

    PTPCOLL_VERBOSE(10, ("Bcast, data was received %d %d %d",
                scatter_count,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size,
                ptpcoll_module->narray_knomial_node[relative_group_index].rank_on_level));

    curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset);

    /* calculate the scatter block size for the next level of the tree */
    base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module,
            ptpcoll_module->narray_knomial_node[relative_group_index].level_size *
            mca_bcol_ptpcoll_component.narray_knomial_radix);

    PTPCOLL_VERBOSE(10, ("scatter_known_root %d %d %d %d %d", scatter_count, offset, base_block_size,
                ptpcoll_module->narray_knomial_node[relative_group_index].level_size / mca_bcol_ptpcoll_component.narray_knomial_radix,
                ptpcoll_module->full_narray_tree_num_leafs));

    NARRAY_SCATTER_NB((&ptpcoll_module->narray_knomial_node[relative_group_index]),
            process_shift, ptpcoll_module->full_narray_tree_size,
            curr_data_buffer, base_block_size, scatter_count, tag, comm,
            requests, active_requests);

    /* Bummer, I tried to prevent this, special case for the virtual root */
    if(0 == relative_group_index) {
        if (0 == mca_bcol_ptpcoll_test_all_for_match(active_requests,
                    requests, &rc)) {
            PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
            *status = PTPCOLL_ROOT_SEND_STARTED;
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

static inline __opal_attribute_always_inline__
int bcol_ptpcoll_bcast_narray_knomial_gather(mca_bcol_ptpcoll_module_t *ptpcoll_module,
        const int buffer_index, void *data_buffer, const int count,
        const int relative_group_index)
{
    int completed = 0; /* not completed */
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int blocks_in_step =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int group_size = ptpcoll_module->full_narray_tree_size;
    int i, k,
        rc,
        len, slen, rlen,
        peer, group_peer;
    size_t s_offset,
           r_offset;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    netpatterns_narray_knomial_tree_node_t *narray_node =
        &ptpcoll_module->narray_knomial_node[relative_group_index];
    netpatterns_k_exchange_node_t *k_node =
        &narray_node->k_node;
    mca_bcol_ptpcoll_component_t *cm =
        &mca_bcol_ptpcoll_component;
    size_t base_block_size =
        NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size);

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_narray_knomial_gather %d %d %d %d %d %d %d",
                ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration,
                base_block_size, count, narray_node->level_size,
                relative_group_index, k_node->n_exchanges, tag));

    /* we assume that iteration #iteration was already completed with a probe */
    for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration;
            i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) {

        len = base_block_size * blocks_in_step;

        for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
            group_peer = my_group_index +
                (k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
            if (group_peer >= group_size) {
                group_peer -= group_size;
            } else if (group_peer < 0) {
                group_peer += group_size;
            }
            peer = group_list[group_peer];

            r_offset = (size_t)k_node->rank_exchanges[i][k] / blocks_in_step *
                len;

            /* check that we do not run past the message boundary */
            if (OPAL_UNLIKELY(r_offset + len > (size_t)count)) {
                rlen = count - r_offset;
                if (OPAL_UNLIKELY(rlen <= 0)) {
                    continue;
                }
            } else {
                rlen = len;
            }
            PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p offset %d len %d %d %d tag %d",
                        peer, data_buffer, r_offset, rlen, len, blocks_in_step, tag));
            rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + r_offset),
                        rlen, MPI_BYTE,
                        peer, tag, comm, &requests[*active_requests]));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to receive data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }

        for (k = 0; k < cm->narray_knomial_radix - 1; k++) {
            group_peer = my_group_index +
                (k_node->rank_exchanges[i][k] - narray_node->rank_on_level);
            if (group_peer >= group_size) {
                group_peer -= group_size;
            } else if (group_peer < 0) {
                group_peer += group_size;
            }
            peer = group_list[group_peer];

            s_offset = (size_t)narray_node->rank_on_level / blocks_in_step *
                len;

            /* check that we do not run past the message boundary */
            if (OPAL_UNLIKELY(s_offset + len > (size_t)count)) {
                slen = count - s_offset;
                if (OPAL_UNLIKELY(slen <= 0)) {
                    continue;
                }
            } else {
                slen = len;
            }

            PTPCOLL_VERBOSE(10, ("Send data from %d, addr %p offset %d len %d %d %d tag %d",
                        peer, data_buffer, s_offset, slen, len, blocks_in_step, tag));
            rc = MCA_PML_CALL(isend((void *)((unsigned char *)data_buffer + s_offset),
                        slen, MPI_BYTE,
                        peer, tag, MCA_PML_BASE_SEND_STANDARD, comm,
                        &(requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }

        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            /* cache data for the next iteration */
            ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration =
                i; /* why not store the step for the next iteration ?! */
            ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask =
                blocks_in_step * cm->narray_knomial_radix;
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}

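/*
 * Offset arithmetic, hand-checked (illustrative values, not from the
 * original sources): with base_block_size = 128 and blocks_in_step = 2
 * we have len = 256; a peer whose rank_exchanges[i][k] is 5 contributes
 * the region starting at r_offset = (5 / 2) * 256 = 512, while a node
 * with rank_on_level = 3 sends its own region from
 * s_offset = (3 / 2) * 256 = 256.  Each exchange therefore moves whole
 * groups of blocks_in_step blocks, and blocks_in_step grows by a factor
 * of narray_knomial_radix per iteration.
 */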
END_C_DECLS

#endif
@ -1,174 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll.h"
#include "ompi/mca/bcol/base/base.h"

#include "bcol_ptpcoll_mca.h"
#include "bcol_ptpcoll_utils.h"

/*
 * Public string showing the bcol ptpcoll V2 component version number
 */
const char *mca_bcol_ptpcoll_component_version_string =
    "Open MPI bcol - ptpcoll collective MCA component version " OMPI_VERSION;


/*
 * Local functions
 */

static int ptpcoll_open(void);
static int ptpcoll_close(void);

/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */

mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component = {

    /* First, fill in the super */

    {
        /* First, the mca_component_t struct containing meta
           information about the component itself */

        .bcol_version = {
            MCA_BCOL_BASE_VERSION_2_0_0,

            /* Component name and version */

            .mca_component_name = "ptpcoll",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component open and close functions */

            .mca_open_component = ptpcoll_open,
            .mca_close_component = ptpcoll_close,
            .mca_register_component_params = mca_bcol_ptpcoll_register_mca_params,
        },

        /* Initialization / querying functions */

        .collm_init_query = mca_bcol_ptpcoll_init_query,
        .collm_comm_query = mca_bcol_ptpcoll_comm_query,
        .init_done = false,
        .need_ordering = false,
    },

    /* component specific */

};

static void
collreq_construct(mca_bcol_ptpcoll_collreq_t *collreq)
{
    collreq->requests = NULL;
}

static void
collreq_destruct(mca_bcol_ptpcoll_collreq_t *collreq)
{
    if (NULL != collreq->requests) {
        free(collreq->requests);
    }
}

OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_collreq_t,
        opal_free_list_item_t,
        collreq_construct,
        collreq_destruct);

/*
 * Open the component
 */
static int ptpcoll_open(void)
{
    return OMPI_SUCCESS;
}

/*
 * Close the component
 */
static int ptpcoll_close(void)
{
    return OMPI_SUCCESS;
}

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_bcol_ptpcoll_init_query(bool enable_progress_threads,
        bool enable_mpi_threads)
{
    /* at this stage there is no reason to disqualify this component */

    /* done */
    return OMPI_SUCCESS;
}

/* memory management routines */

/* allocate memory - this is a no-op function intended to work with
 * mpool2, which will use malloc for allocation, if no other allocator
 * is available.
 */
void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment,
        struct mca_bcol_base_module_t *bcol_module)
{
    /* do nothing */
    return NULL;
}

/*
 * register memory - nothing to do
 */
int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment,
        struct mca_bcol_base_module_t *bcol_module)
{
    /* nothing to do */
    return OMPI_SUCCESS;
}

/* deregister memory - nothing to do
 */
int bcol_ptpcoll_deregister_memory( void * in_ptr,
        struct mca_bcol_base_module_t *bcol_module)
{
    /* nothing to do */
    return OMPI_SUCCESS;
}

/* free memory - since we don't allocate, we also don't free */
int bcol_ptpcoll_free_memory(void *ptr,
        struct mca_bcol_base_module_t *bcol_module)
{
    /* nothing to do */
    return OMPI_SUCCESS;
}
@ -1,28 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h"

/*
 * Fanin routines - no user data
 */

int bcol_ptpcoll_fanin( bcol_function_args_t *input_args,
        struct mca_bcol_base_module_t *module)
{
    /* local variable */
    int ret = OMPI_SUCCESS;
    /* mca_bcol_ptpcoll_module_t *ptp_module = (mca_bcol_ptpcoll_module_t *) module; */

    /* done */
    return ret;
}
@ -1,30 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h"

/*
 * Fanout routines - no user data
 */

int bcol_ptpcoll_fanout( bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    /* local variable */
    int ret = OMPI_SUCCESS;
    /* TBD:
       mca_bcol_ptpcoll_module_t *ptp_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
     */

    /* done */
    return ret;
}
@ -1,197 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>

#include "bcol_ptpcoll_mca.h"
#include "bcol_ptpcoll.h"

/*
 * Local flags
 */
enum {
    REGINT_NEG_ONE_OK = 0x01,
    REGINT_GE_ZERO = 0x02,
    REGINT_GE_ONE = 0x04,
    REGINT_NONZERO = 0x08,
    REGINT_MAX = 0x88
};

enum {
    REGSTR_EMPTY_OK = 0x01,

    REGSTR_MAX = 0x88
};

#if 0 /* Pasha: we will need this function in the future */
/*
 * utility routine for string parameter registration
 */
static int reg_string(const char* param_name,
        const char* deprecated_param_name,
        const char* param_desc,
        const char* default_value, char **storage,
        int flags)
{
    int index;

    *storage = default_value;
    index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
            param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
            NULL, 0, 0, OPAL_INFO_LVL_9,
            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
                deprecated_param_name,
                MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) {
        opal_output(0, "Bad parameter value for parameter \"%s\"",
                param_name);
        return OMPI_ERR_BAD_PARAM;
    }

    return OMPI_SUCCESS;
}
#endif

/*
 * utility routine for integer parameter registration
 */
static int reg_int(const char* param_name,
        const char* deprecated_param_name,
        const char* param_desc,
        int default_value, int *storage, int flags)
{
    int index;

    *storage = default_value;
    index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
            param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
            NULL, 0, 0, OPAL_INFO_LVL_9,
            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
                deprecated_param_name,
                MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) {
        return OMPI_SUCCESS;
    }
    if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) ||
            (0 != (flags & REGINT_GE_ONE) && *storage < 1) ||
            (0 != (flags & REGINT_NONZERO) && 0 == *storage)) {
        opal_output(0, "Bad parameter value for parameter \"%s\"",
                param_name);
        return OMPI_ERR_BAD_PARAM;
    }

    return OMPI_SUCCESS;
}

static int reg_bool(const char* param_name,
        const char* deprecated_param_name,
        const char* param_desc,
        bool default_value, bool *storage)
{
    int index;

    *storage = default_value;
    index = mca_base_component_var_register(&mca_bcol_ptpcoll_component.super.bcol_version,
            param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL,
            NULL, 0, 0, OPAL_INFO_LVL_9,
            MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (0 > index) {
        return index;
    }

    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(index, "ompi", "bcol", "ptpcoll",
                deprecated_param_name,
                MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    return OMPI_SUCCESS;
}

int mca_bcol_ptpcoll_register_mca_params(void)
{
    int ret, tmp;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ret = OMPI_SUCCESS;
#define CHECK(expr) do {                    \
        tmp = (expr);                       \
        if (OMPI_SUCCESS != tmp) ret = tmp; \
    } while (0)

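    /* Note (added for clarity): CHECK records the most recent failure in
     * ret but keeps going, so every parameter below is still registered
     * even if an earlier one fails validation; the last error code is
     * what the caller finally sees. */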
CHECK(reg_int("priority", NULL,
|
||||
"PTPCOLL component priority"
|
||||
"(from 0(low) to 90 (high))", 90, &cm->super.priority, 0));
|
||||
|
||||
CHECK(reg_int("verbose", NULL,
|
||||
"Output some verbose PTPCOLL information "
|
||||
"(0 = no output, nonzero = output)", 0, &cm->verbose, REGINT_GE_ZERO));
|
||||
|
||||
CHECK(reg_int("k_nomial_radix", NULL,
|
||||
"The radix of K-Nomial Tree "
|
||||
"(starts from 2)", 2, &cm->k_nomial_radix, REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("narray_radix", NULL,
|
||||
"The radix of Narray Tree "
|
||||
"(starts from 2)", 2, &cm->narray_radix, REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("narray_knomial_radix", NULL,
|
||||
"The radix of Narray/Knomial Tree for scatther-gather type algorithms"
|
||||
"(starts from 2)", 2, &cm->narray_knomial_radix, REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("num_to_probe", NULL,
|
||||
"Number of probe operation in single source data check"
|
||||
"(starts from 8)", 8, &cm->num_to_probe, REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("bcast_small_msg_known_root_alg", NULL,
|
||||
"Algorithm selection for bcast small messages known root"
|
||||
"(1 - K-nomial, 2 - N-array)", 1, &cm->bcast_small_messages_known_root_alg,
|
||||
REGINT_GE_ZERO));
|
||||
|
||||
CHECK(reg_int("bcast_large_msg_known_root_alg", NULL,
|
||||
"Algorithm selection for bcast large messages known root"
|
||||
"(1 - Binomial scatther-gather, 2 - N-array scather, K-nomial gather)",
|
||||
1, &cm->bcast_large_messages_known_root_alg, REGINT_GE_ZERO));
|
||||
|
||||
CHECK(reg_int("barrier_alg", NULL,
|
||||
"Algorithm selection for Barrier"
|
||||
"(1 - Recursive doubling, 2 - Recursive K-ing)",
|
||||
1, &cm->barrier_alg, REGINT_GE_ZERO));
|
||||
|
||||
/* register parmeters controlling message fragementation */
|
||||
CHECK(reg_int("min_frag_size", NULL,
|
||||
"Minimum fragment size",
|
||||
getpagesize(), &cm->super.min_frag_size, REGINT_GE_ONE));
|
||||
|
||||
CHECK(reg_int("max_frag_size", NULL,
|
||||
"Maximum fragment size",
|
||||
FRAG_SIZE_NO_LIMIT, &cm->super.max_frag_size, REGINT_NONZERO));
|
||||
|
||||
CHECK(reg_bool("can_use_user_buffers", NULL,
|
||||
"User memory can be used by the collective algorithms",
|
||||
1, &cm->super.can_use_user_buffers));
|
||||
|
||||
return ret;
|
||||
}
|
@ -1,20 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#ifndef MCA_BCOL_PTPCOLL_MCA_H
#define MCA_BCOL_PTPCOLL_MCA_H

#include "ompi_config.h"

BEGIN_C_DECLS

int mca_bcol_ptpcoll_register_mca_params(void);

END_C_DECLS
#endif
@ -1,760 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 */

#include "ompi_config.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/util/show_help.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/pml/pml.h" /* need this for the max tag size */

#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"
#include "bcol_ptpcoll_bcast.h"
#include "bcol_ptpcoll_allreduce.h"
#include "bcol_ptpcoll_reduce.h"

#define BCOL_PTP_CACHE_LINE_SIZE 128

/*
 * Local functions
 */
static int alloc_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc = OMPI_SUCCESS, i = 0;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int n_exchanges = k_node->n_exchanges;

    /* Precalculate the allreduce offsets */
    if (0 < k_node->n_exchanges) {
        ptpcoll_module->allgather_offsets = (int **) calloc (n_exchanges, sizeof(int *));

        if (!ptpcoll_module->allgather_offsets) {
            return OMPI_ERROR;
        }

        for (i = 0; i < n_exchanges ; i++) {
            ptpcoll_module->allgather_offsets[i] = (int *) calloc (NOFFSETS, sizeof(int));

            if (!ptpcoll_module->allgather_offsets[i]){
                return OMPI_ERROR;
            }
        }
    }

    return rc;
}

static int free_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc = OMPI_SUCCESS, i = 0;
    netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree;
    int n_exchanges = k_node->n_exchanges;

    if (ptpcoll_module->allgather_offsets) {
        for (i=0; i < n_exchanges; i++) {
            free (ptpcoll_module->allgather_offsets[i]);
        }
    }

    free(ptpcoll_module->allgather_offsets);
    ptpcoll_module->allgather_offsets = NULL;
    return rc;
}

static void
mca_bcol_ptpcoll_module_construct(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    uint64_t i;
    /* Pointer to component */
    ptpcoll_module->narray_node = NULL;
    ptpcoll_module->allgather_offsets = NULL;
    ptpcoll_module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_ptpcoll_component;
    ptpcoll_module->super.list_n_connected = NULL;
    ptpcoll_module->super.hier_scather_offset = 0;
    /* no header support in ptp */
    ptpcoll_module->super.header_size = 0;
    /* No network context */
    ptpcoll_module->super.network_context = NULL;
    /* set the upper limit on the tag: find the largest power of two
       that still fits under the PML's max tag */
    i = 2;
    ptpcoll_module->tag_mask = 1;
    while ( i <= (uint64_t) mca_pml.pml_max_tag && i > 0) {
        i <<= 1;
    }
    ptpcoll_module->ml_mem.ml_buf_desc = NULL;
    ptpcoll_module->tag_mask = i - 1;
}
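/*
 * Tag-mask trace (illustrative): with mca_pml.pml_max_tag = 0x7fffffff
 * the loop leaves i = 0x80000000, so tag_mask becomes 0x7fffffff --
 * the largest all-ones mask that still fits under the PML's max tag.
 * The `i > 0` test guards against the shift wrapping to zero.
 */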
static void
mca_bcol_ptpcoll_module_destruct(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int i;
    mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem;

    if (NULL != ml_mem->ml_buf_desc) {
        /* Release the memory structs that were caching the ML memory data */
        uint32_t i, j, ci;
        for (i = 0; i < ml_mem->num_banks; i++) {
            for (j = 0; j < ml_mem->num_buffers_per_bank; j++) {
                ci = i * ml_mem->num_buffers_per_bank + j;
                if (NULL != ml_mem->ml_buf_desc[ci].requests) {
                    free(ml_mem->ml_buf_desc[ci].requests);
                }
            }
        }
        /* release the buffer descriptor */
        free(ml_mem->ml_buf_desc);
        ml_mem->ml_buf_desc = NULL;
    }

    if (NULL != ptpcoll_module->allgather_offsets) {
        free_allreduce_offsets_array(ptpcoll_module);
    }

    if (NULL != ptpcoll_module->narray_node) {
        for (i = 0; i < ptpcoll_module->group_size; i++) {
            if (NULL != ptpcoll_module->narray_node[i].children_ranks) {
                free(ptpcoll_module->narray_node[i].children_ranks);
            }
        }

        free(ptpcoll_module->narray_node);
        ptpcoll_module->narray_node = NULL;
    }

    OBJ_DESTRUCT(&ptpcoll_module->collreqs_free);

    if (NULL != ptpcoll_module->super.list_n_connected) {
        free(ptpcoll_module->super.list_n_connected);
        ptpcoll_module->super.list_n_connected = NULL;
    }

    for (i = 0; i < BCOL_NUM_OF_FUNCTIONS; i++){
        OPAL_LIST_DESTRUCT((&ptpcoll_module->super.bcol_fns_table[i]));
    }

    if (NULL != ptpcoll_module->kn_proxy_extra_index) {
        free(ptpcoll_module->kn_proxy_extra_index);
        ptpcoll_module->kn_proxy_extra_index = NULL;
    }

    if (NULL != ptpcoll_module->alltoall_iovec) {
        free(ptpcoll_module->alltoall_iovec);
        ptpcoll_module->alltoall_iovec = NULL;
    }

    if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) {
        free(ptpcoll_module->narray_knomial_proxy_extra_index);
        ptpcoll_module->narray_knomial_proxy_extra_index = NULL;
    }

    if (NULL != ptpcoll_module->narray_knomial_node) {
        for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) {
            netpatterns_cleanup_narray_knomial_tree (ptpcoll_module->narray_knomial_node + i);
        }
        free(ptpcoll_module->narray_knomial_node);
        ptpcoll_module->narray_knomial_node = NULL;
    }

    netpatterns_cleanup_recursive_knomial_allgather_tree_node(&ptpcoll_module->knomial_allgather_tree);
    netpatterns_cleanup_recursive_knomial_tree_node(&ptpcoll_module->knomial_exchange_tree);
}

OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_module_t,
        mca_bcol_base_module_t,
        mca_bcol_ptpcoll_module_construct,
        mca_bcol_ptpcoll_module_destruct);

static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base_addr, uint32_t num_banks,
        uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k)
{
    uint32_t i, j, ci;
    mca_bcol_ptpcoll_ml_buffer_desc_t *tmp_desc = NULL;
    int k_nomial_radix = mca_bcol_ptpcoll_component.k_nomial_radix;
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    int num_to_alloc =
        ((k_nomial_radix - 1) * pow_k_val * 2 + 1 > mca_bcol_ptpcoll_component.narray_radix) ?
        (k_nomial_radix - 1) * pow_k_val * 2 + 1 :
        mca_bcol_ptpcoll_component.narray_radix * 2;

    *desc = (mca_bcol_ptpcoll_ml_buffer_desc_t *)calloc(num_banks * num_buffers_per_bank,
            sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t));
    if (NULL == *desc) {
        PTPCOLL_ERROR(("Failed to allocate memory"));
        return OMPI_ERROR;
    }

    tmp_desc = *desc;

    for (i = 0; i < num_banks; i++) {
        for (j = 0; j < num_buffers_per_bank; j++) {
            ci = i * num_buffers_per_bank + j;
            tmp_desc[ci].bank_index = i;
            tmp_desc[ci].buffer_index = j;
            /* *2 is for the gather session, +1 for the extra peer */
            tmp_desc[ci].requests = (ompi_request_t **)
                calloc(num_to_alloc, sizeof(ompi_request_t *));
            if (NULL == tmp_desc[ci].requests) {
                PTPCOLL_ERROR(("Failed to allocate memory for requests"));
                return OMPI_ERROR;
            }
            /*
             * ptpcoll doesn't have any header, but other bcols may,
             * so we need to take that into account.
             */
            tmp_desc[ci].data_addr = (void *)
                ((unsigned char*)base_addr + ci * size_buffer + header_size);
            PTPCOLL_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr));

            /* init reduce implementation flags */
            tmp_desc[ci].reduce_init_called = false;
            tmp_desc[ci].reduction_status = 0;
        }
    }

    return OMPI_SUCCESS;
}
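/*
 * num_to_alloc, worked through (illustrative values): with
 * k_nomial_radix = 3 and pow_k = 2 the K-nomial side needs
 * (3 - 1) * 2 * 2 + 1 = 9 request slots, which exceeds narray_radix = 2,
 * so 9 requests are preallocated per buffer (the alternative would have
 * been narray_radix * 2 = 4) -- enough for a send and a receive per
 * exchange plus the extra peer.
 */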
static void mca_bcol_ptpcoll_set_small_msg_thresholds(struct mca_bcol_base_module_t *super)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *) super;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    /* Subtract out the maximum header size when calculating the thresholds. This
     * will account for the headers used by the basesmuma component. If we do not
     * take these headers into account we may overrun our buffer. */

    /* Set the Allgather threshold equal to the ML buffer size */
    super->small_message_thresholds[BCOL_ALLGATHER] =
        (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) /
        ompi_comm_size(ptpcoll_module->super.sbgp_partner_module->group_comm);

    /* Set the Bcast threshold; all Bcast algorithms have the same threshold */
    super->small_message_thresholds[BCOL_BCAST] =
        (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX);

    /* Set the Alltoall threshold; the Ring algorithm sets some limitation */
    super->small_message_thresholds[BCOL_ALLTOALL] =
        (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / 2;

    /* Set the Allreduce threshold; the NARRAY algorithm sets some limitation */
    super->small_message_thresholds[BCOL_ALLREDUCE] =
        (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / ptpcoll_module->k_nomial_radix;

    /* Set the Reduce threshold; the NARRAY algorithm sets some limitation */
    super->small_message_thresholds[BCOL_REDUCE] =
        (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / cm->narray_radix;
}

/*
 * Cache information about ML memory
 */
static int mca_bcol_ptpcoll_cache_ml_memory_info(struct mca_bcol_base_memory_block_desc_t *payload_block,
        uint32_t data_offset,
        struct mca_bcol_base_module_t *bcol,
        void *reg_data)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) bcol;
    mca_bcol_ptpcoll_local_mlmem_desc_t *ml_mem = &ptpcoll_module->ml_mem;
    struct mca_bcol_base_memory_block_desc_t *desc = payload_block;
    int group_size = ptpcoll_module->super.sbgp_partner_module->group_size;

    PTPCOLL_VERBOSE(10, ("mca_bcol_ptpcoll_cache_ml_memory_info was called"));

    /* cache the ml mem desc tunings locally */
    ml_mem->num_banks = desc->num_banks;
    ml_mem->num_buffers_per_bank = desc->num_buffers_per_bank;
    ml_mem->size_buffer = desc->size_buffer;

    PTPCOLL_VERBOSE(10, ("ML buffer configuration num banks %d num_per_bank %d size %d base addr %p",
                desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, desc->block->base_addr));

    /* Set the first bank index for release */
    ml_mem->bank_index_for_release = 0;

    if (OMPI_SUCCESS != init_ml_buf_desc(&ml_mem->ml_buf_desc,
                desc->block->base_addr,
                ml_mem->num_banks,
                ml_mem->num_buffers_per_bank,
                ml_mem->size_buffer,
                data_offset,
                group_size,
                ptpcoll_module->pow_k)) {
        PTPCOLL_VERBOSE(10, ("Failed to allocate rdma memory descriptor\n"));
        return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("ptpcoll_module = %p, ml_mem_desc = %p.\n",
                (void *) ptpcoll_module, (void *) ml_mem));

    return OMPI_SUCCESS;
}

/*
 * Load ptpcoll bcol functions
 */
static void load_func(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int fnc;

    /* reset everything to NULL */
    for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) {
        ptpcoll_module->super.bcol_function_table[fnc] = NULL;
        ptpcoll_module->super.bcol_function_init_table[fnc] = NULL;
    }

    ptpcoll_module->super.bcol_function_init_table[BCOL_BARRIER] = bcol_ptpcoll_barrier_init;

    ptpcoll_module->super.bcol_function_init_table[BCOL_BCAST] = bcol_ptpcoll_bcast_init;
    ptpcoll_module->super.bcol_function_init_table[BCOL_ALLREDUCE] = bcol_ptpcoll_allreduce_init;
    ptpcoll_module->super.bcol_function_init_table[BCOL_ALLGATHER] = bcol_ptpcoll_allgather_init;
    ptpcoll_module->super.bcol_function_table[BCOL_BCAST] = bcol_ptpcoll_bcast_k_nomial_anyroot;
    ptpcoll_module->super.bcol_function_init_table[BCOL_ALLTOALL] = NULL;
    ptpcoll_module->super.bcol_function_init_table[BCOL_SYNC] = mca_bcol_ptpcoll_memsync_init;
    ptpcoll_module->super.bcol_function_init_table[BCOL_REDUCE] = bcol_ptpcoll_reduce_init;

    /* ML memory cacher */
    ptpcoll_module->super.bcol_memory_init = mca_bcol_ptpcoll_cache_ml_memory_info;

    /* Set thresholds */
    ptpcoll_module->super.set_small_msg_thresholds = mca_bcol_ptpcoll_set_small_msg_thresholds;

    /* setup the recursive k-ing tree */
    ptpcoll_module->super.k_nomial_tree = mca_bcol_ptpcoll_setup_knomial_tree;
}

int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super)
{
    mca_bcol_ptpcoll_module_t *p2p_module = (mca_bcol_ptpcoll_module_t *) super;
    int rc = 0;

    rc = netpatterns_setup_recursive_knomial_allgather_tree_node(
            p2p_module->super.sbgp_partner_module->group_size,
            p2p_module->super.sbgp_partner_module->my_index,
            mca_bcol_ptpcoll_component.k_nomial_radix,
            super->list_n_connected,
            &p2p_module->knomial_allgather_tree);

    return rc;
}

/* The function used to calculate the full tree size */
static int calc_full_tree_size(int radix, int group_size, int *num_leafs)
{
    int level_cnt = 1;
    int total_cnt = 0;

    while( total_cnt < group_size ) {
        total_cnt += level_cnt;
        level_cnt *= radix;
    }

    if (total_cnt > group_size) {
        *num_leafs = level_cnt / radix;
        return total_cnt - level_cnt / radix;
    } else {
        *num_leafs = level_cnt;
        return group_size;
    }
}
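/*
 * Mechanical trace of calc_full_tree_size (illustrative): for radix = 2
 * and group_size = 7 the loop fills levels 1 + 2 + 4, so total_cnt = 7
 * equals group_size and the call returns 7 with *num_leafs = 8
 * (level_cnt after the last doubling).  For group_size = 6 the loop
 * overshoots to total_cnt = 7, so the last level is trimmed: the
 * function returns 7 - 8/2 = 3 with *num_leafs = 8/2 = 4, and the ranks
 * beyond the full tree are handled as "extra" nodes below.
 */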
/* Setup the N-array scatter / K-nomial gather static information */
static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc, i, peer;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ptpcoll_module->full_narray_tree_size = calc_full_tree_size(
            cm->narray_knomial_radix,
            ptpcoll_module->group_size,
            &ptpcoll_module->full_narray_tree_num_leafs);

    ptpcoll_module->narray_knomial_proxy_extra_index = (int *)
        malloc(sizeof(int) * (cm->narray_knomial_radix));
    if (NULL == ptpcoll_module->narray_knomial_proxy_extra_index) {
        PTPCOLL_ERROR(("Failed to allocate memory"));
        goto Error;
    }

    ptpcoll_module->narray_knomial_node = calloc(
            ptpcoll_module->full_narray_tree_size,
            sizeof(netpatterns_narray_knomial_tree_node_t));
    if(NULL == ptpcoll_module->narray_knomial_node) {
        goto Error;
    }

    PTPCOLL_VERBOSE(10, ("My type is proxy, full tree size = %d [%d]",
                ptpcoll_module->full_narray_tree_size,
                cm->narray_knomial_radix
                ));

    if (ptpcoll_module->super.sbgp_partner_module->my_index <
            ptpcoll_module->full_narray_tree_size) {
        if (ptpcoll_module->super.sbgp_partner_module->my_index <
                ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) {
            ptpcoll_module->narray_type = PTPCOLL_PROXY;
            for (i = 0; i < cm->narray_knomial_radix; i++) {
                peer =
                    ptpcoll_module->super.sbgp_partner_module->my_index *
                    cm->narray_knomial_radix + i +
                    ptpcoll_module->full_narray_tree_size;
                if (peer >= ptpcoll_module->group_size) {
                    break;
                }
                ptpcoll_module->narray_knomial_proxy_extra_index[i] = peer;
            }
            ptpcoll_module->narray_knomial_proxy_num = i;
        } else {
            ptpcoll_module->narray_type = PTPCOLL_IN_GROUP;
        }
        /* Setting node info */
        for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) {
            rc = netpatterns_setup_narray_knomial_tree(
                    cm->narray_knomial_radix,
                    i,
                    ptpcoll_module->full_narray_tree_size,
                    &ptpcoll_module->narray_knomial_node[i]);
            if(OMPI_SUCCESS != rc) {
                goto Error;
            }
        }
    } else {
        ptpcoll_module->narray_type = PTPCOLL_EXTRA;
        ptpcoll_module->narray_knomial_proxy_extra_index[0] =
            (ptpcoll_module->super.sbgp_partner_module->my_index -
             ptpcoll_module->full_narray_tree_size) /
            cm->narray_knomial_radix;
    }

    return OMPI_SUCCESS;

Error:
    if (NULL != ptpcoll_module->narray_knomial_node) {
        free(ptpcoll_module->narray_knomial_node);
    }
    if (NULL != ptpcoll_module->narray_knomial_proxy_extra_index) {
        free(ptpcoll_module->narray_knomial_proxy_extra_index);
    }
    return OMPI_ERROR;
}

/* Setup the N-array static information */
static int load_narray_tree(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc, i;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ptpcoll_module->narray_node = calloc(ptpcoll_module->group_size,
            sizeof(netpatterns_tree_node_t));
    if(NULL == ptpcoll_module->narray_node ) {
        goto Error;
    }

    for(i = 0; i < ptpcoll_module->group_size; i++) {
        rc = netpatterns_setup_narray_tree(
                cm->narray_radix,
                i,
                ptpcoll_module->group_size,
                &ptpcoll_module->narray_node[i]);
        if(OMPI_SUCCESS != rc) {
            goto Error;
        }
    }

    return OMPI_SUCCESS;

Error:
    if (NULL != ptpcoll_module->narray_node) {
        free(ptpcoll_module->narray_node);
    }
    return OMPI_ERROR;
}

static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int i;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ptpcoll_module->k_nomial_radix =
        cm->k_nomial_radix > ptpcoll_module->group_size ?
        ptpcoll_module->group_size :
        cm->k_nomial_radix;

    ptpcoll_module->pow_k = pow_k_calc(ptpcoll_module->k_nomial_radix,
            ptpcoll_module->group_size,
            &ptpcoll_module->pow_knum);

    ptpcoll_module->kn_proxy_extra_index = (int *)
        malloc(sizeof(int) * (ptpcoll_module->k_nomial_radix - 1));
    if (NULL == ptpcoll_module->kn_proxy_extra_index) {
        PTPCOLL_ERROR(("Failed to allocate memory"));
        goto Error;
    }

    /* Setting the peer type for the K-nomial algorithm */
    if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_knum ) {
        if (ptpcoll_module->super.sbgp_partner_module->my_index <
                ptpcoll_module->group_size - ptpcoll_module->pow_knum) {
            for (i = 0;
                    i < (ptpcoll_module->k_nomial_radix - 1) &&
                    ptpcoll_module->super.sbgp_partner_module->my_index *
                    (ptpcoll_module->k_nomial_radix - 1) +
                    i + ptpcoll_module->pow_knum < ptpcoll_module->group_size
                    ; i++) {
                ptpcoll_module->pow_ktype = PTPCOLL_KN_PROXY;
                ptpcoll_module->kn_proxy_extra_index[i] =
                    ptpcoll_module->super.sbgp_partner_module->my_index *
                    (ptpcoll_module->k_nomial_radix - 1) +
                    i + ptpcoll_module->pow_knum;
                PTPCOLL_VERBOSE(10, ("My type is proxy, pow_knum = %d [%d] my extra %d",
                            ptpcoll_module->pow_knum,
                            ptpcoll_module->pow_k,
                            ptpcoll_module->kn_proxy_extra_index[i]));
            }
            ptpcoll_module->kn_proxy_extra_num = i;
        } else {
            PTPCOLL_VERBOSE(10, ("My type is in group, pow_knum = %d [%d]", ptpcoll_module->pow_knum,
                        ptpcoll_module->pow_k));
            ptpcoll_module->pow_ktype = PTPCOLL_KN_IN_GROUP;
        }
    } else {
        ptpcoll_module->pow_ktype = PTPCOLL_KN_EXTRA;
        ptpcoll_module->kn_proxy_extra_index[0] = (ptpcoll_module->super.sbgp_partner_module->my_index -
                ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1);
        PTPCOLL_VERBOSE(10, ("My type is extra, pow_knum = %d [%d] my proxy %d",
                    ptpcoll_module->pow_knum,
                    ptpcoll_module->pow_k,
                    ptpcoll_module->kn_proxy_extra_index[0]));
    }

    return OMPI_SUCCESS;

Error:
    if (NULL != ptpcoll_module->kn_proxy_extra_index) {
        free(ptpcoll_module->kn_proxy_extra_index);
    }

    return OMPI_ERROR;
}
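/*
 * Note on pow_k_calc (hedged -- its definition is not part of this
 * diff): judging by the asserts in load_binomial_info below,
 * pow_k_calc(radix, size, &num) returns the exponent e and stores in
 * num the largest power radix^e that does not exceed size; e.g.
 * pow_k_calc(2, 6, &num) would give e = 2, num = 4.
 */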
static int load_binomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    ptpcoll_module->pow_2 = pow_k_calc(2,
            ptpcoll_module->group_size,
            &ptpcoll_module->pow_2num);

    assert(ptpcoll_module->pow_2num == 1 << ptpcoll_module->pow_2);
    assert(ptpcoll_module->pow_2num <= ptpcoll_module->group_size);

    /* Setting the peer type for the binary algorithm */
    if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_2num ) {
        if (ptpcoll_module->super.sbgp_partner_module->my_index <
                ptpcoll_module->group_size - ptpcoll_module->pow_2num) {
            PTPCOLL_VERBOSE(10, ("My type is proxy, pow_2num = %d [%d]", ptpcoll_module->pow_2num,
                        ptpcoll_module->pow_2));
            ptpcoll_module->pow_2type = PTPCOLL_PROXY;
            ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index +
                ptpcoll_module->pow_2num;
        } else {
            PTPCOLL_VERBOSE(10, ("My type is in group, pow_2num = %d [%d]", ptpcoll_module->pow_2num,
                        ptpcoll_module->pow_2));
            ptpcoll_module->pow_2type = PTPCOLL_IN_GROUP;
        }
    } else {
        PTPCOLL_VERBOSE(10, ("My type is extra, pow_2num = %d [%d]", ptpcoll_module->pow_2num,
                    ptpcoll_module->pow_2));
        ptpcoll_module->pow_2type = PTPCOLL_EXTRA;
        ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index -
            ptpcoll_module->pow_2num;
    }
    return OMPI_SUCCESS;
}
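/*
 * Partition example (illustrative, not from the original sources): for
 * group_size = 6, pow_k_calc(2, 6, ...) gives pow_2num = 4, pow_2 = 2.
 * Ranks 0-1 become PTPCOLL_PROXY and pair with extras 4-5
 * (proxy_extra_index = rank + 4); ranks 2-3 are PTPCOLL_IN_GROUP; ranks
 * 4-5 are PTPCOLL_EXTRA and point back at proxies 0-1
 * (proxy_extra_index = rank - 4).  The pow2 subgroup {0,1,2,3} then
 * runs the power-of-two exchanges on behalf of everyone.
 */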
static int load_recursive_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc = OMPI_SUCCESS;
    rc = netpatterns_setup_recursive_knomial_tree_node(
            ptpcoll_module->group_size,
            ptpcoll_module->super.sbgp_partner_module->my_index,
            mca_bcol_ptpcoll_component.k_nomial_radix,
            &ptpcoll_module->knomial_exchange_tree);
    return rc;
}

static int bcol_ptpcoll_collreq_init(opal_free_list_item_t *item, void* ctx)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) ctx;
    mca_bcol_ptpcoll_collreq_t *collreq = (mca_bcol_ptpcoll_collreq_t *) item;

    switch(mca_bcol_ptpcoll_component.barrier_alg) {
        case 1:
            collreq->requests = (ompi_request_t **)
                calloc(2, sizeof(ompi_request_t *));
            break;
        case 2:
            collreq->requests = (ompi_request_t **)
                calloc(2 * ptpcoll_module->k_nomial_radix, sizeof(ompi_request_t *));
            break;
    }

    if (NULL == collreq->requests) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    return OPAL_SUCCESS;
}

/* query to see if the module is available for use on the given
 * communicator, and if so, what its priority is. This is where
 * the backing shared-memory file is created.
 */
mca_bcol_base_module_t **mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp,
        int *num_modules)
{
    int rc;
    /* local variables */
    struct ompi_communicator_t *comm = sbgp->group_comm;
    mca_bcol_ptpcoll_module_t *ptpcoll_module = NULL;
    mca_bcol_base_module_t **ptpcoll_modules = NULL;
    int iovec_size;

    /* initialize local variables */
    *num_modules = 0;

    /*
     * This is activated only for intra-communicators
     */
    if (OMPI_COMM_IS_INTER(comm) ) {
        return NULL;
    }

    /* allocate and initialize a ptpcoll module */
    ptpcoll_modules = (mca_bcol_base_module_t **) malloc(sizeof(mca_bcol_base_module_t *));
    if (NULL == ptpcoll_modules) {
        return NULL;
    }

    ptpcoll_module = OBJ_NEW(mca_bcol_ptpcoll_module_t);
    if (NULL == ptpcoll_module) {
        free(ptpcoll_modules);
        return NULL;
    }

    /* At this stage we support only a single module */
    ptpcoll_modules[*num_modules] = &(ptpcoll_module->super);

    (*num_modules)++;
    /* set the subgroup */
    ptpcoll_module->super.sbgp_partner_module = sbgp;
    /* cache some useful information */
    ptpcoll_module->group_size =
        ptpcoll_module->super.sbgp_partner_module->group_size;

    rc = load_binomial_info(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to load binomial info"));
        goto CLEANUP;
    }

    rc = load_knomial_info(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to load knomial info"));
        goto CLEANUP;
    }

    rc = load_narray_tree(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to load narray tree"));
        goto CLEANUP;
    }

    rc = load_narray_knomial_tree(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to load narray-knomial tree"));
        goto CLEANUP;
    }

    rc = load_recursive_knomial_info(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to load recursive knomial tree"));
        goto CLEANUP;
    }

    /* create the collfrag free list */
    OBJ_CONSTRUCT(&ptpcoll_module->collreqs_free, opal_free_list_t);
    rc = opal_free_list_init (&ptpcoll_module->collreqs_free,
            sizeof(mca_bcol_ptpcoll_collreq_t),
            BCOL_PTP_CACHE_LINE_SIZE,
            OBJ_CLASS(mca_bcol_ptpcoll_collreq_t),
            0, BCOL_PTP_CACHE_LINE_SIZE,
            256 /* free_list_num */,
            -1 /* free_list_max, -1 = infinite */,
            32 /* free_list_inc */,
            NULL, 0, NULL,
            bcol_ptpcoll_collreq_init,
            ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        goto CLEANUP;
    }

    load_func(ptpcoll_module);

    rc = alloc_allreduce_offsets_array(ptpcoll_module);
    if (OMPI_SUCCESS != rc) {
        goto CLEANUP;
    }

    /* Allocate the iovec for PTP alltoall */
    iovec_size = ptpcoll_module->group_size / 2 + ptpcoll_module->group_size % 2;
    ptpcoll_module->alltoall_iovec = (struct iovec *) malloc(sizeof(struct iovec)
            * iovec_size);
    ptpcoll_module->log_group_size = lognum(ptpcoll_module->group_size);

    rc = mca_bcol_base_bcol_fns_table_init(&(ptpcoll_module->super));
    if (OMPI_SUCCESS != rc) {
        goto CLEANUP;
    }

    /* Zero copy is supported */
    ptpcoll_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY;

    /* return */
    return ptpcoll_modules;

CLEANUP:
    OBJ_RELEASE(ptpcoll_module);
    free(ptpcoll_modules);
    return NULL;
}
@ -1,405 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_reduce.h"
#include "bcol_ptpcoll_utils.h"

static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args);

static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args);


#define NARRAY_RECV_NB(narray_node, process_shift, group_size,            \
                       recv_buffer, pack_len, tag, comm, recv_requests,   \
                       num_pending_recvs)                                 \
do {                                                                      \
    int n, rc = OMPI_SUCCESS;                                             \
    int dst;                                                              \
    int comm_dst;                                                         \
    int offset = 0;                                                       \
                                                                          \
    /* Receive data from all relevant children */                        \
    for (n = 0; n < narray_node->n_children; n++) {                       \
                                                                          \
        dst = narray_node->children_ranks[n] + process_shift;             \
        if (dst >= group_size) {                                          \
            dst -= group_size;                                            \
        }                                                                 \
        comm_dst = group_list[dst];                                       \
                                                                          \
        /* Non-blocking receive .... */                                   \
        PTPCOLL_VERBOSE(1, ("Reduce, Irecv data from %d[%d], count %d, tag %d, addr %p", \
                            dst, comm_dst, pack_len, tag,                 \
                            data_buffer));                                \
        rc = MCA_PML_CALL(irecv((void *)((unsigned char*)recv_buffer + offset), pack_len, MPI_BYTE, \
                                comm_dst, tag, comm,                      \
                                &(recv_requests[*num_pending_recvs])));   \
        if (OMPI_SUCCESS != rc) {                                         \
            PTPCOLL_VERBOSE(10, ("Failed to start non-blocking receive")); \
            return OMPI_ERROR;                                            \
        }                                                                 \
        ++(*num_pending_recvs);                                           \
        offset += pack_len;                                               \
    }                                                                     \
} while(0)
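/*
 * Editorial note, not part of the removed source: the dst arithmetic in the
 * macro above implements virtual re-rooting.  The N-ary tree is always built
 * as if group rank 0 were the root; adding the real root's index to every
 * tree rank (mod group_size) rotates the whole tree onto the actual root.
 * A hypothetical standalone version of that step:
 */
static inline int sketch_rotate_tree_rank(int tree_rank, int root_shift,
                                          int group_size)
{
    int dst = tree_rank + root_shift;
    if (dst >= group_size) {
        dst -= group_size;  /* cheap mod: dst is always < 2 * group_size here */
    }
    return dst;
}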
static inline int narray_reduce(void *data_buffer, void *recv_buffer,
                                int nrecvs, int count,
                                struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                                int *reduction_status) {
    int pack_len = count * dtype->super.size;
    int i = 0;
    void *source_buffer = NULL, *result_buffer = NULL;

    source_buffer = data_buffer;
    result_buffer = recv_buffer;

    for (i = 0; i < nrecvs; i++) {
        ompi_op_reduce(op, (void*)((unsigned char*) source_buffer),
                       (void*)((unsigned char*) result_buffer),
                       count, dtype);

        source_buffer = (void *)((unsigned char*)recv_buffer
                                 + (i+1) * pack_len);
    }

    *reduction_status = 1;
    return OMPI_SUCCESS;
}

static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;

    int tag = -1;
    int rc;
    int group_size = ptpcoll_module->group_size;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    uint32_t buffer_index = input_args->buffer_index;
    struct ompi_op_t *op = input_args->op;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **send_request =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];
    ompi_request_t **recv_requests =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1];
    void *data_buffer = NULL;
    void *src_buffer = (void *) (
        (unsigned char *)input_args->sbuf +
        (size_t)input_args->sbuf_offset);
    void *recv_buffer = (void *) (
        (unsigned char *)input_args->rbuf +
        (size_t)input_args->rbuf_offset);
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int pack_len = input_args->count * input_args->dtype->super.size;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int matched = false;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int relative_group_index = 0;
    netpatterns_tree_node_t *narray_node = NULL;
    bool not_sent = false;
    int parent_rank = -1, comm_parent_rank = -1;
    int group_root_index = input_args->root;

    if (!ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called) {
        bcol_ptpcoll_reduce_narray(input_args, const_args);
    }
    /*
     * By default the src buffer is the data buffer;
     * only after the reduction does the recv buffer become
     * the data buffer
     */
    data_buffer = src_buffer;

    relative_group_index = my_group_index - group_root_index;
    if (relative_group_index < 0) {
        relative_group_index += group_size;
    }

    /* keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    narray_node = &ptpcoll_module->narray_node[relative_group_index];

    PTPCOLL_VERBOSE(3, ("reduce, Narray tree Progress"));

    PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_reduce_narray, buffer index: %d "
                        "tag: %d "
                        "tag_mask: %d "
                        "sn: %d "
                        "root: %d [%d]"
                        "buff: %p ",
                        buffer_index, tag,
                        ptpcoll_module->tag_mask, input_args->sequence_num,
                        input_args->root_flag, input_args->root_route->rank,
                        data_buffer));

    /* Check if the data was received */
    if (0 != *active_requests) {
        matched = mca_bcol_ptpcoll_test_all_for_match
            (active_requests, recv_requests, &rc);
        if (OMPI_SUCCESS != rc) {
            return OMPI_ERROR;
        }

        /* All data was received, so do the reduction */
        if (matched) {
            narray_reduce(data_buffer, recv_buffer, narray_node->n_children, count, dtype, op,
                          &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status);

            /*
             * The reduction result is in the recv buffer, so it is the new data
             * buffer
             */
            data_buffer = recv_buffer;

            /* Since the reduction only just completed, the send to the
             * parent has not been posted yet */
            not_sent = true;
        } else {
            PTPCOLL_VERBOSE(10, ("reduce has been started"));
            return BCOL_FN_STARTED;
        }
    }

    /* I'm root, I'm done */
    if (input_args->root_flag) {
        return BCOL_FN_COMPLETE;
    }

    PTPCOLL_VERBOSE(1, ("Testing Sending Match"));

    /* If the send was not posted yet */
    /* Manju: Leaf node should never post in the progress logic */
    if (not_sent) {
        parent_rank =
            ptpcoll_module->narray_node[relative_group_index].parent_rank +
            group_root_index;
        if (parent_rank >= group_size) {
            parent_rank -= group_size;
        }

        comm_parent_rank = group_list[parent_rank];
        PTPCOLL_VERBOSE(1, ("Sending data to %d ", comm_parent_rank));

        rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE,
                                comm_parent_rank,
                                tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request));
        if (OMPI_SUCCESS != rc) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            return OMPI_ERROR;
        }
    }

    if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        /* Data has not been sent. Return that the collective has been started,
         * because we MUST call test on this request once it is finished to
         * ensure that it is properly freed. */
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}
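/*
 * Editorial note, not part of the removed source: a sketch of the
 * collective-tag scheme used in both entry points of this file.  The tag is
 * derived from the collective sequence number, masked down to the tag range
 * the PML supports, and then negated; negative tags are reserved for
 * internal traffic, so collective messages can never match user-level
 * (non-negative) tags.
 */
static inline int sketch_make_coll_tag(int sequence_num, int tag_mask)
{
    int tag = (PTPCOLL_TAG_OFFSET + sequence_num * PTPCOLL_TAG_FACTOR) & tag_mask;
    return -tag;
}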
static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args,
                struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;

    int tag;
    int rc;
    int group_size = ptpcoll_module->group_size;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    uint32_t buffer_index = input_args->buffer_index;

    struct ompi_op_t *op = input_args->op;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **recv_requests =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1];
    ompi_request_t **send_request =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];

    void *data_buffer = NULL;
    void *src_buffer = (void *) (
        (unsigned char *)input_args->sbuf +
        (size_t)input_args->sbuf_offset);
    void *recv_buffer = (void *) (
        (unsigned char *)input_args->rbuf +
        (size_t)input_args->rbuf_offset);
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int pack_len = input_args->count * input_args->dtype->super.size;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int matched = true;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int group_root_index = -1;
    int relative_group_index = 0;
    netpatterns_tree_node_t *narray_node = NULL;
    int parent_rank = -1, comm_parent_rank = -1;


    /* This is the first function that should be called, not progress.
     * The fragmentation code works this way, so switch from progress to here.
     * The flag records that we have entered this code.
     */
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called = true;

    PTPCOLL_VERBOSE(1, ("Reduce, Narray tree"));
    /* reset active request counter */
    (*active_requests) = 0;
    /* keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    PTPCOLL_VERBOSE(1, ("bcol_ptpcoll_reduce_narray, buffer index: %d "
                        "tag: %d "
                        "tag_mask: %d "
                        "sn: %d "
                        "root: %d "
                        "buff: %p ",
                        buffer_index, tag,
                        ptpcoll_module->tag_mask, input_args->sequence_num,
                        input_args->root_flag,
                        src_buffer));

    /* Compute Root Index Shift */
    group_root_index = input_args->root;
    relative_group_index = my_group_index - group_root_index;
    if (relative_group_index < 0) {
        relative_group_index += group_size;
    }

    narray_node = &ptpcoll_module->narray_node[relative_group_index];

    if (0 == narray_node->n_children) {
        PTPCOLL_VERBOSE(10, ("I'm a leaf of the tree"));
        /*
         * Leaf node: there are no children to receive from,
         * so just send the local data up to the parent
         */
        data_buffer = src_buffer;
        goto NARRAY_SEND_DATA;
    }

    /* Not a leaf, either an internal node or the root */
    NARRAY_RECV_NB(narray_node, group_root_index, group_size,
                   recv_buffer, pack_len, tag, comm, recv_requests,
                   active_requests);


    /* We have not done the reduction yet */
    ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status = 0;

    /* We cannot block, so run a couple of tests for data arrival */
    matched = mca_bcol_ptpcoll_test_all_for_match
        (active_requests, recv_requests, &rc);

    /* Check if the data was received */
    if (matched) {

        narray_reduce(src_buffer, recv_buffer, narray_node->n_children,
                      count, dtype, op, &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status);
        PTPCOLL_VERBOSE(1, ("Reduce, received data from all children "));
        data_buffer = recv_buffer;

    } else {

        PTPCOLL_VERBOSE(1, ("reduce has been started"));
        return BCOL_FN_STARTED;
    }

    /* I'm root, I'm done */
    if (input_args->root_flag) {
        return BCOL_FN_COMPLETE;
    }


NARRAY_SEND_DATA:

    /*
     * Send the data (the reduction result in the case of internal nodes, or
     * just the local data in the case of leaf nodes) to the parent
     */
    narray_node = &ptpcoll_module->narray_node[relative_group_index];

    parent_rank =
        ptpcoll_module->narray_node[relative_group_index].parent_rank +
        group_root_index;
    if (parent_rank >= group_size) {
        parent_rank -= group_size;
    }

    comm_parent_rank = group_list[parent_rank];
    PTPCOLL_VERBOSE(1, ("Sending data to %d ", comm_parent_rank));

    rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE,
                            comm_parent_rank,
                            tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request));
    if (OMPI_SUCCESS != rc) {
        PTPCOLL_VERBOSE(10, ("Failed to send data"));
        return OMPI_ERROR;
    }

    /* We cannot block, so test once whether the send has completed */
    if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        /* The send has not completed yet; report the collective as started */
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    return BCOL_FN_COMPLETE;
}


int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    PTPCOLL_VERBOSE(1, ("Initialization Reduce - Narray"));
    comm_attribs.bcoll_type = BCOL_REDUCE;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;


    comm_attribs.data_src = DATA_SRC_KNOWN;
    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_ptpcoll_reduce_narray,
                                 bcol_ptpcoll_reduce_narray_progress);

    comm_attribs.data_src = DATA_SRC_KNOWN;

    return OMPI_SUCCESS;
}
@ -1,25 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_REDUCE_H
#define MCA_BCOL_PTPCOLL_REDUCE_H

#include "ompi_config.h"
#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

BEGIN_C_DECLS

int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super);

END_C_DECLS

#endif /* MCA_BCOL_PTPCOLL_REDUCE_H */
@ -1,139 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"

#include "bcol_ptpcoll.h"
#include "bcol_ptpcoll_utils.h"

/*
 * Return the closest power of K that does not exceed the number, and
 * optionally the corresponding power-of-K value itself
 */
int pow_k_calc(int k, int number, int *out_number)
{
    int power = 0;
    int n = 1;

    while (n < number) {
        n *= k;
        ++power;
    }

    if (n > number) {
        n /= k;
        --power;
    }
    if (NULL != out_number) {
        *out_number = n;
    }

    return power;
}
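/*
 * Editorial note, not part of the removed source: a worked example of
 * pow_k_calc().  For k = 2 and number = 20 the loop grows n through
 * 1, 2, 4, 8, 16, 32 (power 5); since 32 > 20 it steps back once, so the
 * call
 *
 *     int n;
 *     int power = pow_k_calc(2, 20, &n);
 *
 * returns power == 4 with n == 16, the largest power of 2 not above 20.
 */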
/*
 * Communicator rank to group index conversion function for K-nomial tree.
 * Complexity: (K-1) Log _base_K N
 *
 * Input:
 *  my_group_index - my process index in the group
 *  comm_source    - the communicator rank of the source of data
 *  radix          - radix of the K-nomial tree
 *  group_size     - the size of my group
 *  group_array[]  - one-to-one map from group index to communicator rank
 *
 * Output:
 *  Group index for comm_source.
 */

int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source,
        int group_size, int *group_array, int *pow_distance)
{
    int group_index;
    int i;
    *pow_distance = 0;

    for (i = 1; i < group_size; i <<= 1, (*pow_distance)++) {
        group_index = my_group_index ^ i;
        if (comm_source == group_array[group_index]) {
            return group_index;
        }
    }

    *pow_distance = -1;
    return -1;
}
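/*
 * Editorial note, not part of the removed source: a worked example of the
 * XOR probing above.  With my_group_index = 5 (binary 101) and
 * group_size = 8, the loop checks peers 5^1 = 4, 5^2 = 7 and 5^4 = 1 at
 * pow_distance 0, 1 and 2 respectively; the first candidate whose
 * communicator rank in group_array equals comm_source is returned.  These
 * are exactly the partners rank 5 exchanges with in a binomial tree.
 */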
int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix,
        int group_size, int *group_array, int *pow_distance)
{
    int group_index;
    int offset = 1;      /* offset equal to 1 (radix_power) */
    int radix_power = 1; /* radix power 0 */
    *pow_distance = 0;

    /*
     * Go through the range of possible offsets from my rank;
     * for each offset we calculate the k-nomial tree root.
     */
    while (offset < group_size) {
        /* K-nomial tree root calculation for the offset */
        if (offset % (radix * radix_power)) {
            group_index = my_group_index - offset;
            /* wrap around if the group index is negative */
            if (group_index < 0) {
                group_index += group_size;
            }
            PTPCOLL_VERBOSE(10, ("Checking %d", group_index));
            if (comm_source == group_array[group_index]) {
                return group_index;
            }
            offset += radix_power;
        } else {
            /* we are done with this section of the tree, go to the next one */
            radix_power *= radix;
            (*pow_distance)++;
        }
    }

    /* No source was found, return -1 */
    *pow_distance = -1;
    return -1;
}

int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array)
{
    int group_index;
    int radix_power = 1; /* radix power 0 */
    int offset = 1;      /* offset equal to 1 (radix_power) */

    /*
     * Go through the range of possible offsets from my rank;
     * for each offset we calculate the k-nomial tree root.
     */
    while (offset < group_size) {
        /* K-nomial tree root calculation for the offset */
        if (offset % (radix * radix_power)) {
            group_index = my_group_index - offset;
            /* wrap around if the group index is negative */
            if (group_index < 0) {
                group_index += group_size;
            }
            if (comm_source == group_array[group_index]) {
                return group_index;
            }
            offset += radix_power;
        } else {
            /* we are done with this section of the tree, go to the next one */
            radix_power *= radix;
        }
    }

    /* No source was found, return -1 */
    return -1;
}
@ -1,80 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_PTPCOLL_UTILS_H
#define MCA_BCOL_PTPCOLL_UTILS_H

#include "ompi_config.h"

#include "ompi/mca/rte/rte.h"

BEGIN_C_DECLS

/*
 * Return the closest power of K for the number
 */
int pow_k_calc(int k, int number, int *out_number);

/*
 * Communicator rank to group index conversion function for K-nomial tree.
 */
int get_group_index_for_k_nomial(int my_group_index, int comm_source, int radix, int group_size, int *group_array);

/* the same as above, but returns the distance information as well */
int get_group_index_and_distance_for_k_nomial(int my_group_index, int comm_source, int radix,
        int group_size, int *group_array, int *pow_distance);

int get_group_index_and_distance_for_binomial(int my_group_index, int comm_source,
        int group_size, int *group_array, int *pow_distance);

/*
 * Error and debug Macros/Functions
 */
static inline int mca_bcol_ptpcoll_err(const char* fmt, ...)
{
    va_list list;
    int ret;

    va_start(list, fmt);
    ret = vfprintf(stderr, fmt, list);
    va_end(list);
    return ret;
}

#define PTPCOLL_ERROR(args)                                  \
do {                                                         \
    mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ",        \
                         ompi_process_info.nodename,         \
                         OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
                         __FILE__, __LINE__, __func__);      \
    mca_bcol_ptpcoll_err args;                               \
    mca_bcol_ptpcoll_err("\n");                              \
} while(0)

#if OPAL_ENABLE_DEBUG
#define PTPCOLL_VERBOSE(level, args)                             \
do {                                                             \
    if (mca_bcol_ptpcoll_component.verbose >= level) {           \
        mca_bcol_ptpcoll_err("[%s]%s[%s:%d:%s] PTPCOLL ",        \
                             ompi_process_info.nodename,         \
                             OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
                             __FILE__, __LINE__, __func__);      \
        mca_bcol_ptpcoll_err args;                               \
        mca_bcol_ptpcoll_err("\n");                              \
    }                                                            \
} while(0)
#else
#define PTPCOLL_VERBOSE(level, args)
#endif

END_C_DECLS

#endif
@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: ORNL
status: unmaintained
@ -1,89 +0,0 @@
#
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
# Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
#                         reserved.
# Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016      Research Organization for Information Science
#                         and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_LFLAGS = -Pcoll_ml_config_yy
LEX_OUTPUT_ROOT = lex.coll_ml_config_yy

dist_ompidata_DATA = \
	mca-coll-ml.config \
	help-mpi-coll-ml.txt

sources = coll_ml.h \
	coll_ml_inlines.h \
	coll_ml_module.c \
	coll_ml_allocation.h \
	coll_ml_allocation.c \
	coll_ml_barrier.c \
	coll_ml_bcast.c \
	coll_ml_colls.h \
	coll_ml_component.c \
	coll_ml_copy_fns.c \
	coll_ml_descriptors.c \
	coll_ml_functions.h \
	coll_ml_hier_algorithms.c \
	coll_ml_hier_algorithms_setup.c \
	coll_ml_hier_algorithms_bcast_setup.c \
	coll_ml_hier_algorithms_allreduce_setup.c \
	coll_ml_hier_algorithms_reduce_setup.c \
	coll_ml_hier_algorithms_common_setup.c \
	coll_ml_hier_algorithms_common_setup.h \
	coll_ml_hier_algorithms_allgather_setup.c \
	coll_ml_hier_algorithm_memsync_setup.c \
	coll_ml_custom_utils.h \
	coll_ml_custom_utils.c \
	coll_ml_progress.c \
	coll_ml_reduce.c \
	coll_ml_allreduce.c \
	coll_ml_allgather.c \
	coll_ml_mca.h \
	coll_ml_mca.c \
	coll_ml_lmngr.h \
	coll_ml_lmngr.c \
	coll_ml_hier_algorithms_barrier_setup.c \
	coll_ml_select.h \
	coll_ml_select.c \
	coll_ml_memsync.c \
	coll_ml_lex.h \
	coll_ml_lex.l \
	coll_ml_config.c \
	coll_ml_config.h

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

component_noinst =
component_install =
if MCA_BUILD_ompi_coll_ml_DSO
component_install += mca_coll_ml.la
else
component_noinst += libmca_coll_ml.la
endif

# See ompi/mca/btl/ml/Makefile.am for an explanation of
# libmca_common_ml.la.

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_ml_la_SOURCES = $(sources)
mca_coll_ml_la_LDFLAGS = -module -avoid-version
mca_coll_ml_la_LIBADD =


noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_ml_la_SOURCES = $(sources)
libmca_coll_ml_la_LDFLAGS = -module -avoid-version

maintainer-clean-local:
	rm -f coll_ml_lex.c
The diff for one file is not shown because it is too large.
@ -1,633 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include <stdlib.h>

#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "coll_ml.h"
#include "coll_ml_select.h"
#include "coll_ml_allocation.h"

static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    bool rcontig = coll_op->full_message.recv_data_continguous;
    int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm);

    void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
                          (uintptr_t)coll_op->full_message.n_bytes_delivered);
    void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                         (size_t)coll_op->variable_fn_params.rbuf_offset);

    if (rcontig) {
        memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled);
    } else {
        mca_coll_ml_convertor_unpack(src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled,
                                     &coll_op->fragment_data.message_descriptor->recv_convertor);
    }

    return OMPI_SUCCESS;
}

static inline void copy_data (mca_coll_ml_collective_operation_progress_t *coll_op, rank_properties_t *rank_props, int soffset) {
    bool rcontig = coll_op->fragment_data.message_descriptor->recv_data_continguous;
    size_t total_bytes = coll_op->fragment_data.message_descriptor->n_bytes_total;
    size_t pack_len = coll_op->fragment_data.fragment_size;
    int doffset = rank_props->rank;
    void *dest, *src;

    src = (void *) ((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                    (size_t)coll_op->variable_fn_params.rbuf_offset + soffset * pack_len);

    if (rcontig) {
        dest = (void *) ((uintptr_t) coll_op->full_message.dest_user_addr +
                         (uintptr_t) coll_op->fragment_data.offset_into_user_buffer +
                         doffset * total_bytes);

        memcpy(dest, src, pack_len);
    } else {
        size_t position;
        opal_convertor_t *recv_convertor =
            &coll_op->fragment_data.message_descriptor->recv_convertor;

        position = (size_t) coll_op->fragment_data.offset_into_user_buffer +
            doffset * total_bytes;

        opal_convertor_set_position(recv_convertor, &position);
        mca_coll_ml_convertor_unpack(src, pack_len, recv_convertor);
    }
}

static int mca_coll_ml_allgather_noncontiguous_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int i, j, n_level_one_sbgps;
    size_t soffset;

    mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;
    sub_group_params_t *array_of_all_subgroup_ranks = topo_info->array_of_all_subgroups;

    n_level_one_sbgps = array_of_all_subgroup_ranks->level_one_index;

    for (i = 0 ; i < n_level_one_sbgps; i++) {
        /* determine where in the source buffer the data can be found */
        soffset = array_of_all_subgroup_ranks[i].index_of_first_element;
        for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks; j++, ++soffset) {
            copy_data (coll_op, array_of_all_subgroup_ranks[i].rank_data + j, soffset);
        }
    }

    return OMPI_SUCCESS;
}

/* Allgather dependencies seem easy: everyone needs to work from the "bottom up".
 * Following Pasha, I too will put in the simplest dependency graph and change it later
 * when we add hierarchy. Basically, allgather has the same dependency profile as the
 * sequential broadcast except that there is only a single ordering of tasks.
 */
static int mca_coll_ml_allgather_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int fn_idx, h_level, my_index, root;
    mca_sbgp_base_module_t *sbgp;
    mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info;

    fn_idx = coll_op->sequential_routine.current_active_bcol_fn;
    h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level;
    sbgp = topo->component_pairs[h_level].subgroup_module;
    my_index = sbgp->my_index;

    /* In the case of allgather, the local leader is always the root */
    root = 0;
    if (my_index == root) {
        coll_op->variable_fn_params.root_flag = true;
        coll_op->variable_fn_params.root_route = NULL;
    } else {
        coll_op->variable_fn_params.root_flag = false;
        coll_op->variable_fn_params.root_route = &topo->route_vector[root];
    }

    return OMPI_SUCCESS;
}

static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret;
    size_t frag_len, dt_size;

    const void *buf;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
    bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous;

    ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);
    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment;
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
            == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }
        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later;
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* The fragment is already on the list and
                 * we still have no ml resources.
                 * Return busy. */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10, ("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                coll_op->fragment_data.message_descriptor->src_user_addr,
                coll_op->fragment_data.message_descriptor->dest_user_addr,
                coll_op->fragment_data.message_descriptor->n_bytes_total,
                coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /*
        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                src_buffer_desc->buffer_index, src_buffer_desc);
        */

        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->src_user_addr;

        if (!scontig) {
            frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER];
            mca_coll_ml_convertor_get_send_frag_size(
                    ml_module, &frag_len,
                    coll_op->fragment_data.message_descriptor);

            mca_coll_ml_convertor_pack(
                    (void *) ((uintptr_t) src_buffer_desc->data_addr +
                        frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                        frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                    frag_len, &coll_op->fragment_data.message_descriptor->send_convertor);
        } else {
            /* calculate the new frag length; there are some issues here */
            frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                    coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                    coll_op->fragment_data.fragment_size ?
                    coll_op->fragment_data.message_descriptor->n_bytes_total -
                    coll_op->fragment_data.message_descriptor->n_bytes_scheduled :
                    coll_op->fragment_data.fragment_size);

            /* everybody copies in, based on the new values */
            memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr +
                        frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                        frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                   (void *) ((uintptr_t) buf + (uintptr_t)
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len);
        }

        new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* update the number of bytes scheduled */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        /* everyone needs an unpack function */
        new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        new_op->fragment_data.fragment_size = frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;

        /* Setup fragment specific data */
        ++(new_op->fragment_data.message_descriptor->n_active);

        ML_VERBOSE(10, ("Start more, My index %d ",
                        new_op->fragment_data.buffer_desc->buffer_index));

        /* this is a bit buggy */
        ML_SET_VARIABLE_PARAMS_BCAST(
                new_op,
                OP_ML_MODULE(new_op),
                frag_len /* yes, we have consistent units, so this makes sense */,
                MPI_BYTE /* we fragment according to buffer size;
                          * we don't reduce the data, thus we needn't
                          * keep "whole" datatypes and may freely
                          * fragment without regard for multiples
                          * of any specific datatype
                          */,
                src_buffer_desc,
                0,
                0,
                frag_len,
                src_buffer_desc->data_addr);
        /* initialize the first coll */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->variable_fn_params.root = 0;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
    }

    return OMPI_SUCCESS;
}
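/*
 * Editorial note, not part of the removed source: the while loop above is
 * the generic ML pipelining pattern.  With pipeline_depth = 4 and a message
 * split into 10 fragments, at most 4 fragment descriptors are in flight at
 * once; each completing fragment re-enters this routine and launches the
 * next one until n_bytes_scheduled reaches n_bytes_total.
 */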
static inline __opal_attribute_always_inline__
int mca_coll_ml_allgather_start (const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void* rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module,
                                 ompi_request_t **req)
{
    size_t pack_len, sdt_size;
    int ret, n_fragments = 1, comm_size;

    mca_coll_ml_topology_t *topo_info;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    mca_coll_ml_collective_operation_progress_t *coll_op;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    ptrdiff_t lb, extent;
    bool scontig, rcontig, in_place = false;

    /* check for the in-place setting */
    if (MPI_IN_PLACE == sbuf) {
        in_place = true;
        sdtype = rdtype;
        scount = rcount;
    }

    /* scontig could be != rcontig */
    scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount);
    rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount);

    comm_size = ompi_comm_size(comm);

    ML_VERBOSE(10, ("Starting allgather"));

    assert(NULL != sdtype);
    /* Calculate the size of the data;
     * at this stage, only contiguous data is supported */

    /* this is valid for allgather */
    ompi_datatype_type_size(sdtype, &sdt_size);
    pack_len = scount * sdt_size;

    if (in_place) {
        sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len;
    }

    /* Allocate the collective schedule and pack the message */
    /* this is the total final message size that will need to fit in the ml-buffer */
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) {
        /* The length of the message cannot be larger than the ML buffer size */
        ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer));
        assert(pack_len * comm_size <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 1 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /* change 2 */
        if (!scontig) {
            coll_op->full_message.n_bytes_scheduled =
                mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                        &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                    (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len *
                        (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                         coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                    pack_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 3 */
            memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len *
                        (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                         coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, pack_len);

            coll_op->full_message.n_bytes_scheduled = pack_len;
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                    &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        if (coll_op->coll_schedule->topo_info->ranks_contiguous) {
            coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data;
        } else {
            coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
        }

        /* the whole ml-buffer is used to send AND receive */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->variable_fn_params.count = scount;
        coll_op->fragment_data.fragment_size =
            coll_op->full_message.n_bytes_scheduled;

        /* For small CINCO, we may use the native datatype */
        coll_op->variable_fn_params.dtype = sdtype;
        coll_op->variable_fn_params.buffer_size = pack_len;
        coll_op->variable_fn_params.root = 0;
    } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) {
        /* calculate the number of fragments and the size of each frag */
        size_t n_dts_per_frag, frag_len;
        int pipeline_depth = mca_coll_ml_component.pipeline_depth;

        /* Calculate the number of fragments required for this message;
         * watch the integer division carefully! */
        frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ?
                pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]);

        n_dts_per_frag = frag_len / sdt_size;
        n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag);
        pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 4 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                sbuf, rbuf, pack_len,
                0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                src_buffer_desc->buffer_index, src_buffer_desc);
        topo_info = coll_op->coll_schedule->topo_info;

        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        if (!scontig) {
            coll_op->full_message.send_converter_bytes_packed =
                mca_coll_ml_convertor_prepare(
                        sdtype, scount, NULL,
                        &coll_op->full_message.dummy_convertor,
                        MCA_COLL_ML_NET_STREAM_SEND);

            coll_op->full_message.dummy_conv_position = 0;
            mca_coll_ml_convertor_get_send_frag_size(
                    ml_module, &frag_len,
                    &coll_op->full_message);

            /* change 5 */
            mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                    &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                    (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
                        (topo_info->hier_layout_info[0].offset +
                         topo_info->hier_layout_info[0].level_one_index)),
                    frag_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 6 */
            memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
                        (topo_info->hier_layout_info[0].offset +
                         topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, frag_len);
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                    &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        /* The idea here is that the whole ml-buffer is used for both
         * sending and receiving.
         */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->fragment_data.buffer_desc = src_buffer_desc;

        coll_op->fragment_data.fragment_size = frag_len;
        coll_op->fragment_data.message_descriptor->n_active = 1;

        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress;

        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;

        /* remember this is different for frags !! It caused data corruption when
         * not properly set. Be sure you have consistent units.
         */
        coll_op->variable_fn_params.count = frag_len;
        coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in
                                                       * units of bytes. This means that
                                                       * all of our arithmetic is done
                                                       * in terms of bytes
                                                       */

        coll_op->variable_fn_params.root = 0;
        coll_op->variable_fn_params.frag_size = frag_len;
        coll_op->variable_fn_params.buffer_size = frag_len;
    } else {
        /* change 7 */
        ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case."));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER],
                sbuf, rbuf, pack_len, 0 /* offset for first pack */);
        topo_info = coll_op->coll_schedule->topo_info;
        if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) {
            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL);
        } else {
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            while (NULL == src_buffer_desc) {
                opal_progress();
                src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            }

            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc);
        }

        /* not sure if I really need this here */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
        coll_op->process_fn = NULL;
        /* probably the most important piece */
        coll_op->variable_fn_params.sbuf = sbuf;
        coll_op->variable_fn_params.rbuf = rbuf;
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;
        coll_op->variable_fn_params.count = scount;
        coll_op->variable_fn_params.dtype = sdtype; /* for zero copy, we want the
                                                     * native datatype and actual count
                                                     */
        coll_op->variable_fn_params.root = 0;

        /* you still need to copy your own data into the rbuf */
        /* this is not needed if the data is already in place */
        if (!in_place) {
            memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len);
        }
    }

    coll_op->full_message.send_count = scount;
    coll_op->full_message.recv_count = rcount;

    coll_op->full_message.send_data_continguous = scontig;
    coll_op->full_message.recv_data_continguous = rcontig;

    ompi_datatype_get_extent(sdtype, &lb, &extent);
    coll_op->full_message.send_extent = (size_t) extent;

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    coll_op->full_message.recv_extent = (size_t) extent;


    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->variable_fn_params.hier_factor = comm_size;

    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);


    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
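/*
 * Editorial note, not part of the removed source: the fragment count in the
 * middle branch above is a ceiling division.  With pack_len = 10000 bytes,
 * sdt_size = 8 and frag_len = 4096, n_dts_per_frag = 4096 / 8 = 512
 * datatypes per fragment, and
 *
 *     n_fragments = (10000 + 8 * 512 - 1) / (8 * 512) = 14095 / 4096 = 3,
 *
 * i.e. two full 4096-byte fragments plus one 1808-byte tail fragment.
 */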
int mca_coll_ml_allgather(const void *sbuf, int scount,
                          struct ompi_datatype_t *sdtype,
                          void* rbuf, int rcount,
                          struct ompi_datatype_t *rdtype,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    ompi_request_t *req;
    int ret;

    ML_VERBOSE(10, ("Starting blocking allgather"));

    ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
                                       rbuf, rcount, rdtype,
                                       comm, module, &req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ret = ompi_request_wait (&req, MPI_STATUS_IGNORE);

    ML_VERBOSE(10, ("Blocking allgather is complete"));

    return ret;
}

int mca_coll_ml_allgather_nb(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void* rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             ompi_request_t **req,
                             mca_coll_base_module_t *module)
{
    int ret;

    ML_VERBOSE(10, ("Starting non-blocking allgather"));

    ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
                                       rbuf, rcount, rdtype,
                                       comm, module, req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ML_VERBOSE(10, ("Non-blocking allgather started"));

    return ret;
}
@ -1,213 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#include "ompi_config.h"
#include <stdlib.h>

#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_allocation.h"

mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block(struct mca_coll_ml_component_t *ml_component,
                                                              mca_bcol_base_memory_block_desc_t *ml_memblock)
{
    mca_bcol_base_memory_block_desc_t *ret = NULL;
    mca_bcol_base_memory_block_desc_t *memory_block = NULL;
    mca_coll_ml_lmngr_t *memory_manager = NULL;

    if (ml_memblock) {
        ML_ERROR(("Memory already allocated - expecting NULL pointer"));
        return ret;
    }
    memory_block = (mca_bcol_base_memory_block_desc_t*) calloc(1, sizeof(mca_bcol_base_memory_block_desc_t));

    if (NULL == memory_block) {
        ML_ERROR(("Couldn't allocate memory for ml_memblock"));
        return ret;
    }

    memory_manager = &ml_component->memory_manager;
    memory_block->block = mca_coll_ml_lmngr_alloc(memory_manager);
    memory_block->size_block = memory_manager->list_block_size;

    if (!memory_block->block) {
        ML_VERBOSE(1, ("lmngr failed."));
        free(memory_block);
        return NULL;
    }

    return memory_block;
}

void mca_coll_ml_free_block (mca_bcol_base_memory_block_desc_t *ml_memblock)
{
    if (!ml_memblock)
        return;

    if (ml_memblock->buffer_descs) {
        free(ml_memblock->buffer_descs);
    }

    mca_coll_ml_lmngr_free(ml_memblock->block);
    free(ml_memblock->bank_release_counters);
    free(ml_memblock->ready_for_memsync);
    free(ml_memblock->bank_is_busy);
    free(ml_memblock);
}

int mca_coll_ml_initialize_block(mca_bcol_base_memory_block_desc_t *ml_memblock,
                                 uint32_t num_buffers,
                                 uint32_t num_banks,
                                 uint32_t buffer_size,
                                 int32_t data_offset,
                                 opal_list_t *bcols_in_use)
{
    int ret = OMPI_SUCCESS;
    uint32_t bank_loop, buff_loop;
    uint64_t addr_offset = 0;
    mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL, *pbuff_desc = NULL;

    if (0 == num_banks || 0 == num_buffers || 0 == buffer_size) {
        return OMPI_ERR_BAD_PARAM;
    }

    if (NULL == ml_memblock) {
        ML_ERROR(("Memory block not initialized"));
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    if (ml_memblock->size_block < (num_buffers * num_banks * buffer_size)) {
        ML_ERROR(("Not enough memory for all buffers and banks in the memory block"));
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    pbuff_descs = (mca_bcol_base_payload_buffer_desc_t*) malloc(sizeof(mca_bcol_base_payload_buffer_desc_t)
                                                                * num_banks * num_buffers);
    if (NULL == pbuff_descs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (bank_loop = 0; bank_loop < num_banks; bank_loop++)
        for (buff_loop = 0; buff_loop < num_buffers; buff_loop++) {
            pbuff_desc = &pbuff_descs[bank_loop*num_buffers + buff_loop];

            pbuff_desc->base_data_addr = (void *)
                ((char *)ml_memblock->block->base_addr + addr_offset);
            pbuff_desc->data_addr = (void *)
                ((char *)pbuff_desc->base_data_addr + (size_t)data_offset);

            addr_offset += buffer_size;
            pbuff_desc->buffer_index = BUFFER_INDEX(bank_loop, num_buffers, buff_loop);

            pbuff_desc->bank_index = bank_loop;
            pbuff_desc->generation_number = 0;
        }

    /* Initialize the ml memory block */
    /* gvm FIX: This counter, when zero, indicates that the bank is ready for
     * recycling. It is initialized to the number of bcol components, as each bcol is
     * responsible for releasing the buffers of a bank. This initialization will show
     * faulty behavior, for example in the case of multiple interfaces, when more than
     * one bcol module of the component type is in use.
     */
    ml_memblock->bank_release_counters = (uint32_t *) calloc(num_banks, sizeof(uint32_t));
    if (NULL == ml_memblock->bank_release_counters) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    ml_memblock->ready_for_memsync = (bool *) calloc(num_banks, sizeof(bool));
    if (NULL == ml_memblock->ready_for_memsync) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    ml_memblock->bank_is_busy = (bool *) calloc(num_banks, sizeof(bool));
    if (NULL == ml_memblock->bank_is_busy) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* Set the index of the first bank to sync */
    ml_memblock->memsync_counter = 0;

    /* use the first bank and the first buffer */
    ml_memblock->next_free_buffer = 0;

    ml_memblock->block_addr_offset = addr_offset;
    ml_memblock->num_buffers_per_bank = num_buffers;
    ml_memblock->num_banks = num_banks;
    ml_memblock->size_buffer = buffer_size;
    ml_memblock->buffer_descs = pbuff_descs;

    return ret;

exit_ERROR:
    /* Free all buffer descriptors */
    if (pbuff_descs) {
        free(pbuff_descs);
    }

    return ret;
}

mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer (mca_coll_ml_module_t *module)
{
    uint64_t bindex;
    uint32_t bank, buffer, num_buffers;
    mca_bcol_base_memory_block_desc_t *ml_memblock = module->payload_block;
    mca_bcol_base_payload_buffer_desc_t *pbuff_descs = NULL,
                                        *ml_membuffer = NULL;

    /* Return a buffer */
    num_buffers = ml_memblock->num_buffers_per_bank;
    pbuff_descs = ml_memblock->buffer_descs;
    bindex = ml_memblock->next_free_buffer;
    buffer = bindex % num_buffers;
    bank = bindex / num_buffers;

    ML_VERBOSE(10, ("ML allocator: allocating buffer index %d, bank index %d", buffer, bank));

    /* First buffer in a bank: make sure the bank is free before using it */
    if (0 == buffer) {
        if (!ml_memblock->bank_is_busy[bank]) {
            /* the bank is free, mark it busy */
            ml_memblock->bank_is_busy[bank] = true;
            ML_VERBOSE(10, ("ML allocator: reset bank %d to value %d", bank,
                            ml_memblock->bank_release_counters[bank]));
        } else {
            /* the bank is busy; return NULL and the upper layer will handle it */
            ML_VERBOSE(10, ("No free payload buffers are available for use."
                            " Next memory bank is still used by one of the bcols"));
            return NULL;
        }
    }

    assert(true == ml_memblock->bank_is_busy[bank]);

    ml_membuffer = &pbuff_descs[bindex];
    ML_VERBOSE(10, ("ML allocator: ml buffer index %d", bindex));

    /* Compute the next free buffer */
    buffer = (buffer == num_buffers - 1) ? 0 : buffer + 1;
    if (0 == buffer) {
        bank = (bank == ml_memblock->num_banks - 1) ? 0 : bank + 1;
    }

    ml_memblock->next_free_buffer = BUFFER_INDEX(bank, num_buffers, buffer);

    return ml_membuffer;
}
|
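For reference, the allocator's flat indexing round-trips cleanly between (bank, buffer) pairs and the single `next_free_buffer` counter. The standalone check below demonstrates this; the `BUFFER_INDEX` body is reconstructed from the `bindex % num_buffers` / `bindex / num_buffers` decomposition used above, so treat it as an assumption rather than the macro's actual definition:

/* Minimal sketch: round-trip of the flat buffer indexing used by the ML
 * allocator. BUFFER_INDEX is reconstructed here from the decomposition
 * seen above; the bank/buffer counts are hypothetical. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BUFFER_INDEX(bank, num_buffers, buffer) \
    ((uint64_t) (bank) * (num_buffers) + (buffer))

int main(void)
{
    uint32_t num_banks = 4, num_buffers = 8;
    for (uint32_t bank = 0; bank < num_banks; bank++) {
        for (uint32_t buffer = 0; buffer < num_buffers; buffer++) {
            uint64_t bindex = BUFFER_INDEX(bank, num_buffers, buffer);
            assert(bindex % num_buffers == buffer); /* recover buffer */
            assert(bindex / num_buffers == bank);   /* recover bank   */
        }
    }
    printf("index round-trip OK\n");
    return 0;
}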
@ -1,111 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_ML_ALLOC_H
#define MCA_ML_ALLOC_H

#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/sys/atomic.h"
#include "opal/mca/mpool/base/base.h"
#include "coll_ml_lmngr.h"

/*
   Returns a block of memory from the mpool.

   ARGS:
   IN ml_component:  component descriptor
   OUT ml_memblock:  block_addr    - starting address of the memory block
                     size          - size of the block
                     register_info - registration information passed from the mpool

   Return
   On Success: Returns the size of the memory block
   On Failure: Returns -1
*/

struct mca_coll_ml_component_t;
struct mca_coll_ml_module_t;

mca_bcol_base_memory_block_desc_t *mca_coll_ml_allocate_block(
    struct mca_coll_ml_component_t *ml_component,
    struct mca_bcol_base_memory_block_desc_t *ml_memblock
);
/* Allocate the memory from the mpool */
/* Register the memory block with the bcols */

void mca_coll_ml_free_block(
    mca_bcol_base_memory_block_desc_t *ml_memblock
);

/*
   Initialize the memory block, map it into buffers and memory banks,
   and initialize the buffer descriptors.

   IN ml_memblock: memory block descriptor
   IN num_buffers: number of buffers
   IN num_banks:   number of banks
   Return
   On Success: OMPI_SUCCESS
   On Failure: OMPI_ERROR
*/
int mca_coll_ml_initialize_block(
    mca_bcol_base_memory_block_desc_t *ml_memblock,
    uint32_t num_buffers,
    uint32_t num_banks,
    uint32_t buffer_size,
    int32_t data_offset,
    opal_list_t *bcols_in_use
);
/* Map blocks into buffers and banks */
/* Initialize the descriptors */

/*
   Allocate a memory buffer from the block.
   IN ml_memblock:   memory block descriptor
   OUT ml_membuffer: buffer allocated for data from the block

   Return
   On Success: OMPI_SUCCESS
   On Failure: OMPI_ERROR
*/
mca_bcol_base_payload_buffer_desc_t *mca_coll_ml_alloc_buffer(
    struct mca_coll_ml_module_t *module);

int mca_coll_ml_free_buffer(
    mca_bcol_base_memory_block_desc_t *ml_memblock,
    struct mca_bcol_base_payload_buffer_desc_t *ml_membuffer
);

/*
   Register the memory block with the bcol component.

   IN ml_memblock: memory block descriptor
   OUT registrations (ml_memblock)

   Return
   On Success: OMPI_SUCCESS
   On Failure: OMPI_ERROR
*/
int mca_coll_ml_register_block_bcol(
    mca_bcol_base_memory_block_desc_t *ml_memblock
);

#endif /* MCA_ML_ALLOC_H */
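Taken together, the declarations above imply a simple lifecycle for the ML payload allocator: carve the mpool block into banks and buffers, register it with the bcols, then allocate and free individual buffers. The sketch below strings the calls together; the function, the concrete sizes, and the surrounding setup are hypothetical stand-ins, not code from the removed files:

/* Hypothetical caller, assuming the OMPI-internal headers above (plus
 * opal/runtime/opal_progress.h) are on the include path. Sizes are
 * illustrative only. */
#include "coll_ml_allocation.h"
#include "opal/runtime/opal_progress.h"

int example_setup_payload_buffers(struct mca_coll_ml_module_t *module,
                                  mca_bcol_base_memory_block_desc_t *block,
                                  opal_list_t *bcols_in_use)
{
    mca_bcol_base_payload_buffer_desc_t *buff;
    int ret;

    /* Carve the mpool block into 2 banks x 8 buffers of 16 KB each,
     * with payload starting at offset 0 inside every buffer. */
    ret = mca_coll_ml_initialize_block(block, 8, 2, 16384, 0, bcols_in_use);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    /* Make the buffers visible to the bcol components. */
    ret = mca_coll_ml_register_block_bcol(block);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    /* Allocation can fail transiently while a bank is being recycled, so
     * spin on the progress engine -- the same pattern the coll/ml code
     * uses throughout this PR. */
    do {
        buff = mca_coll_ml_alloc_buffer(module);
        if (NULL == buff) {
            opal_progress();
        }
    } while (NULL == buff);

    return mca_coll_ml_free_buffer(block, buff);
}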
@ -1,553 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include <stdlib.h>

#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "coll_ml.h"
#include "coll_ml_select.h"
#include "coll_ml_allocation.h"

static int mca_coll_ml_allreduce_small_unpack(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int ret;
    /* need to put in more */
    int count = coll_op->variable_fn_params.count;
    ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype;

    void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
                          (uintptr_t)coll_op->fragment_data.offset_into_user_buffer);
    void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                         (size_t)coll_op->variable_fn_params.rbuf_offset);

    ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest,
                                              (char *) src);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, rbuf addr %p, rbuf offset %d.",
                    src, coll_op->variable_fn_params.sbuf_offset, dest,
                    coll_op->variable_fn_params.rbuf_offset));

    return OMPI_SUCCESS;
}

static int mca_coll_ml_allreduce_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int fn_idx, h_level, my_index, root;
    mca_sbgp_base_module_t *sbgp;
    mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info;

    fn_idx = coll_op->sequential_routine.current_active_bcol_fn;
    h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level;
    sbgp = topo->component_pairs[h_level].subgroup_module;
    my_index = sbgp->my_index;

    /* In the case of allreduce, the local leader is always the root */
    root = 0;
    if (my_index == root) {
        coll_op->variable_fn_params.root_flag = true;
        coll_op->variable_fn_params.root_route = NULL;
    } else {
        coll_op->variable_fn_params.root_flag = false;
        coll_op->variable_fn_params.root_route = &topo->route_vector[root];
    }

    /* NTH: This was copied from the old allreduce launcher. */
    if (0 < fn_idx) {
        coll_op->variable_fn_params.sbuf = coll_op->variable_fn_params.rbuf;
        coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.rbuf;
    }

    return OMPI_SUCCESS;
}

static int mca_coll_ml_allreduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    const void *buf;

    size_t dt_size;
    int ret, frag_len, count;

    ptrdiff_t lb, extent;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);

    ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    dt_size = (size_t) extent;

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment,
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
            == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later,
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            }

            /* It is useless to call progress from here: ML progress can't
             * run, so the pending ML memsync call will not complete and no
             * memory will be recycled. Instead we put the element on the
             * list and progress it later, when memsync recycles some memory. */

            /* The fragment is already on the list and we still have
             * no ml resources; return busy */
            if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));
            } else {
                ML_VERBOSE(10, ("Out of resources %p", coll_op));
            }

            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                     ml_module->coll_ml_allreduce_functions[coll_op->fragment_data.current_coll_op],
                     coll_op->fragment_data.message_descriptor->src_user_addr,
                     coll_op->fragment_data.message_descriptor->dest_user_addr,
                     coll_op->fragment_data.message_descriptor->n_bytes_total,
                     coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup;
        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->src_user_addr;
        /* calculate the number of data types in this packet */
        count = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                 coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                 (size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_ALLREDUCE] ?
                 (coll_op->fragment_data.message_descriptor->n_bytes_total -
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size :
                 (size_t) coll_op->variable_fn_params.count);

        /* calculate the fragment length */
        frag_len = count * dt_size;

        ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count,
                  (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t)
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        if (ret < 0) {
            return OMPI_ERROR;
        }

        /* unpack function */
        new_op->process_fn = mca_coll_ml_allreduce_small_unpack;

        /* Setup fragment specific data */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;
        new_op->fragment_data.fragment_size = frag_len;
        (new_op->fragment_data.message_descriptor->n_active)++;

        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            count,
            MPI_BYTE,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);
        /* Fill in bcast specific arguments */
        /* TBD: remove buffer_size */
        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.count = count;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->variable_fn_params.op = coll_op->variable_fn_params.op;
        new_op->variable_fn_params.dtype = coll_op->variable_fn_params.dtype;
        new_op->variable_fn_params.root = 0;
        new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        ML_VERBOSE(10, ("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d",
                        new_op->variable_fn_params.buffer_size,
                        new_op->fragment_data.fragment_size,
                        new_op->fragment_data.message_descriptor->n_bytes_scheduled));
        /* initialize first coll */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));

    }

    return OMPI_SUCCESS;
}
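The loop above is the coll/ml pipelining idiom that recurs in every fragmented collective in this PR: keep launching fragments until the configured depth of them is in flight, and bail out (parking the operation on waiting_for_memory_list) when the buffer allocator runs dry. Stripped of the OMPI machinery, the control flow reduces to the following self-contained sketch; every identifier in it is an illustrative placeholder, not a coll/ml symbol:

/* Sketch of the bounded-pipeline fill loop used by the fragment progress
 * functions. All names here are hypothetical placeholders. */
#include <stdbool.h>
#include <stddef.h>

typedef struct {
    size_t n_active;          /* fragments currently in flight */
    size_t pipeline_depth;    /* maximum fragments in flight */
    size_t n_bytes_scheduled; /* bytes already handed to fragments */
    size_t n_bytes_total;     /* full message size */
    size_t frag_size;         /* bytes per fragment */
} message_state_t;

static bool buffers_exhausted(void) { return false; /* stub allocator probe */ }

static void launch_fragment(message_state_t *msg)
{
    size_t left = msg->n_bytes_total - msg->n_bytes_scheduled;
    msg->n_bytes_scheduled += left < msg->frag_size ? left : msg->frag_size;
    msg->n_active++;
}

/* Returns 0 on success, -1 when out of buffers with nothing in flight
 * (the analogue of OMPI_ERR_TEMP_OUT_OF_RESOURCE above). */
static int fill_pipeline(message_state_t *msg)
{
    while (msg->n_active < msg->pipeline_depth) {
        /* Whole message already scheduled: nothing left to launch. */
        if (msg->n_bytes_scheduled == msg->n_bytes_total) {
            break;
        }
        if (buffers_exhausted()) {
            /* With fragments in flight, one of them will refill the
             * pipeline later; otherwise the caller must park the request. */
            return (msg->n_active > 0) ? 0 : -1;
        }
        launch_fragment(msg);
    }
    return 0;
}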
static inline __opal_attribute_always_inline__
int parallel_allreduce_start(const void *sbuf, void *rbuf, int count,
                             struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                             struct ompi_communicator_t *comm,
                             mca_coll_ml_module_t *ml_module,
                             ompi_request_t **req,
                             int small_data_allreduce,
                             int large_data_allreduce)
{
    int ret, n_fragments = 1, frag_len,
        pipeline_depth, n_dts_per_frag;

    ptrdiff_t lb, extent;
    size_t pack_len, dt_size;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *coll_op;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    dt_size = (size_t) extent;
    pack_len = count * dt_size;

    ML_VERBOSE(1, ("Allreduce requested %d bytes, fragmentation enabled %d",
                   pack_len,
                   cm->enable_fragmentation));
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) {
        /* The length of the message cannot be larger than the ML buffer size */
        assert(pack_len <= ml_module->payload_block->size_buffer);

        ML_VERBOSE(1, ("Using small data allreduce (threshold = %d)",
                       ml_module->small_message_thresholds[BCOL_ALLREDUCE]));

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (OPAL_UNLIKELY(NULL == src_buffer_desc)) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                      ml_module->coll_ml_allreduce_functions[small_data_allreduce],
                      sbuf, rbuf, pack_len, 0);

        coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.count = count;

        ret = ompi_datatype_copy_content_same_ddt(dtype, count,
                  (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }

        /* unpack function */
        coll_op->process_fn = mca_coll_ml_allreduce_small_unpack;
    } else if (cm->enable_fragmentation || !contiguous) {
        ML_VERBOSE(1, ("Using Fragmented Allreduce"));

        /* fragment the data */
        /* reject datatypes too large to fit into a single ML buffer */
        if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) {
            ML_ERROR(("Sorry, but we don't support datatypes that large"));
            return OMPI_ERROR;
        }

        /* calculate the number of data types that can fit per ml-buffer */
        n_dts_per_frag = ml_module->small_message_thresholds[BCOL_ALLREDUCE] / dt_size;

        /* calculate the number of fragments */
        n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */

        /* calculate the actual pipeline depth */
        pipeline_depth = n_fragments < cm->pipeline_depth ? n_fragments : cm->pipeline_depth;

        /* calculate the fragment size */
        frag_len = n_dts_per_frag * dt_size;

        /* allocate an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                      ml_module->coll_ml_allreduce_functions[small_data_allreduce],
                      sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup;

        coll_op->process_fn = mca_coll_ml_allreduce_small_unpack;

        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        coll_op->fragment_data.message_descriptor->n_active = 1;
        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_allreduce_frag_progress;
        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = small_data_allreduce;
        coll_op->fragment_data.fragment_size = frag_len;

        coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */
        coll_op->variable_fn_params.buffer_size = frag_len;

        /* copy into the ml-buffer */
        ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag,
                  (char *) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }
    } else {
        ML_VERBOSE(1, ("Using zero-copy ptp allreduce"));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                      ml_module->coll_ml_allreduce_functions[large_data_allreduce],
                      sbuf, rbuf, pack_len, 0);

        coll_op->variable_fn_params.userbuf =
            coll_op->variable_fn_params.sbuf = sbuf;

        coll_op->variable_fn_params.rbuf = rbuf;

        /* The ML buffer is used for testing. Later, when we
         * switch to use knem/mmap/portals this should be replaced
         * appropriately
         */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op->variable_fn_params.count = count;
    }

    MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index,
                                          src_buffer_desc);

    /* set the offset */
    coll_op->variable_fn_params.sbuf_offset = 0;
    coll_op->variable_fn_params.rbuf_offset = 0;

    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->sequential_routine.current_active_bcol_fn = 0;
    coll_op->variable_fn_params.dtype = dtype;
    coll_op->variable_fn_params.op = op;
    coll_op->variable_fn_params.root = 0;
    coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; /* invoked after each level in sequential
                                                                                    * progress call
                                                                                    */
    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);

    ret = mca_coll_ml_launch_sequential_collective(coll_op);
    if (ret != OMPI_SUCCESS) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
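To make the sizing arithmetic in the fragmented branch above concrete: with a hypothetical 16 KB BCOL_ALLREDUCE threshold, an 8-byte datatype, a count of 10000 elements, and a component pipeline depth of 4, the computation proceeds as follows (all values invented for illustration):

/* Worked example of the fragment sizing done in parallel_allreduce_start. */
#include <stdio.h>

int main(void)
{
    size_t threshold = 16384;  /* hypothetical small_message_thresholds[BCOL_ALLREDUCE] */
    size_t dt_size = 8;        /* e.g. an MPI_DOUBLE extent */
    int count = 10000;
    int cm_pipeline_depth = 4; /* hypothetical component setting */

    int n_dts_per_frag = threshold / dt_size;                        /* 2048 */
    int n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* 5 (rounded up) */
    int pipeline_depth = n_fragments < cm_pipeline_depth
                             ? n_fragments : cm_pipeline_depth;      /* 4 */
    int frag_len = n_dts_per_frag * (int) dt_size;                   /* 16384 bytes */

    printf("%d dts/frag, %d fragments, depth %d, %d bytes/frag\n",
           n_dts_per_frag, n_fragments, pipeline_depth, frag_len);
    return 0;
}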
int mca_coll_ml_allreduce(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module;
    ompi_request_t *req;
    int ret;

    if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) {
        /* coll/ml does not handle non-commutative operations at this time;
         * fall back on another collective module */
        return ml_module->fallback.coll_allreduce (sbuf, rbuf, count, dtype, op, comm,
                                                   ml_module->fallback.coll_allreduce_module);
    }

    ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm,
                                   (mca_coll_ml_module_t *) module, &req,
                                   ML_SMALL_DATA_ALLREDUCE,
                                   ML_LARGE_DATA_ALLREDUCE);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ML_ERROR(("Failed to launch"));
        return ret;
    }

    ompi_request_wait_completion(req);
    ompi_request_free(&req);

    ML_VERBOSE(10, ("Blocking allreduce is done"));

    return OMPI_SUCCESS;
}

int mca_coll_ml_allreduce_nb(const void *sbuf, void *rbuf, int count,
                             struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                             struct ompi_communicator_t *comm,
                             ompi_request_t **req,
                             mca_coll_base_module_t *module)
{
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module;
    int ret;

    if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) {
        /* coll/ml does not handle non-commutative operations at this time;
         * fall back on another collective module */
        return ml_module->fallback.coll_iallreduce (sbuf, rbuf, count, dtype, op, comm, req,
                                                    ml_module->fallback.coll_iallreduce_module);
    }

    ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm,
                                   (mca_coll_ml_module_t *) module, req,
                                   ML_SMALL_DATA_ALLREDUCE,
                                   ML_LARGE_DATA_ALLREDUCE);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ML_ERROR(("Failed to launch"));
        return ret;
    }

    ML_VERBOSE(10, ("Nonblocking allreduce was launched"));

    return OMPI_SUCCESS;
}

int mca_coll_ml_allreduce_dispatch(const void *sbuf, void *rbuf, int count,
                                   struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                                   struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    int rc;
    bool use_extra_topo;
    ompi_request_t *req;

    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    use_extra_topo = (count > 1) ?
        !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] :
        !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE];

    if (use_extra_topo) {
        rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
                                      op, comm, ml_module, &req,
                                      ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,
                                      ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE);
    } else {
        rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
                                      op, comm, ml_module, &req,
                                      ML_SMALL_DATA_ALLREDUCE,
                                      ML_LARGE_DATA_ALLREDUCE);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        ML_ERROR(("Failed to launch"));
        return rc;
    }

    ompi_request_wait_completion(req);
    ompi_request_free(&req);

    return OMPI_SUCCESS;
}

int mca_coll_ml_allreduce_dispatch_nb(const void *sbuf, void *rbuf, int count,
                                      ompi_datatype_t *dtype, ompi_op_t *op,
                                      ompi_communicator_t *comm,
                                      ompi_request_t **req,
                                      mca_coll_base_module_t *module)
{
    int rc;
    bool use_extra_topo;

    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    use_extra_topo = (count > 1) ?
        !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] :
        !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE];

    if (use_extra_topo) {
        rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
                                      op, comm, ml_module, req,
                                      ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,
                                      ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE);
    } else {
        rc = parallel_allreduce_start(sbuf, rbuf, count, dtype,
                                      op, comm, ml_module, req,
                                      ML_SMALL_DATA_ALLREDUCE,
                                      ML_LARGE_DATA_ALLREDUCE);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        ML_ERROR(("Failed to launch"));
        return rc;
    }

    return OMPI_SUCCESS;
}
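Both dispatch functions above share the same gate: a capability cube indexed by (operation, datatype, element multiplicity) decides whether the regular topology can handle the allreduce or the extra topology must be used. The self-contained model below mirrors that lookup; the dimensions and the table contents are hypothetical, not the real allreduce_matrix:

/* Hypothetical model of the use_extra_topo gate in the dispatch functions. */
#include <stdbool.h>
#include <stdio.h>

enum { N_OPS = 2, N_DTYPES = 2, SINGLE_ELEM = 0, MULTI_ELEM = 1, N_ELEM_KINDS = 2 };

/* true = the regular topology natively supports this combination */
static bool supported[N_OPS][N_DTYPES][N_ELEM_KINDS] = {
    /* op 0 */ { { true, true }, { true, false } },
    /* op 1 */ { { true, true }, { true, true  } },
};

/* Mirrors the ternary expression computing use_extra_topo above. */
static bool use_extra_topo(int op, int dtype, int count)
{
    return (count > 1) ? !supported[op][dtype][MULTI_ELEM]
                       : !supported[op][dtype][SINGLE_ELEM];
}

int main(void)
{
    /* op 0 on dtype 1 handles single elements natively but needs the
     * extra topology for vectors of elements. */
    printf("count=1 -> extra topo: %d\n", use_extra_topo(0, 1, 1)); /* 0 */
    printf("count=4 -> extra topo: %d\n", use_extra_topo(0, 1, 4)); /* 1 */
    return 0;
}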
@ -1,146 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/coll/coll.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"

static void mca_coll_ml_barrier_task_setup(
    mca_coll_ml_task_status_t *task_status,
    int index, mca_coll_ml_compound_functions_t *func)
{
    task_status->rt_num_dependencies = func->num_dependencies;
    task_status->rt_num_dependent_tasks = func->num_dependent_tasks;
    task_status->rt_dependent_task_indices = func->dependent_task_indices;
}

static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module,
                                      ompi_request_t **req)
{
    opal_free_list_item_t *item;
    mca_coll_ml_collective_operation_progress_t *coll_op;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;

    /* allocate an ml buffer for signaling purposes */
    src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);

    while (NULL == src_buffer_desc) {
        opal_progress();
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
    }

    /* Blocking call on fragment allocation (maybe we want to make it non-blocking?) */
    item = opal_free_list_wait(&(ml_module->coll_ml_collective_descriptors));

    coll_op = (mca_coll_ml_collective_operation_progress_t *) item;
    assert(NULL != coll_op);

    ML_VERBOSE(10, ("Get coll request %p", coll_op));

    MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL, ml_module->coll_ml_barrier_function);

    coll_op->fragment_data.buffer_desc = src_buffer_desc;
    coll_op->dag_description.num_tasks_completed = 0;

    coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;

    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);

    /* Pointer to a coll finalize function */
    coll_op->process_fn = NULL;

    (*req) = &coll_op->full_message.super;

    OMPI_REQUEST_INIT((*req), false);

    (*req)->req_status._cancelled = 0;
    (*req)->req_state = OMPI_REQUEST_ACTIVE;
    (*req)->req_status.MPI_ERROR = OMPI_SUCCESS;

    /* Set order info if a bcol needs ordering */
    MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1);

    return mca_coll_ml_generic_collectives_launcher(coll_op, mca_coll_ml_barrier_task_setup);
}

/**
 * Hierarchical blocking barrier
 */
int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int rc;
    ompi_request_t *req;

    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

#if OPAL_ENABLE_DEBUG
    static int barriers_count = 0;
#endif

    ML_VERBOSE(10, ("Barrier num %d start.", ++barriers_count));

    rc = mca_coll_ml_barrier_launch(ml_module, &req);
    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        ML_ERROR(("Failed to launch a barrier."));
        return rc;
    }

    /* Blocking barrier */
    ompi_request_wait_completion(req);
    ompi_request_free(&req);

    ML_VERBOSE(10, ("Barrier num %d was done.", barriers_count));

    return OMPI_SUCCESS;
}

/**
 * Hierarchical non-blocking barrier
 */
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
                               ompi_request_t **req,
                               mca_coll_base_module_t *module)
{
    int rc;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

#if OPAL_ENABLE_DEBUG
    static int barriers_count = 0;
#endif

    ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count));

    rc = mca_coll_ml_barrier_launch(ml_module, req);
    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        ML_ERROR(("Failed to launch a barrier."));
        return rc;
    }

    ML_VERBOSE(10, ("IBarrier num %d was launched.", barriers_count));

    return OMPI_SUCCESS;
}
@ -1,849 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include <unistd.h>
#include <sys/uio.h>

#include "opal/threads/mutex.h"
#include "opal/sys/atomic.h"

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"

#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_colls.h"
#include "coll_ml_allocation.h"

#define ML_BUFFER_ALLOC_WAIT(ml, buffer)          \
    do {                                          \
        buffer = mca_coll_ml_alloc_buffer(ml);    \
        while (NULL == buffer) {                  \
            opal_progress();                      \
            buffer = mca_coll_ml_alloc_buffer(ml);\
        }                                         \
    } while (0)

#define COLL_ML_SETUP_ORDERING_INFO(op, last, prev)                      \
    do {                                                                 \
        /* Don't change order of commands !!!! */                        \
        (op)->prev_frag = prev;                                          \
        (op)->fragment_data.message_descriptor->last_started_frag = last;\
        /* op->next_to_process_frag = NULL; */                           \
    } while (0)

#define ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, op, coll_index, root,      \
                                            total_len, frag_len, buf, ml_buff_desc) \
    do {                                                                           \
        op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,                  \
                 ml_module->coll_ml_bcast_functions[coll_index],                   \
                 buf, buf,                                                         \
                 total_len,                                                        \
                 0 /* offset for first pack */);                                   \
        if (OPAL_LIKELY(frag_len > 0)) {                                           \
            if (ompi_comm_rank(ml_module->comm) == root) {                         \
                /* single frag, pack the data */                                   \
                memcpy((void *)(uintptr_t)(ml_buff_desc)->data_addr,               \
                       buf, frag_len);                                             \
                /* No unpack for root */                                           \
                op->process_fn = NULL;                                             \
            } else {                                                               \
                op->process_fn = mca_coll_ml_bcast_small_unpack_data;              \
            }                                                                      \
        }                                                                          \
        op->full_message.n_bytes_scheduled = frag_len;                             \
    } while (0)
#define SMALL_BCAST 0
#define LARGE_BCAST (SMALL_BCAST + 1)

/* bcast data unpack */
static int mca_coll_ml_bcast_converter_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    struct iovec iov;
    uint32_t iov_count = 1;
    size_t max_data = 0;

    mca_coll_ml_collective_operation_progress_t *next_op;
    mca_coll_ml_module_t *ml_module =
        (mca_coll_ml_module_t *) coll_op->coll_module;

    size_t max_index =
        ml_module->payload_block->num_banks * ml_module->payload_block->num_buffers_per_bank;

    bool is_first = true;
    int ret;

    /* Check whether the fragment was delivered in order */
    if (coll_op->fragment_data.buffer_desc->buffer_index !=
        coll_op->fragment_data.message_descriptor->next_expected_index) {
        mca_coll_ml_collective_operation_progress_t *prev_coll_op = coll_op->prev_frag;
        assert(NULL == prev_coll_op->next_to_process_frag);
        /* make sure that the previous fragment has a pointer to the
           out-of-order fragment */
        prev_coll_op->next_to_process_frag = coll_op;
        assert(!(coll_op->pending & REQ_OUT_OF_ORDER));
        coll_op->pending |= REQ_OUT_OF_ORDER;
        /* we will unpack it later */
        ML_VERBOSE(10, ("Get %d expecting %d previous %d",
                        coll_op->fragment_data.buffer_desc->buffer_index,
                        coll_op->fragment_data.message_descriptor->next_expected_index,
                        prev_coll_op->fragment_data.buffer_desc->buffer_index));
        return ORTE_ERR_NO_MATCH_YET;
    }

    do {
        iov.iov_len = coll_op->fragment_data.fragment_size;
        iov.iov_base = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);

        ML_VERBOSE(10, ("Data unpack with converter, index %d",
                        coll_op->fragment_data.buffer_desc->buffer_index));

        opal_convertor_unpack(&coll_op->fragment_data.message_descriptor->recv_convertor,
                              &iov, &iov_count, &max_data);

        /* update next index */
        ++coll_op->fragment_data.message_descriptor->next_expected_index;
        if (coll_op->fragment_data.message_descriptor->next_expected_index >= max_index) {
            coll_op->fragment_data.message_descriptor->next_expected_index = 0;
        }

        /* Return it to the queue if the packet is done; the exception is
           the first packet, which we release later.
         */
        next_op = coll_op->next_to_process_frag;
        coll_op->next_to_process_frag = NULL;
        if ((!is_first) &&
            (0 != coll_op->fragment_data.offset_into_user_buffer)) {
            assert(coll_op->pending & REQ_OUT_OF_ORDER);
            coll_op->pending ^= REQ_OUT_OF_ORDER;
            /* Pasha: On one hand, I'm not sure that conceptually this is the
             * right place to call buffer recycling. Potentially,
             * coll_ml_fragment_completion_processing() sounds like the right
             * place for out-of-order unpack/sync handling.
             * On the other hand, non-contiguous data is not super common and
             * we would like to minimize the effect on the critical path for
             * non-contiguous data types. */
            ret = mca_coll_ml_buffer_recycling(coll_op);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return OMPI_ERROR;
            }

            CHECK_AND_RECYCLE(coll_op);
        }

        coll_op = next_op;
        is_first = false;
    } while (NULL != coll_op);

    return OMPI_SUCCESS;
}

static int mca_coll_ml_bcast_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    void *dest = (void *)((uintptr_t) coll_op->full_message.dest_user_addr +
                          (uintptr_t) coll_op->full_message.n_bytes_delivered);
    void *src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);

    memcpy(dest, src, coll_op->fragment_data.fragment_size);
    return OMPI_SUCCESS;
}

static int mca_coll_ml_bcast_large_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    void *dest = (void *)((uintptr_t) coll_op->fragment_data.message_descriptor->dest_user_addr +
                          (uintptr_t) coll_op->fragment_data.offset_into_user_buffer);
    void *src = (void *)((uintptr_t) coll_op->fragment_data.buffer_desc->data_addr);

    memcpy(dest, src, coll_op->fragment_data.fragment_size);
    return OMPI_SUCCESS;
}

static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret, frag_len;
    size_t max_data = 0;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_coll_ml_collective_operation_progress_t *new_op = NULL;
    mca_coll_ml_task_setup_fn_t task_setup = NULL;
    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           mca_coll_ml_component.pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment,
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
            == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        if (OPAL_UNLIKELY(NULL == src_buffer_desc)) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later,
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            }

            /* It is useless to call progress from here: ML progress can't
             * run, so the pending ML memsync call will not complete and no
             * memory will be recycled. Instead we put the element on the
             * list and progress it later, when memsync recycles some memory. */

            /* The fragment is already on the list and we still have
             * no ml resources; return busy */
            if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&ml_module->waiting_for_memory_list,
                                 (opal_list_item_t *)coll_op);
            }

            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag
            (ml_module, coll_op);
        /* We need this address for pointer arithmetic in memcpy */
        frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST);
        /* Decide based on global flag, not variable one */
        if (coll_op->fragment_data.message_descriptor->root) {
            struct iovec iov;
            uint32_t iov_count = 1;

            /* OBJ_RETAIN(new_op->variable_fn_params.dtype); */
            iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr;
            iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST];
            assert(0 != iov.iov_len);

            max_data = ml_module->small_message_thresholds[BCOL_BCAST];
            opal_convertor_pack(&new_op->fragment_data.message_descriptor->send_convertor,
                                &iov, &iov_count, &max_data);

            new_op->process_fn = NULL;
            new_op->variable_fn_params.root_flag = true;
            new_op->variable_fn_params.root_route = NULL;

            task_setup = OP_ML_MODULE(new_op)->
                coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]->
                task_setup_fn[COLL_ML_ROOT_TASK_FN];
        } else {
            new_op->process_fn = mca_coll_ml_bcast_converter_unpack_data;
            new_op->variable_fn_params.root_flag = false;
            new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;

            task_setup = OP_ML_MODULE(new_op)->
                coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]->
                task_setup_fn[COLL_ML_GENERAL_TASK_FN];

            max_data = ml_module->small_message_thresholds[BCOL_BCAST];
            mca_coll_ml_convertor_get_send_frag_size(
                ml_module, &max_data,
                new_op->fragment_data.message_descriptor);
        }

        new_op->fragment_data.message_descriptor->n_bytes_scheduled += max_data;
        new_op->fragment_data.fragment_size = max_data;
        new_op->fragment_data.buffer_desc = src_buffer_desc;

        /* Setup fragment specific data */
        ++(new_op->fragment_data.message_descriptor->n_active);

        COLL_ML_SETUP_ORDERING_INFO(new_op, new_op,
                                    new_op->fragment_data.message_descriptor->last_started_frag);
        ML_VERBOSE(10, ("Start more, My index %d my prev %d",
                        new_op->fragment_data.buffer_desc->buffer_index,
                        new_op->prev_frag->fragment_data.buffer_desc->buffer_index));

        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            frag_len,
            MPI_BYTE,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);

        /* TBD: remove buffer_size */
        new_op->variable_fn_params.buffer_size = coll_op->variable_fn_params.buffer_size;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;

        /* Set order info for the new frag if a bcol needs ordering */
        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        /* Launch this collective !! */
        ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup);

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            ML_ERROR(("Failed to launch"));
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
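The converter-based path above leans on the opal_convertor pack/unpack pair: the root drains its send convertor into an iovec pointing at the ML payload buffer, and receivers feed the same iovec shape into their receive convertor. Reduced to its essentials, the exchange looks like the sketch below; it assumes an already-initialized convertor pair and ML buffer (as prepared in parallel_bcast_start() further down), so it is a usage outline, not drop-in code:

/* Sketch of the per-fragment convertor usage in the functions above. */
#include <sys/uio.h>
#include "opal/datatype/opal_convertor.h"

static size_t pack_one_fragment(opal_convertor_t *send_conv,
                                void *ml_buffer, size_t frag_capacity)
{
    struct iovec iov = { .iov_base = ml_buffer, .iov_len = frag_capacity };
    uint32_t iov_count = 1;
    size_t max_data = frag_capacity;

    /* Advances the convertor's internal position; max_data returns the
     * number of bytes actually packed into the ML buffer. */
    opal_convertor_pack(send_conv, &iov, &iov_count, &max_data);
    return max_data;
}

static size_t unpack_one_fragment(opal_convertor_t *recv_conv,
                                  void *ml_buffer, size_t frag_size)
{
    struct iovec iov = { .iov_base = ml_buffer, .iov_len = frag_size };
    uint32_t iov_count = 1;
    size_t max_data = frag_size;

    /* Scatters the packed bytes into the (possibly non-contiguous)
     * user buffer described by the receive convertor. */
    opal_convertor_unpack(recv_conv, &iov, &iov_count, &max_data);
    return max_data;
}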
static int mca_coll_ml_bcast_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret;
    int frag_len, current_coll_op = coll_op->fragment_data.current_coll_op;
    size_t dt_size;
    void *buf;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_coll_ml_collective_operation_progress_t *new_op = NULL;
    mca_coll_ml_task_setup_fn_t task_setup = NULL;

    ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment,
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
            == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later,
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            }

            /* It is useless to call progress from here: ML progress can't
             * run, so the pending ML memsync call will not complete and no
             * memory will be recycled. Instead we put the element on the
             * list and progress it later, when memsync recycles some memory. */

            /* The fragment is already on the list and we still have
             * no ml resources; return busy */
            if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) {
                ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));
                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *) coll_op);
            } else {
                ML_VERBOSE(10, ("Out of resources %p", coll_op));
            }

            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag
            (OP_ML_MODULE(coll_op), coll_op);
        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->dest_user_addr;
        frag_len = ML_GET_FRAG_SIZE(coll_op, BCOL_BCAST);

        /* Decide based on global flag, not variable one */
        if (coll_op->fragment_data.message_descriptor->root) {
            memcpy((void *)(uintptr_t)src_buffer_desc->data_addr,
                   (void *) ((uintptr_t) buf + (uintptr_t) coll_op->
                             fragment_data.message_descriptor->n_bytes_scheduled), frag_len);

            /* No unpack for root */
            new_op->process_fn = NULL;
            new_op->variable_fn_params.root_flag = true;
            new_op->variable_fn_params.root_route = NULL;
            task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]->
                task_setup_fn[COLL_ML_ROOT_TASK_FN];

        } else {
            new_op->process_fn = mca_coll_ml_bcast_large_unpack_data;
            new_op->variable_fn_params.root_flag = false;
            new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;
            task_setup = OP_ML_MODULE(new_op)->coll_ml_bcast_functions[current_coll_op]->
                task_setup_fn[COLL_ML_GENERAL_TASK_FN];
        }

        /* Setup fragment specific data */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;
        new_op->fragment_data.fragment_size = frag_len;
        new_op->fragment_data.message_descriptor->n_active++;

        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            frag_len,
            MPI_BYTE,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);

        /* Fill in bcast specific arguments */
        /* TBD: remove buffer_size */
        new_op->variable_fn_params.buffer_size = coll_op->variable_fn_params.buffer_size;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;

        /* Set order info for the new frag if a bcol needs ordering */
        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        ML_VERBOSE(10, ("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d",
                        new_op->variable_fn_params.buffer_size,
                        new_op->fragment_data.fragment_size,
                        new_op->fragment_data.message_descriptor->n_bytes_scheduled));

        /* Launch this collective !! */
        ret = mca_coll_ml_generic_collectives_append_to_queue(new_op, task_setup);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            ML_VERBOSE(10, ("Failed to launch"));
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
#define BCAST_FRAGMENTATION_IS_ENABLED(module) \
|
||||
(module->bcast_fn_index_table[LARGE_BCAST] < ML_BCAST_LARGE_DATA_KNOWN)
|
||||
|
||||
static inline __opal_attribute_always_inline__
|
||||
int parallel_bcast_start(void *buf, int count, struct ompi_datatype_t *dtype,
|
||||
int root, mca_coll_base_module_t *module, ompi_request_t **req)
|
||||
{
|
||||
size_t pack_len = 0;
|
||||
size_t dt_size = 0;
|
||||
bool contig = false;
|
||||
int bcast_index, n_fragments = 1;
|
||||
|
||||
mca_coll_ml_collective_operation_progress_t * coll_op = NULL;
|
||||
mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
|
||||
mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
|
||||
mca_coll_ml_task_setup_fn_t task_setup;
|
||||
OPAL_PTRDIFF_TYPE lb, extent;
|
||||
|
||||
/* actual starting place of the user buffer (lb added) */
|
||||
void *actual_buf;
|
||||
|
||||
ML_VERBOSE(10, ("Starting bcast, mca_coll_ml_bcast_uknown_root buf: %p", buf));
|
||||
|
||||
ompi_datatype_type_size(dtype, &dt_size);
|
||||
pack_len = count * dt_size;
|
||||
|
||||
/* Setup data buffer */
|
||||
ML_BUFFER_ALLOC_WAIT(ml_module, src_buffer_desc);
|
||||
/* Get information about memory layout */
|
||||
contig = opal_datatype_is_contiguous_memory_layout((opal_datatype_t *)dtype, count);
|
||||
|
||||
ompi_datatype_get_extent (dtype, &lb, &extent);
|
||||
|
||||
actual_buf = (void *) ((uintptr_t) buf + lb);
|
||||
|
||||
/* Allocate collective schedule and pack message */
|
||||
if (contig) {
|
||||
if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) {
|
||||
assert(pack_len <= ml_module->payload_block->size_buffer);
|
||||
bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST];
|
||||
|
||||
ML_VERBOSE(10, ("Contig + small message %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
|
||||
ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len,
|
||||
pack_len, actual_buf, src_buffer_desc);
|
||||
|
||||
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype,
|
||||
src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer,
|
||||
(src_buffer_desc->data_addr));
|
||||
} else if (BCAST_FRAGMENTATION_IS_ENABLED(ml_module)) {
|
||||
/* We moved the fragmentation decision from communication creation time to
|
||||
runtime, since for large messages the if latency is not so critical */
|
||||
size_t n_dts_per_frag;
|
||||
int frag_len, pipeline_depth = mca_coll_ml_component.pipeline_depth;
|
||||
bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST];
|
||||
|
||||
ML_VERBOSE(10, ("Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
|
||||
|
||||
/* Calculate the number of fragments required for this message */
|
||||
frag_len = (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_BCAST] ?
|
||||
pack_len : (size_t) ml_module->small_message_thresholds[BCOL_BCAST]);
|
||||
|
||||
n_dts_per_frag = frag_len/dt_size;
|
||||
n_fragments = (pack_len + dt_size*n_dts_per_frag - 1)/(dt_size*n_dts_per_frag);
|
||||
pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);
|
||||
|
||||
ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len,
|
||||
frag_len, actual_buf, src_buffer_desc);
|
||||
ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, (frag_len/dt_size), dtype,
|
||||
src_buffer_desc, 0, 0, frag_len, (src_buffer_desc->data_addr));
|
||||
|
||||
coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_progress;
|
||||
coll_op->full_message.pipeline_depth = pipeline_depth;
|
||||
/* Initialize fragment specific information */
|
||||
coll_op->fragment_data.current_coll_op = bcast_index;
|
||||
/* coll_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; */
|
||||
coll_op->fragment_data.fragment_size = frag_len;
|
||||
coll_op->fragment_data.message_descriptor->n_active++;
|
||||
/* should be removed */
|
||||
coll_op->variable_fn_params.buffer_size = frag_len;
|
||||
|
||||
ML_VERBOSE(10, ("Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d",
|
||||
coll_op->variable_fn_params.buffer_size,
|
||||
coll_op->fragment_data.fragment_size));
|
||||
} else {
|
||||
bcast_index = ml_module->bcast_fn_index_table[LARGE_BCAST];
|
||||
ML_VERBOSE(10, ("Contig + zero copy %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));
|
||||
|
            coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                                 ml_module->coll_ml_bcast_functions[bcast_index],
                                 actual_buf, actual_buf, pack_len,
                                 0 /* offset for first pack */);
            /* For large messages (bcast) this points to userbuf */
            /* Pasha: temporary workaround for basesmuma, userbuf should
               be removed */
            coll_op->variable_fn_params.userbuf = buf;
            coll_op->process_fn = NULL;
            coll_op->full_message.n_bytes_scheduled = pack_len;

            ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype,
                                         src_buffer_desc, 0, 0,
                                         ml_module->payload_block->size_buffer, buf);
        }
    } else {
        /* Non-contiguous data type */
        bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST];
        ML_VERBOSE(10, ("NON Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]", bcast_index));

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                             ml_module->coll_ml_bcast_functions[bcast_index],
                             actual_buf, actual_buf, pack_len,
                             0 /* offset for first pack */);
        if (OPAL_LIKELY(pack_len > 0)) {
            size_t max_data = 0;

            if (ompi_comm_rank(ml_module->comm) == root) {
                struct iovec iov;
                uint32_t iov_count = 1;

                opal_convertor_copy_and_prepare_for_send(
                        ompi_mpi_local_convertor,
                        &dtype->super, count, buf, 0,
                        &coll_op->full_message.send_convertor);

                opal_convertor_get_packed_size(&coll_op->full_message.send_convertor,
                                               &coll_op->full_message.send_converter_bytes_packed);

                coll_op->full_message.n_bytes_total =
                    coll_op->full_message.send_converter_bytes_packed;

                iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr;
                iov.iov_len = ml_module->small_message_thresholds[BCOL_BCAST];
                max_data = ml_module->small_message_thresholds[BCOL_BCAST];
                opal_convertor_pack(&coll_op->full_message.send_convertor,
                                    &iov, &iov_count, &max_data);
                coll_op->process_fn = NULL;
                coll_op->full_message.n_bytes_scheduled = max_data;

                /* We need to prepare the data for future pipelined communication */
                coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress;
                coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth;
                coll_op->full_message.root = true;

            } else {
                opal_convertor_copy_and_prepare_for_send(
                        ompi_mpi_local_convertor,
                        &dtype->super, count, NULL, 0,
                        &coll_op->full_message.dummy_convertor);

                /* In the non-root case we use it for the number of bytes remaining to receive */
                opal_convertor_get_packed_size(&coll_op->full_message.dummy_convertor,
                                               &coll_op->full_message.send_converter_bytes_packed);

                opal_convertor_copy_and_prepare_for_recv(
                        ompi_mpi_local_convertor,
                        &dtype->super, count, buf, 0,
                        &coll_op->full_message.recv_convertor);

                opal_convertor_get_unpacked_size(&coll_op->full_message.recv_convertor,
                                                 &coll_op->full_message.recv_converter_bytes_packed);

                coll_op->full_message.root = false;
                coll_op->full_message.n_bytes_total =
                    coll_op->full_message.recv_converter_bytes_packed;
                coll_op->process_fn = mca_coll_ml_bcast_converter_unpack_data;

                coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress;
                coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth;

                max_data = ml_module->small_message_thresholds[BCOL_BCAST];
                coll_op->full_message.dummy_conv_position = 0;
                mca_coll_ml_convertor_get_send_frag_size(
                        ml_module, &max_data,
                        &coll_op->full_message);

                coll_op->full_message.n_bytes_scheduled = max_data;
            }
        }
        coll_op->fragment_data.current_coll_op = bcast_index;
        coll_op->fragment_data.message_descriptor->n_active++;
        coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled;

        /* Set initial index */
        coll_op->full_message.next_expected_index = src_buffer_desc->buffer_index;

        /* Prepare linking information for future frags */
        COLL_ML_SETUP_ORDERING_INFO(coll_op, coll_op, NULL);

        /* Since the data is already packed we will use MPI_BYTE and the byte count as the datatype */
        ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, coll_op->full_message.n_bytes_scheduled, MPI_BYTE,
                                     src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer, (src_buffer_desc->data_addr));

        n_fragments = (coll_op->full_message.n_bytes_total +
                       ml_module->small_message_thresholds[BCOL_BCAST] - 1) / ml_module->small_message_thresholds[BCOL_BCAST];
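        /* Editorial note on the expression above (not in the original source):
         * adding (threshold - 1) before the integer division rounds up, e.g.
         * n_bytes_total = 10000 with a 4096-byte BCOL_BCAST threshold gives
         * n_fragments = (10000 + 4095) / 4096 = 3. */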
    }

    coll_op->variable_fn_params.hier_factor = 1;
    coll_op->fragment_data.buffer_desc = src_buffer_desc;

    /* Set order info if any bcol needs ordering */
    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);

    if (ompi_comm_rank(ml_module->comm) == root) {
        coll_op->full_message.root =
            coll_op->variable_fn_params.root_flag = true;
        coll_op->variable_fn_params.root_route = NULL;
        task_setup = ml_module->coll_ml_bcast_functions[bcast_index]->
            task_setup_fn[COLL_ML_ROOT_TASK_FN];
    } else {
        coll_op->full_message.root =
            coll_op->variable_fn_params.root_flag = false;

        coll_op->variable_fn_params.root_route =
            (NULL == coll_op->coll_schedule->topo_info->route_vector ?
             NULL : &coll_op->coll_schedule->topo_info->route_vector[root]);

        task_setup = ml_module->coll_ml_bcast_functions[bcast_index]->
            task_setup_fn[COLL_ML_GENERAL_TASK_FN];
    }

    *req = &coll_op->full_message.super;
    return mca_coll_ml_generic_collectives_launcher(coll_op, task_setup);
}

int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype,
                               int root, struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int ret;
    ompi_request_t *req;

    ret = parallel_bcast_start(buf, count, dtype, root, module, &req);
    if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    /* Blocking bcast */
    ompi_request_wait_completion(req);
    ompi_request_free(&req);

    ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known"));

    return OMPI_SUCCESS;
}

int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype,
                                  int root, struct ompi_communicator_t *comm,
                                  ompi_request_t **req,
                                  mca_coll_base_module_t *module)
{
    int ret;

    ret = parallel_bcast_start(buf, count, dtype, root, module, req);
    if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    ML_VERBOSE(10, ("Bcast is done mca_coll_ml_bcast_known"));

    return OMPI_SUCCESS;
}
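As an editorial aside (not part of the original file): the blocking wrapper above waits on the request that parallel_bcast_start produces, while the _nb variant hands the request back to the caller. Seen from application code, the two entry points back the usual blocking/nonblocking pair when this component is selected for a communicator; a minimal sketch:

/* Sketch only - standard MPI usage that would land in the functions above. */
#include <mpi.h>

static void bcast_usage_sketch(void *buf, int count, int root, MPI_Comm comm)
{
    MPI_Request req;

    /* blocking path: mca_coll_ml_parallel_bcast */
    MPI_Bcast(buf, count, MPI_BYTE, root, comm);

    /* nonblocking path: mca_coll_ml_parallel_bcast_nb */
    MPI_Ibcast(buf, count, MPI_BYTE, root, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
}
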
int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype,
                                      int root, struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    /* local variables */
    int ret, fn_idx;
    size_t pack_len = 0;
    size_t dt_size = 0;

    mca_coll_ml_collective_operation_progress_t *coll_op = NULL;
    mca_coll_ml_compound_functions_t *fixed_schedule;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_bcol_base_coll_fn_desc_t *func;
    OPAL_PTRDIFF_TYPE lb, extent;

    /* actual starting place of the user buffer (lb added) */
    void *actual_buf;

    ML_VERBOSE(10, ("Starting static bcast, small messages"));

    assert(NULL != dtype);
    /* Calculate the size of the data;
     * at this stage only contiguous data is supported */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;
    ompi_datatype_get_extent(dtype, &lb, &extent);

    actual_buf = (void *) ((uintptr_t) buf + lb);

    /* Setup data buffer */
    src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
    while (NULL == src_buffer_desc) {
        opal_progress();
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
    }

    /* Allocate collective schedule and pack message */
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) {
        /* The length of the message cannot be larger than the ML buffer size */
        assert(pack_len <= ml_module->payload_block->size_buffer);

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                             ml_module->coll_ml_bcast_functions[ML_BCAST_SMALL_DATA_SEQUENTIAL],
                             actual_buf, actual_buf, pack_len,
                             0 /* offset for first pack */);
        if (ompi_comm_rank(comm) == root) {
            /* single frag, pack the data */
            memcpy((void *)(uintptr_t)src_buffer_desc->data_addr,
                   buf, pack_len);
            /* No unpack for root */
            coll_op->process_fn = NULL;
        } else {
            coll_op->process_fn = mca_coll_ml_bcast_small_unpack_data;
        }

        coll_op->variable_fn_params.sbuf =
            src_buffer_desc->data_addr;
    } else {
        ML_VERBOSE(10, ("ML_BCAST_LARGE_DATA_KNOWN case."));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                             ml_module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_SEQUENTIAL],
                             actual_buf, actual_buf, pack_len,
                             0 /* offset for first pack */);
        /* For large messages (bcast) this points to userbuf */
        /* Pasha: temporary workaround for basesmuma, userbuf should
           be removed */
        coll_op->variable_fn_params.userbuf =
            coll_op->variable_fn_params.sbuf = actual_buf;

        coll_op->process_fn = NULL;
    }

    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->variable_fn_params.count = count;
    coll_op->variable_fn_params.dtype = dtype;

    coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
    coll_op->variable_fn_params.src_desc = src_buffer_desc;
    coll_op->variable_fn_params.sbuf_offset = 0;
    coll_op->variable_fn_params.rbuf_offset = 0;

    /* pasha - why do we duplicate it? */
    coll_op->fragment_data.buffer_desc = src_buffer_desc;

    /* pack data into payload buffer - NOTE: assume no fragmentation at this stage */
    if (ompi_comm_rank(comm) == root) {
        coll_op->variable_fn_params.root_flag = true;
        coll_op->variable_fn_params.root_route =
            &coll_op->coll_schedule->topo_info->route_vector[root];

        coll_op->full_message.n_bytes_scheduled = pack_len;
    } else {
        coll_op->variable_fn_params.root_flag = false;
        coll_op->variable_fn_params.root_route =
            &coll_op->coll_schedule->topo_info->route_vector[root];
    }

    /* seems like we should fix a schedule here and now */
    fixed_schedule = coll_op->coll_schedule->
        comp_fn_arr[coll_op->variable_fn_params.root_route->level];

    /* now we set this schedule as the compound function list */
    coll_op->coll_schedule->component_functions = fixed_schedule;

    coll_op->sequential_routine.current_active_bcol_fn = 0;

    while (true) {
        /* ready, aim, fire collective(s)!! */
        fn_idx = coll_op->sequential_routine.current_active_bcol_fn;

        func = fixed_schedule[fn_idx].bcol_function;
        ret = func->coll_fn(&coll_op->variable_fn_params,
                            (struct mca_bcol_base_function_t *) &fixed_schedule[fn_idx].constant_group_data);
        /* set the coll_fn_started flag to true */
        if (BCOL_FN_COMPLETE == ret) {
            /* done with this routine, bump the active counter */
            coll_op->sequential_routine.current_active_bcol_fn++;
            coll_op->variable_fn_params.root_flag = true;
            /* check for collective completion */
            if (coll_op->sequential_routine.current_active_bcol_fn ==
                    coll_op->coll_schedule->n_fns) {
                /* handle fragment completion */
                ret = coll_ml_fragment_completion_processing(coll_op);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing");
                }

                /* break out of while loop */
                break;
            }
        } else {
            /* put the entire collective operation onto the sequential queue */
            opal_list_append(&mca_coll_ml_component.sequential_collectives,
                             (opal_list_item_t *) coll_op);
            break;
        }
    }

    /* Blocking bcast */
    ompi_request_wait_completion(&coll_op->full_message.super);
    ompi_request_free((ompi_request_t **) &coll_op);

    ML_VERBOSE(10, ("Bcast is done"));

    return OMPI_SUCCESS;
}
@ -1,552 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_COLL_ML_COLLS_H
#define MCA_COLL_ML_COLLS_H

#include "ompi_config.h"
#include "ompi/mca/bcol/bcol.h"

#define COLL_ML_FN_NAME_LEN 256


/* utility information used to coordinate activities, such as resource
 * management, between different functions in the hierarchy
 */
struct mca_coll_ml_utility_data_t {

    /* RLG - temp fix !!!! - really need to remove this, but right now
       do not want to change the signature of the collective primitives to
       use coll_ml_utility_data_t rather than mca_bcol_base_function_t */
    int dummy;

    /* module */
    struct mca_bcol_base_module_t *bcol_module;

    /* */
    int index_in_consecutive_same_bcol_calls;

    /* number of times functions from this bcol are called in order */
    int n_of_this_type_in_a_row;

    /* number of times functions from this module are called
     * in the collective operation. */
    int n_of_this_type_in_collective;
    int index_of_this_type_in_collective;

};
typedef struct mca_coll_ml_utility_data_t mca_coll_ml_utility_data_t;


/* forward declaration */
struct mca_coll_ml_collective_operation_progress_t;
struct mca_coll_ml_task_status_t;

typedef int (* mca_coll_ml_process_op_fn_t)
    (struct mca_coll_ml_collective_operation_progress_t *coll_op);

typedef int (* mca_coll_ml_task_comp_fn_t)
    (struct mca_coll_ml_task_status_t *coll_op);

typedef int (* mca_coll_ml_fragment_launch_fn_t)
    (struct mca_coll_ml_collective_operation_progress_t *coll_op);

typedef int (* mca_coll_ml_sequential_task_setup_fn_t)
    (struct mca_coll_ml_collective_operation_progress_t *coll_op);

/* This data structure defines the dependencies for a given
 * compound operation. We will use this as a basis for implementing
 * collective operations.
 */
struct mca_coll_ml_compound_functions_t {
    /* label */
    char fn_name[COLL_ML_FN_NAME_LEN];

    /* hierarchy level that is used for this bcol */
    int h_level;

    /* the list of functions that make up this task */
    /* coll_bcol_collective_description_t *bcol_function; */
    mca_bcol_base_coll_fn_desc_t *bcol_function;
    /* task completion function for this compound function */
    mca_coll_ml_task_comp_fn_t task_comp_fn;

    /* module specific information that is a constant on a per group
     * basis
     */
    mca_coll_ml_utility_data_t constant_group_data;

    /* number of dependencies to be satisfied before these functions can be
     * started */
    int num_dependencies;

    /*
     * number of notifications to perform on completion. The assumption
     * is that a counter will be incremented.
     */
    int num_dependent_tasks;

    /*
     * pointers to counters that need to be updated. This assumes
     * an array of tasks is used to describe the ML level
     * collective operation, with these indices referencing elements
     * in this array.
     */
    int *dependent_task_indices;

};

typedef struct mca_coll_ml_compound_functions_t mca_coll_ml_compound_functions_t;

/* Forward declaration for operation_description_t */
struct mca_coll_ml_module_t;

enum {
    COLL_ML_GENERAL_TASK_FN,
    COLL_ML_ROOT_TASK_FN,
    COLL_ML_MAX_TASK_FN
};

enum {
    SEQ_TASK_NOT_STARTED,
    SEQ_TASK_PENDING,
    SEQ_TASK_IN_PROG
};

typedef void (*mca_coll_ml_task_setup_fn_t) (struct mca_coll_ml_task_status_t *task_status, int index, struct mca_coll_ml_compound_functions_t *func);

/*
 * Collective operation definition
 */
struct mca_coll_ml_collective_operation_description_t {

    /*
     * Type of collective operation - there are two types:
     * 1) sequential progress through the collectives is sufficient
     * 2) general treatment, popping tasks onto execution queues, is needed.
     */
    int progress_type;

    struct mca_coll_ml_topology_t *topo_info;

    /*
     * number of functions in collective operation
     */
    int n_fns;

    /*
     * list of functions
     */
    mca_coll_ml_compound_functions_t *component_functions;

    /*
     * array of lists of functions
     */
    mca_coll_ml_compound_functions_t **comp_fn_arr;

    /*
     * indices into the list - fixes a sequential schedule
     */
    int *sch_idx;

    /*
     * Task setup functions; so far we have only 2 - root and non-root
     */
    mca_coll_ml_task_setup_fn_t task_setup_fn[COLL_ML_MAX_TASK_FN];

    /* number of functions called for bcols that need ordering */
    int n_fns_need_ordering;
};
typedef struct mca_coll_ml_collective_operation_description_t
    mca_coll_ml_collective_operation_description_t;

/* Data structure used to track the state of individual bcol
 * functions. This is used to track dependencies and completion
 * to progress the ML level function correctly.
 *
 * mca_coll_ml_task_status_t will be associated with an
 * mca_coll_ml_collective_operation_progress_t structure for
 * the duration of the lifetime of a communicator.
 * An array of task statuses will be stored with
 * the mca_coll_ml_collective_operation_progress_t data structure, so
 * that the task status elements do not need to be moved back to
 * a free list before they are re-used. When the ML level function
 * is complete, all mca_coll_ml_task_status_t are available for
 * re-use.
 */
struct mca_coll_ml_task_status_t {
    /* need to move this between lists to progress this correctly */
    opal_list_item_t item;

    /* number of dependencies satisfied */
    int n_dep_satisfied;

    /* ***************************************************************
     * Pasha:
     * I'm adding to the status: num_dependencies, num_dependent_tasks and
     * dependent_task_indices. The information originally resided in
     * mca_coll_ml_compound_functions_t. For collective operations of a
     * static nature this is not a problem, but for the Bcast operation
     * run-time parameters, like the root, actually define the
     * dependencies. The rt prefix means run-time.
     */

    /* number of dependencies to be satisfied before these functions can be
     * started */
    int rt_num_dependencies;

    /*
     * number of notifications to perform on completion. The assumption
     * is that a counter will be incremented.
     */
    int rt_num_dependent_tasks;

    /*
     * pointers to counters that need to be updated. This assumes
     * an array of tasks is used to describe the ML level
     * collective operation, with these indices referencing elements
     * in this array.
     */
    int *rt_dependent_task_indices;
    /*
     *
     * ***************************************************************/

    /* index in collective schedule */
    int my_index_in_coll_schedule;

    /* function pointers */
    mca_bcol_base_coll_fn_desc_t *bcol_fn;

    /* association with a specific collective task - the ML
     * mca_coll_ml_collective_operation_progress_t stores the
     * specific function parameters */
    struct mca_coll_ml_collective_operation_progress_t *ml_coll_operation;

    mca_coll_ml_task_comp_fn_t task_comp_fn;
};
typedef struct mca_coll_ml_task_status_t mca_coll_ml_task_status_t;
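
Editorial illustration (not part of the original header): the rt_ fields above are the run-time analogue of the static dependency fields in mca_coll_ml_compound_functions_t. On task completion they are consumed roughly as sketched below; the name task_array stands for the per-operation status array described in the comment and is hypothetical.

static inline void ml_notify_dependents_sketch(mca_coll_ml_task_status_t *task_array,
                                               const mca_coll_ml_task_status_t *done)
{
    /* tell each dependent task that one more of its dependencies finished */
    for (int i = 0; i < done->rt_num_dependent_tasks; i++) {
        task_array[done->rt_dependent_task_indices[i]].n_dep_satisfied++;
    }
    /* a task becomes runnable once n_dep_satisfied reaches
     * rt_num_dependencies - the exact test the component progress
     * loop performs before invoking the task's coll_fn */
}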

typedef enum mca_coll_ml_pending_type_t {
    REQ_OUT_OF_ORDER = 1,
    REQ_OUT_OF_MEMORY = 1 << 1
} mca_coll_ml_pending_type_t;
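
Editorial illustration (not part of the original header): the pending field is treated as a bit mask over these values, which is why CHECK_AND_RECYCLE further below only recycles a descriptor when the whole mask is zero.

static inline int ml_pending_flags_sketch(int pending)
{
    pending |= REQ_OUT_OF_MEMORY;        /* record one reason to defer recycling */
    if (pending & REQ_OUT_OF_ORDER) {
        /* an out-of-order fragment still has to be processed */
    }
    pending &= ~REQ_OUT_OF_MEMORY;       /* clear the reason once resolved */
    return pending;
}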

/* Forward declaration */
struct mca_bcol_base_payload_buffer_desc_t;
/* Data structure used to track ML level collective operation
 * progress.
 */
struct mca_coll_ml_collective_operation_progress_t {
    /* need this to put on a list properly */
    /* Full message information */
    struct full_message_t {
        /* make this a list item */
        ompi_request_t super;
        /* Next expected fragment.
         * It is used for controlling the order of convertor unpack operations */
        size_t next_expected_index;
        /* Pointer to the last initialized fragment.
         * It is used for controlling the order of convertor unpack operations */
        struct mca_coll_ml_collective_operation_progress_t *last_started_frag;
        /* destination data address in user memory */
        void *dest_user_addr;
        /* source data address in user memory */
        const void *src_user_addr;
        /* total message size */
        size_t n_bytes_total;
        /* per-process total message size - relevant for operations
         * such as gather and scatter, where each rank has its
         * own unique data
         */
        size_t n_bytes_per_proc_total;
        size_t max_n_bytes_per_proc_total;
        /* data processed - from a local perspective */
        size_t n_bytes_delivered;
        /* current offset - where to continue with the next fragment */
        size_t n_bytes_scheduled;
        /* number of fragments needed to process this message */
        size_t n_fragments;
        /* number of active frags */
        int n_active;
        /* actual pipeline depth */
        int pipeline_depth;
        /* am I the real root of the collective? */
        bool root;
        /* collective fragment launcher */
        mca_coll_ml_fragment_launch_fn_t fragment_launcher;
        /* is the data contiguous? */
        bool send_data_continguous;
        bool recv_data_continguous;
        /* data type count */
        int64_t send_count;
        int64_t recv_count;
        /* extent of the data types */
        size_t send_extent;
        size_t recv_extent;
        /* send data type */
        struct ompi_datatype_t *send_data_type;
        /* needed for non-contiguous buffers */
        size_t offset_into_send_buffer;
        /* receive data type */
        struct ompi_datatype_t *recv_data_type;
        /* needed for non-contiguous buffers */
        size_t offset_into_recv_buffer;
        /* Convertors for non-contiguous data */
        opal_convertor_t send_convertor;
        opal_convertor_t recv_convertor;
        /* Will be used by the receiver for the byte count calculation of the next frag */
        opal_convertor_t dummy_convertor;
        size_t dummy_conv_position;
        /* Size of packed data */
        size_t send_converter_bytes_packed;
        size_t recv_converter_bytes_packed;
        /* In case ordering is needed: order number for the next frag */
        int next_frag_num;
        /* The variable is used by non-blocking memory synchronization code
         * for caching the bank index */
        int bank_index_to_recycle;
        /* need a handle for collective progress, e.g. alltoall */
        bcol_fragment_descriptor_t frag_info;
    } full_message;

    /* collective operation being progressed */
    mca_coll_ml_collective_operation_description_t *coll_schedule;
    /* */
    mca_coll_ml_process_op_fn_t process_fn;

    mca_coll_base_module_t *coll_module;

    /* If not null, we have to release the next fragment */
    struct mca_coll_ml_collective_operation_progress_t *next_to_process_frag;
    /* pointer to previous fragment */
    struct mca_coll_ml_collective_operation_progress_t *prev_frag;
    /* This flag marks that the fragment is pending, waiting
     * to be processed prior to recycling
     */
    enum mca_coll_ml_pending_type_t pending;

    /* Fragment data */
    struct fragment_data_t {
        /* current buffer pointer - offset (in bytes) into the user data */
        size_t offset_into_user_buffer;
        size_t offset_into_user_buffer_per_proc;

        /* amount of data (in bytes) in this fragment - amount of data
         * actually processed */
        size_t fragment_size;
        size_t per_rank_fragment_size;
        size_t data_type_count_per_frag;

        /* pointer to full message progress data */
        struct full_message_t *message_descriptor;

        /* ML buffer descriptor attached to this buffer */
        struct mca_bcol_base_payload_buffer_desc_t *buffer_desc;
        /* handle for collective progress, e.g. alltoall */
        bcol_fragment_descriptor_t bcol_fragment_desc;

        /* Which collective algorithm */
        int current_coll_op;
    } fragment_data;

    /* specific function parameters */
    /* the assumption is that the variable parameters passed into
     * the ML level function will persist until the collective operation
     * is complete. For a blocking function this is until the collective
     * function is exited, and for nonblocking collective functions this
     * is until test or wait completes the collective.
     */
    int global_root;
    bcol_function_args_t variable_fn_params;

    struct {
        /* current active function - for sequential algorithms */
        int current_active_bcol_fn;

        /* current function status - not started, or in progress.
         * When the routine has completed, the active bcol index is
         * incremented, so no need to keep track of a completed
         * status.
         */
        int current_bcol_status;

        /* use this callback to set up algorithm-specific info
           after each level, as necessary
         */
        mca_coll_ml_sequential_task_setup_fn_t seq_task_setup;

    } sequential_routine;

    struct {
        /*
         * BCOL function status - individual elements will be posted to
         * ml level component queues, as appropriate.
         */
        mca_coll_ml_task_status_t *status_array;

        /* number of completed tasks - need this for collective completion.
         * Resource completion is tracked by each BCOL module.
         */
        int num_tasks_completed;
    } dag_description;
};
typedef struct mca_coll_ml_collective_operation_progress_t
    mca_coll_ml_collective_operation_progress_t;
OBJ_CLASS_DECLARATION(mca_coll_ml_collective_operation_progress_t);

#define OP_ML_MODULE(op) ((mca_coll_ml_module_t *)((op)->coll_module))
#define GET_COMM(op) ((OP_ML_MODULE(op))->comm)
#define IS_COLL_SYNCMEM(op) (ML_MEMSYNC == op->fragment_data.current_coll_op)

#define CHECK_AND_RECYCLE(op)                                                   \
    do {                                                                        \
        if (0 == (op)->pending) {                                               \
            /* Caching 2 values that we can't touch on op after returning it */ \
            /* back to the free list (the free list may release memory on destruct) */ \
            struct ompi_communicator_t *comm = GET_COMM(op);                    \
            bool is_coll_sync = IS_COLL_SYNCMEM(op);                            \
            ML_VERBOSE(10, ("Releasing %p", op));                               \
            OMPI_REQUEST_FINI(&(op)->full_message.super);                       \
            opal_free_list_return (&(((mca_coll_ml_module_t *)(op)->coll_module)-> \
                           coll_ml_collective_descriptors),                     \
                           (opal_free_list_item_t *)op);                        \
            /* Special check for memory synchronization completion */          \
            /* We have to return it to the free list first, since the communicator */ \
            /* release potentially may trigger ML module destruction, and having */ \
            /* the element off the list could cause a memory leak. */           \
            if (OPAL_UNLIKELY(is_coll_sync)) {                                  \
                if (OMPI_COMM_IS_INTRINSIC(comm)) {                             \
                    opal_show_help("help-mpi-coll-ml.txt",                      \
                                   "coll-ml-check-fatal-error", true,           \
                                   comm->c_name);                               \
                    ompi_mpi_abort(comm, 6);                                    \
                } else {                                                        \
                    opal_show_help("help-mpi-coll-ml.txt",                      \
                                   "coll-ml-check-error", true,                 \
                                   comm->c_name);                               \
                    /* After this point it is UNSAFE to touch the ml module */  \
                    /* or the communicator */                                   \
                    OBJ_RELEASE(comm);                                          \
                }                                                               \
            }                                                                   \
        }                                                                       \
    } while (0)

#define MCA_COLL_ML_SET_ORDER_INFO(coll_progress, num_frags)                          \
    do {                                                                              \
        mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info;     \
        bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
        if (topo->topo_ordering_info.num_bcols_need_ordering > 0) {                   \
            variable_params->order_info.bcols_started = 0;                            \
            variable_params->order_info.order_num =                                   \
                topo->topo_ordering_info.next_order_num;                              \
            variable_params->order_info.n_fns_need_ordering =                         \
                (coll_progress)->coll_schedule->n_fns_need_ordering;                  \
            topo->topo_ordering_info.next_order_num += num_frags;                     \
            (coll_progress)->fragment_data.message_descriptor->next_frag_num =        \
                variable_params->order_info.order_num + 1;                            \
        }                                                                             \
    } while (0)

#define MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(coll_progress)                            \
    do {                                                                              \
        mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info;     \
        if (topo->topo_ordering_info.num_bcols_need_ordering > 0) {                   \
            bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
            struct fragment_data_t *frag_data = &(coll_progress)->fragment_data;      \
            variable_params->order_info.bcols_started = 0;                            \
            variable_params->order_info.order_num = frag_data->message_descriptor->next_frag_num; \
            variable_params->order_info.n_fns_need_ordering =                         \
                (coll_progress)->coll_schedule->n_fns_need_ordering;                  \
            frag_data->message_descriptor->next_frag_num++;                           \
        }                                                                             \
    } while (0)

#define MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule)                                 \
    do {                                                                              \
        int i;                                                                        \
        (schedule)->n_fns_need_ordering = 0;                                          \
        for (i = 0; i < (schedule)->n_fns; ++i) {                                     \
            mca_bcol_base_module_t *current_bcol =                                    \
                (schedule)->component_functions[i].constant_group_data.bcol_module;   \
            assert (NULL != current_bcol);                                            \
            if (current_bcol->bcol_component->need_ordering) {                        \
                (schedule)->n_fns_need_ordering++;                                    \
            }                                                                         \
        }                                                                             \
    } while (0)

enum {
    MCA_COLL_ML_NET_STREAM_SEND,
    MCA_COLL_ML_NET_STREAM_RECV
};

static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_prepare(ompi_datatype_t *dtype, int count, const void *buff,
                                  opal_convertor_t *convertor, int stream)
{
    size_t bytes_packed;

    if (MCA_COLL_ML_NET_STREAM_SEND == stream) {
        opal_convertor_copy_and_prepare_for_send(
                ompi_mpi_local_convertor,
                &dtype->super, count, buff, 0,
                convertor);
    } else {
        opal_convertor_copy_and_prepare_for_recv(
                ompi_mpi_local_convertor,
                &dtype->super, count, buff, 0,
                convertor);
    }

    opal_convertor_get_packed_size(convertor, &bytes_packed);

    return bytes_packed;
}

static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_pack(void *data_addr, size_t buff_size,
                               opal_convertor_t *convertor)
{
    struct iovec iov;

    size_t max_data = 0;
    uint32_t iov_count = 1;

    iov.iov_base = (IOVBASE_TYPE*) data_addr;
    iov.iov_len = buff_size;

    opal_convertor_pack(convertor, &iov, &iov_count, &max_data);

    return max_data;
}

static inline __opal_attribute_always_inline__
int mca_coll_ml_convertor_unpack(void *data_addr, size_t buff_size,
                                 opal_convertor_t *convertor)
{
    struct iovec iov;

    size_t max_data = 0;
    uint32_t iov_count = 1;

    iov.iov_base = (void *) (uintptr_t) data_addr;
    iov.iov_len = buff_size;

    opal_convertor_unpack(convertor, &iov, &iov_count, &max_data);

    return max_data;
}
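
Editorial illustration (not part of the original header): a round trip through the three helpers above. The datatype, buffers, and sizes are placeholders, and error handling is omitted; in the real code the convertors live inside full_message_t rather than on the stack.

static inline void ml_convertor_roundtrip_sketch(ompi_datatype_t *dtype, int count,
                                                 void *user_buf, void *scratch,
                                                 size_t scratch_size)
{
    opal_convertor_t send_conv, recv_conv;

    OBJ_CONSTRUCT(&send_conv, opal_convertor_t);
    OBJ_CONSTRUCT(&recv_conv, opal_convertor_t);

    /* sender: describe user_buf, then pack it into the scratch payload */
    mca_coll_ml_convertor_prepare(dtype, count, user_buf, &send_conv,
                                  MCA_COLL_ML_NET_STREAM_SEND);
    mca_coll_ml_convertor_pack(scratch, scratch_size, &send_conv);

    /* receiver: describe user_buf again, then unpack the payload into it */
    mca_coll_ml_convertor_prepare(dtype, count, user_buf, &recv_conv,
                                  MCA_COLL_ML_NET_STREAM_RECV);
    mca_coll_ml_convertor_unpack(scratch, scratch_size, &recv_conv);

    OBJ_DESTRUCT(&send_conv);
    OBJ_DESTRUCT(&recv_conv);
}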
#endif /* MCA_COLL_ML_COLLS_H */
@ -1,449 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Most of the description of the data layout is in the
 * coll_sm_module.c file.
 */

#include "ompi_config.h"

#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>

#include "ompi/constants.h"
#include "ompi/mca/coll/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/sbgp/base/base.h"

#include "coll_ml.h"
#include "coll_ml_inlines.h"

#include "ompi/patterns/net/netpatterns.h"
#include "coll_ml_mca.h"
#include "coll_ml_custom_utils.h"


/*
 * Public string showing the coll ompi_ml V2 component version number
 */
const char *mca_coll_ml_component_version_string =
    "Open MPI ml-V2 collective MCA component version " OMPI_VERSION;

/*
 * Local functions
 */

static int ml_open(void);
static int ml_close(void);
static int coll_ml_progress(void);

/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */

mca_coll_ml_component_t mca_coll_ml_component = {

    /* First, fill in the super */

    .super = {
        /* First, the mca_component_t struct containing meta
           information about the component itself */

        .collm_version = {
            MCA_COLL_BASE_VERSION_2_0_0,

            /* Component name and version */

            .mca_component_name = "ml",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component open, close, and register functions */

            .mca_open_component = ml_open,
            .mca_close_component = ml_close,
            .mca_register_component_params = mca_coll_ml_register_params
        },
        .collm_data = {
            /* The component is not checkpoint ready */
            MCA_BASE_METADATA_PARAM_NONE
        },

        /* Initialization / querying functions */
        .collm_init_query = mca_coll_ml_init_query,
        .collm_comm_query = mca_coll_ml_comm_query,
    },
};

void mca_coll_ml_abort_ml(char *message)
{
    ML_ERROR(("ML Collective FATAL ERROR: %s", message));
    /* shut down MPI */
    ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_INTERN);
}
/*
 * progress function
 */

#define INDEX(task) ((task)->my_index_in_coll_schedule)
#define ACTIVE_L (&mca_coll_ml_component.active_tasks)
#define PENDING_L (&mca_coll_ml_component.pending_tasks)
#define SEQ_L (&mca_coll_ml_component.sequential_collectives)

static int coll_ml_progress(void)
{
    int rc = OMPI_SUCCESS;
    int fn_idx;

    mca_coll_ml_task_status_t *task_status, *task_status_tmp;
    mca_coll_ml_collective_operation_progress_t *seq_coll_op;
    mca_coll_ml_collective_operation_progress_t *seq_coll_op_tmp;

    mca_bcol_base_module_collective_fn_primitives_t progress_fn,
        coll_fn;
    mca_coll_ml_utility_data_t *const_args;
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    /* Pasha: Not sure that this is the correct way to resolve the problem.
       Iprobe calls the progress engine. The progress engine calls our
       progress, and as a result the first element on the list is progressed
       again, so we call Iprobe again... and we end up with a HUGE stack.

       One way to prevent it - remove the item from the list, and once you
       finish processing it - put it back.

       The other way - put a flag on the component: if progress is already
       running - exit immediately.
     */
    if (cm->progress_is_busy) {
        /* We are already working... */
        return OMPI_SUCCESS;
    } else {
        cm->progress_is_busy = true;
    }

    /* progress sequential collective operations */
    /* RLG - need to do better here for parallel progress */
    OPAL_THREAD_LOCK(&(cm->sequential_collectives_mutex));
    OPAL_LIST_FOREACH_SAFE(seq_coll_op, seq_coll_op_tmp, SEQ_L, mca_coll_ml_collective_operation_progress_t) {
        do {
            fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn;
            /* initialize the task */

            if (SEQ_TASK_IN_PROG == seq_coll_op->sequential_routine.current_bcol_status) {
                progress_fn = seq_coll_op->coll_schedule->
                    component_functions[fn_idx].bcol_function->progress_fn;
            } else {
                /* PPP Pasha - apparently task setup should be called only here. see line 190 */
                progress_fn = seq_coll_op->coll_schedule->
                    component_functions[fn_idx].bcol_function->coll_fn;
            }

            const_args = &seq_coll_op->coll_schedule->component_functions[fn_idx].constant_group_data;
            /* RLG - note: need to move to using coll_ml_utility_data_t as the
             * collective argument, rather than mca_bcol_base_function_t
             */
            rc = progress_fn(&(seq_coll_op->variable_fn_params), (mca_bcol_base_function_t *)const_args);
            if (BCOL_FN_COMPLETE == rc) {
                /* done with this routine */
                seq_coll_op->sequential_routine.current_active_bcol_fn++;
                /* this is totally hardwired for bcast, need a general call-back */

                fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn;
                if (fn_idx == seq_coll_op->coll_schedule->n_fns) {
                    /* done with this collective - recycle descriptor */

                    /* remove from the progress list */
                    (void) opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op);

                    /* handle fragment completion */
                    rc = coll_ml_fragment_completion_processing(seq_coll_op);

                    if (OMPI_SUCCESS != rc) {
                        mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing");
                    }
                } else {
                    rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op);
                    if (OMPI_SUCCESS != rc) {
                        mca_coll_ml_abort_ml("Failed to run sequential task setup");
                    }

                    seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
                    continue;
                }
            } else if (BCOL_FN_NOT_STARTED == rc) {
                seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
            } else if (BCOL_FN_STARTED == rc) {
                seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG;
            }

            break;
        } while (true);
    }
    OPAL_THREAD_UNLOCK(&(cm->sequential_collectives_mutex));

    /* general DAGs */
    /* see if active tasks can be progressed */
    OPAL_THREAD_LOCK(&(cm->active_tasks_mutex));
    OPAL_LIST_FOREACH(task_status, ACTIVE_L, mca_coll_ml_task_status_t) {
        /* progress task */
        progress_fn = task_status->bcol_fn->progress_fn;
        const_args = &task_status->ml_coll_operation->coll_schedule->
            component_functions[INDEX(task_status)].constant_group_data;
        rc = progress_fn(&(task_status->ml_coll_operation->variable_fn_params),
                         (mca_bcol_base_function_t *)const_args);
        if (BCOL_FN_COMPLETE == rc) {
            ML_VERBOSE(3, ("GOT BCOL_COMPLETED!!!!"));
            rc = mca_coll_ml_task_completion_processing(&task_status, ACTIVE_L);
            if (OMPI_SUCCESS != rc) {
                mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
            }
        } else if (BCOL_FN_STARTED == rc) {
            /* nothing to do */
        } else {
            mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
        }
    }
    OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex));

    /* see if new tasks can be initiated */
    OPAL_THREAD_LOCK(&(cm->pending_tasks_mutex));
    OPAL_LIST_FOREACH_SAFE(task_status, task_status_tmp, PENDING_L, mca_coll_ml_task_status_t) {
        /* check to see if dependencies are satisfied */
        int n_dependencies = task_status->rt_num_dependencies;
        int n_dependencies_satisfied = task_status->n_dep_satisfied;

        if (n_dependencies == n_dependencies_satisfied) {
            /* initiate the task */
            coll_fn = task_status->bcol_fn->coll_fn;
            const_args = &task_status->ml_coll_operation->coll_schedule->
                component_functions[INDEX(task_status)].constant_group_data;
            rc = coll_fn(&(task_status->ml_coll_operation->variable_fn_params),
                         (mca_bcol_base_function_t *)const_args);
            if (BCOL_FN_COMPLETE == rc) {
                ML_VERBOSE(3, ("GOT BCOL_COMPLETED!"));
                rc = mca_coll_ml_task_completion_processing(&task_status, PENDING_L);
                if (OMPI_SUCCESS != rc) {
                    mca_coll_ml_abort_ml("Failed to run mca_coll_ml_task_completion_processing");
                }
            } else if (BCOL_FN_STARTED == rc) {
                ML_VERBOSE(3, ("GOT BCOL_STARTED!"));
                (void) opal_list_remove_item(PENDING_L, (opal_list_item_t *)task_status);
                /* RLG - is there potential for deadlock here? Need to
                 * look at this closely
                 */
                OPAL_THREAD_LOCK(&(cm->active_tasks_mutex));
                opal_list_append(ACTIVE_L, (opal_list_item_t *)task_status);
                OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex));
            } else if (BCOL_FN_NOT_STARTED == rc) {
                /* nothing to do */
                ML_VERBOSE(10, ("GOT BCOL_FN_NOT_STARTED!"));
            } else {
                OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex));
                /* an error will be returned - RLG: need to reconsider return
                 * types - we have no way to convey error information
                 * the way the code is implemented now */
                ML_VERBOSE(3, ("GOT error!"));
                rc = OMPI_ERROR;
                OMPI_ERRHANDLER_RETURN(rc, MPI_COMM_WORLD, rc, "Error returned from bcol function: aborting");
                break;
            }
        }
    }
    OPAL_THREAD_UNLOCK(&(cm->pending_tasks_mutex));

    /* return */
    cm->progress_is_busy = false;

    return rc;
}

static void adjust_coll_config_by_mca_param(void)
{
    /* setting bcast mca params */
    if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) {
        mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_KNOWN;
        mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_KNOWN;
    } else if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) {
        mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_SEQUENTIAL;
        mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_SEQUENTIAL;
    } else { /* Unknown root */
        mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_UNKNOWN;
        mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_UNKNOWN;
    }
}

/*
 * Open the component
 */
static int ml_open(void)
{
    /* local variables */
    int rc, c_idx, m_idx;
    mca_coll_ml_component_t *cs = &mca_coll_ml_component;

    /* set the starting sequence number */
    cs->base_sequence_number = -1;
    cs->progress_is_busy = false;

    /* If the priority is zero (default) disable the component */
    if (mca_coll_ml_component.ml_priority <= 0) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* Init memory structures (no real memory is allocated) */
    OBJ_CONSTRUCT(&cs->memory_manager, mca_coll_ml_lmngr_t);

    if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_sbgp_base_framework, 0))) {
        fprintf(stderr, " failure in open mca_sbgp_base_open \n");
        return rc;
    }
    if (OMPI_SUCCESS != (rc = mca_base_framework_open(&ompi_bcol_base_framework, 0))) {
        fprintf(stderr, " failure in open mca_bcol_base_open \n");
        return rc;
    }

    /* Reset collective tunings cache */
    for (c_idx = 0; c_idx < ML_NUM_OF_FUNCTIONS; c_idx++) {
        for (m_idx = 0; m_idx < ML_NUM_MSG; m_idx++) {
            mca_coll_ml_reset_config(&cs->coll_config[c_idx][m_idx]);
        }
    }

    adjust_coll_config_by_mca_param();

    /* Load the configuration file and cache the configuration on the component */
    rc = mca_coll_ml_config_file_init();
    if (OMPI_SUCCESS != rc) {
        return OMPI_ERROR;
    }

    /* register the progress function */
    rc = opal_progress_register(coll_ml_progress);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, " failed to register the ml progress function \n");
        fflush(stderr);
        return rc;
    }

    OBJ_CONSTRUCT(&(cs->pending_tasks_mutex), opal_mutex_t);
    OBJ_CONSTRUCT(&(cs->pending_tasks), opal_list_t);
    OBJ_CONSTRUCT(&(cs->active_tasks_mutex), opal_mutex_t);
    OBJ_CONSTRUCT(&(cs->active_tasks), opal_list_t);
    OBJ_CONSTRUCT(&(cs->sequential_collectives_mutex), opal_mutex_t);
    OBJ_CONSTRUCT(&(cs->sequential_collectives), opal_list_t);

    rc = netpatterns_init();
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    cs->topo_discovery_fn[COLL_ML_HR_FULL] =
        mca_coll_ml_fulltree_hierarchy_discovery;

    cs->topo_discovery_fn[COLL_ML_HR_ALLREDUCE] =
        mca_coll_ml_allreduce_hierarchy_discovery;

    cs->topo_discovery_fn[COLL_ML_HR_NBS] =
        mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery;

    cs->topo_discovery_fn[COLL_ML_HR_SINGLE_PTP] =
        mca_coll_ml_fulltree_ptp_only_hierarchy_discovery;

    cs->topo_discovery_fn[COLL_ML_HR_SINGLE_IBOFFLOAD] =
        mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery;

    cs->need_allreduce_support = false;

    return OMPI_SUCCESS;
}

/*
 * Close the component
 */
static int ml_close(void)
{
    int ret;

    mca_coll_ml_component_t *cs = &mca_coll_ml_component;

    /* There is no need to release/close resources if the
     * priority was set to zero */
    if (cs->ml_priority <= 0) {
        return OMPI_SUCCESS;
    }

    OBJ_DESTRUCT(&cs->memory_manager);
    OBJ_DESTRUCT(&cs->pending_tasks_mutex);
    OBJ_DESTRUCT(&cs->pending_tasks);
    OBJ_DESTRUCT(&cs->active_tasks_mutex);
    OBJ_DESTRUCT(&cs->active_tasks);
    OBJ_DESTRUCT(&cs->sequential_collectives_mutex);
    OBJ_DESTRUCT(&cs->sequential_collectives);

    /* deregister the progress function */
    ret = opal_progress_unregister(coll_ml_progress);
    if (OMPI_SUCCESS != ret) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }

    /* close the sbgp and bcol frameworks */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_sbgp_base_framework))) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }

    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bcol_base_framework))) {
        OMPI_ERROR_LOG(ret);
        return ret;
    }

    return OMPI_SUCCESS;
}

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_coll_ml_init_query(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    int ret;

    /* at this stage there is no reason to disqualify this component */
    /* Add here bcol init and sbgp init */
    ret = mca_sbgp_base_init(enable_progress_threads, enable_mpi_threads);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    ret = mca_bcol_base_init(enable_progress_threads, enable_mpi_threads);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    /* done */
    return OMPI_SUCCESS;
}
@ -1,613 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2013-2016 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <string.h>
#include <ctype.h>
#include <stdlib.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "coll_ml.h"
#include "coll_ml_inlines.h"
#include "coll_ml_config.h"
#include "coll_ml_lex.h"

static char *key_buffer = NULL;
static size_t key_buffer_len = 0;

typedef struct section_config_t {
    char *section_name;
    int section_id;
    per_collective_configuration_t config;
} section_config_t;

typedef struct coll_config_t {
    char *coll_name;
    int coll_id;
    section_config_t section;
} coll_config_t;

static int algorithm_name_to_id(char *name)
{
    assert (NULL != name);
    if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_KNOWN"))
        return ML_BCAST_SMALL_DATA_KNOWN;
    if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_UNKNOWN"))
        return ML_BCAST_SMALL_DATA_UNKNOWN;
    if (!strcasecmp(name,"ML_BCAST_SMALL_DATA_SEQUENTIAL"))
        return ML_BCAST_SMALL_DATA_SEQUENTIAL;
    if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_KNOWN"))
        return ML_BCAST_LARGE_DATA_KNOWN;
    if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_UNKNOWN"))
        return ML_BCAST_LARGE_DATA_UNKNOWN;
    if (!strcasecmp(name,"ML_BCAST_LARGE_DATA_SEQUENTIAL"))
        return ML_BCAST_LARGE_DATA_SEQUENTIAL;
    if (!strcasecmp(name,"ML_N_DATASIZE_BINS"))
        return ML_N_DATASIZE_BINS;
    if (!strcasecmp(name,"ML_NUM_BCAST_FUNCTIONS"))
        return ML_NUM_BCAST_FUNCTIONS;
    if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_KNOWN"))
        return ML_SCATTER_SMALL_DATA_KNOWN;
    if (!strcasecmp(name,"ML_SCATTER_N_DATASIZE_BINS"))
        return ML_SCATTER_N_DATASIZE_BINS;
    if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_UNKNOWN"))
        return ML_SCATTER_SMALL_DATA_UNKNOWN;
    if (!strcasecmp(name,"ML_SCATTER_SMALL_DATA_SEQUENTIAL"))
        return ML_SCATTER_SMALL_DATA_SEQUENTIAL;
    if (!strcasecmp(name,"ML_NUM_SCATTER_FUNCTIONS"))
        return ML_NUM_SCATTER_FUNCTIONS;
    if (!strcasecmp(name,"ML_SMALL_DATA_ALLREDUCE"))
        return ML_SMALL_DATA_ALLREDUCE;
    if (!strcasecmp(name,"ML_LARGE_DATA_ALLREDUCE"))
        return ML_LARGE_DATA_ALLREDUCE;
    if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE"))
        return ML_SMALL_DATA_ALLREDUCE;
    if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE"))
        return ML_LARGE_DATA_ALLREDUCE;
    if (!strcasecmp(name,"ML_NUM_ALLREDUCE_FUNCTIONS"))
        return ML_NUM_ALLREDUCE_FUNCTIONS;
    if (!strcasecmp(name,"ML_SMALL_DATA_ALLTOALL"))
        return ML_SMALL_DATA_ALLTOALL;
    if (!strcasecmp(name,"ML_LARGE_DATA_ALLTOALL"))
        return ML_LARGE_DATA_ALLTOALL;
    if (!strcasecmp(name,"ML_NUM_ALLTOALL_FUNCTIONS"))
        return ML_NUM_ALLTOALL_FUNCTIONS;
    if (!strcasecmp(name,"ML_SMALL_DATA_ALLGATHER"))
        return ML_SMALL_DATA_ALLGATHER;
    if (!strcasecmp(name,"ML_LARGE_DATA_ALLGATHER"))
        return ML_LARGE_DATA_ALLGATHER;
    if (!strcasecmp(name,"ML_NUM_ALLGATHER_FUNCTIONS"))
        return ML_NUM_ALLGATHER_FUNCTIONS;
    if (!strcasecmp(name,"ML_SMALL_DATA_GATHER"))
        return ML_SMALL_DATA_GATHER;
    if (!strcasecmp(name,"ML_LARGE_DATA_GATHER"))
        return ML_LARGE_DATA_GATHER;
    if (!strcasecmp(name,"ML_NUM_GATHER_FUNCTIONS"))
        return ML_NUM_GATHER_FUNCTIONS;
    if (!strcasecmp(name,"ML_BARRIER_DEFAULT"))
        return ML_BARRIER_DEFAULT;

    /* ERROR */
    return ML_UNDEFINED;
}

static int hierarchy_name_to_id(char *name)
{
    assert (NULL != name);
    if (!strcasecmp(name, "FULL_HR")) {
        return COLL_ML_HR_FULL;
    }
    if (!strcasecmp(name, "FULL_HR_NO_BASESOCKET")) {
        return COLL_ML_HR_NBS;
    }
    if (!strcasecmp(name, "PTP_ONLY")) {
        return COLL_ML_HR_SINGLE_PTP;
    }
    if (!strcasecmp(name, "IBOFFLOAD_ONLY")) {
        return COLL_ML_HR_SINGLE_IBOFFLOAD;
    }
    /* Error */
    return ML_UNDEFINED;
}

static int section_name_to_id(char *name)
{
    assert (NULL != name);
    if (!strcasecmp(name, "SMALL")) {
        return ML_SMALL_MSG;
    }

    if (!strcasecmp(name, "LARGE")) {
        return ML_LARGE_MSG;
    }
    /* Error */
    return ML_UNDEFINED;
}

static int coll_name_to_id(char *name)
{
    assert (NULL != name);
    if (!strcasecmp(name, "ALLGATHER")) {
        return ML_ALLGATHER;
    }
    if (!strcasecmp(name, "ALLGATHERV")) {
        return ML_ALLGATHERV;
    }
    if (!strcasecmp(name, "ALLREDUCE")) {
        return ML_ALLREDUCE;
    }
    if (!strcasecmp(name, "ALLTOALL")) {
        return ML_ALLTOALL;
    }
    if (!strcasecmp(name, "ALLTOALLV")) {
        return ML_ALLTOALLV;
    }
    if (!strcasecmp(name, "ALLTOALLW")) {
        return ML_ALLTOALLW;
    }
    if (!strcasecmp(name, "BARRIER")) {
        return ML_BARRIER;
    }
    if (!strcasecmp(name, "BCAST")) {
        return ML_BCAST;
    }
    if (!strcasecmp(name, "EXSCAN")) {
        return ML_EXSCAN;
    }
    if (!strcasecmp(name, "GATHER")) {
        return ML_GATHER;
    }
    if (!strcasecmp(name, "GATHERV")) {
        return ML_GATHERV;
    }
    if (!strcasecmp(name, "REDUCE")) {
        return ML_REDUCE;
    }
    if (!strcasecmp(name, "REDUCE_SCATTER")) {
        return ML_REDUCE_SCATTER;
    }
    if (!strcasecmp(name, "SCAN")) {
        return ML_SCAN;
    }
    if (!strcasecmp(name, "SCATTER")) {
        return ML_SCATTER;
    }
    if (!strcasecmp(name, "SCATTERV")) {
        return ML_SCATTERV;
    }

    /* nonblocking functions */

    if (!strcasecmp(name, "IALLGATHER")) {
        return ML_IALLGATHER;
    }
    if (!strcasecmp(name, "IALLGATHERV")) {
        return ML_IALLGATHERV;
    }
    if (!strcasecmp(name, "IALLREDUCE")) {
        return ML_IALLREDUCE;
    }
    if (!strcasecmp(name, "IALLTOALL")) {
        return ML_IALLTOALL;
    }
    if (!strcasecmp(name, "IALLTOALLV")) {
        return ML_IALLTOALLV;
    }
    if (!strcasecmp(name, "IALLTOALLW")) {
        return ML_IALLTOALLW;
    }
    if (!strcasecmp(name, "IBARRIER")) {
        return ML_IBARRIER;
    }
    if (!strcasecmp(name, "IBCAST")) {
        return ML_IBCAST;
    }
    if (!strcasecmp(name, "IEXSCAN")) {
        return ML_IEXSCAN;
    }
    if (!strcasecmp(name, "IGATHER")) {
        return ML_IGATHER;
    }
    if (!strcasecmp(name, "IGATHERV")) {
        return ML_IGATHERV;
    }
    if (!strcasecmp(name, "IREDUCE")) {
        return ML_IREDUCE;
    }
    if (!strcasecmp(name, "IREDUCE_SCATTER")) {
        return ML_IREDUCE_SCATTER;
    }
    if (!strcasecmp(name, "ISCAN")) {
        return ML_ISCAN;
    }
    if (!strcasecmp(name, "ISCATTER")) {
        return ML_ISCATTER;
    }
    if (!strcasecmp(name, "ISCATTERV")) {
        return ML_ISCATTERV;
    }

    /* Error - collective name was not matched */
    return ML_UNDEFINED;
}
|
||||
static int set_collective_name(coll_config_t *coll_config)
{
    int coll_id = coll_name_to_id(coll_ml_config_yytext);

    if (ML_UNDEFINED == coll_id) {
        return OMPI_ERROR;
    }

    coll_config->coll_id = coll_id;
    coll_config->coll_name = strdup(coll_ml_config_yytext);

    return OMPI_SUCCESS;
}

static int set_section_name(section_config_t *section_config)
{
    int section_id;

    section_id = section_name_to_id(coll_ml_config_yytext);

    if (ML_UNDEFINED == section_id) {
        return OMPI_ERROR;
    }

    section_config->section_id = section_id;
    section_config->section_name = strdup(coll_ml_config_yytext);

    return OMPI_SUCCESS;
}

void mca_coll_ml_reset_config(per_collective_configuration_t *config)
{
    config->topology_id = ML_UNDEFINED;
    config->threshold = ML_UNDEFINED;
    config->algorithm_id = ML_UNDEFINED;
    config->fragmentation_enabled = ML_UNDEFINED;
}

static void reset_section(section_config_t *section_cf)
{
    if (section_cf->section_name) {
        free (section_cf->section_name);
        section_cf->section_name = NULL;
    }

    section_cf->section_id = ML_UNDEFINED;
    mca_coll_ml_reset_config(&section_cf->config);
}

static void reset_collective(coll_config_t *coll_cf)
{
    if (coll_cf->coll_name) {
        free (coll_cf->coll_name);
        coll_cf->coll_name = NULL;
    }

    coll_cf->coll_id = ML_UNDEFINED;
    reset_section(&coll_cf->section);
}

/*
 * String to integer
 */
static int string_to_int(char *str)
{
    while (isspace(*str)) {
        ++str;
    }

    /* values are plain decimal, so atoi() is enough */
    return atoi(str);
}

static int parse_algorithm_key(section_config_t *section, char *value)
{
    int ret;
    ret = algorithm_name_to_id(value);
    if (ML_UNDEFINED == ret) {
        return OMPI_ERROR;
    } else {
        section->config.algorithm_id = ret;
    }

    return OMPI_SUCCESS;
}

static int parse_threshold_key(section_config_t *section, char *value)
{
    assert (NULL != value);

    if (!strcasecmp(value, "unlimited")) {
        section->config.threshold = -1;
    } else {
        section->config.threshold = string_to_int(value);
    }

    return OMPI_SUCCESS;
}

static int parse_hierarchy_key(section_config_t *section, char *value)
{
    int ret;

    ret = hierarchy_name_to_id(value);
    if (ML_UNDEFINED == ret) {
        return OMPI_ERROR;
    }

    section->config.topology_id = ret;

    return OMPI_SUCCESS;
}

static int parse_fragmentation_key(section_config_t *section, char *value)
{
    assert (NULL != value);

    if (!strcasecmp(value, "enable")) {
        section->config.fragmentation_enabled = 1;
    } else if (!strcasecmp(value, "disable")) {
        section->config.fragmentation_enabled = 0;
    } else {
        ML_ERROR(("Line %d, unexpected fragmentation value %s. Legal values are: enable/disable",
                  coll_ml_config_yynewlines, value));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}

/* Save the configuration that has been collected so far */
static int save_settings(coll_config_t *coll_config)
{
    per_collective_configuration_t *cf;

    if (ML_UNDEFINED == coll_config->coll_id || ML_UNDEFINED == coll_config->section.section_id) {
        return OMPI_ERROR;
    }

    cf = &mca_coll_ml_component.coll_config[coll_config->coll_id][coll_config->section.section_id];

    cf->topology_id = coll_config->section.config.topology_id;
    cf->threshold = coll_config->section.config.threshold;
    cf->algorithm_id = coll_config->section.config.algorithm_id;
    cf->fragmentation_enabled = coll_config->section.config.fragmentation_enabled;

    return OMPI_SUCCESS;
}

/*
 * Parse a single line
 */
static int parse_line(section_config_t *section)
{
    int val, ret = OMPI_SUCCESS;
    char *value = NULL;

    /* Save the key name */
    if (key_buffer_len < strlen(coll_ml_config_yytext) + 1) {
        char *tmp;
        key_buffer_len = strlen(coll_ml_config_yytext) + 1;
        tmp = (char *) realloc(key_buffer, key_buffer_len);
        if (NULL == tmp) {
            free(key_buffer);
            key_buffer_len = 0;
            key_buffer = NULL;
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
        key_buffer = tmp;
    }
    strncpy(key_buffer, coll_ml_config_yytext, key_buffer_len);

    /* The first thing we have to see is an "=" */
    val = coll_ml_config_yylex();
    if (coll_ml_config_parse_done || COLL_ML_CONFIG_PARSE_EQUAL != val) {
        ML_ERROR(("Line %d, expected = before key: %s",
                  coll_ml_config_yynewlines,
                  key_buffer));
        return OMPI_ERROR;
    }

    /* Next we get the value */
    val = coll_ml_config_yylex();
    if (COLL_ML_CONFIG_PARSE_SINGLE_WORD == val ||
        COLL_ML_CONFIG_PARSE_VALUE == val) {
        value = strdup(coll_ml_config_yytext);
        if (NULL == value) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* Now we need to see the newline */
        val = coll_ml_config_yylex();
        if (COLL_ML_CONFIG_PARSE_NEWLINE != val &&
            COLL_ML_CONFIG_PARSE_DONE != val) {
            ML_ERROR(("Line %d, expected new line after %s",
                      coll_ml_config_yynewlines,
                      key_buffer));
            free(value);
            return OMPI_ERROR;
        }
    }
    /* If we did not get EOL or EOF, something is wrong */
    else if (COLL_ML_CONFIG_PARSE_DONE != val &&
             COLL_ML_CONFIG_PARSE_NEWLINE != val) {
        ML_ERROR(("Line %d, expected new line or end of file",
                  coll_ml_config_yynewlines));
        return OMPI_ERROR;
    } else {
        /* the key had no value at all */
        ML_ERROR(("Line %d malformed", coll_ml_config_yynewlines));
        return OMPI_ERROR;
    }

    /* Line parsing is done, read the values */
    if (!strcasecmp(key_buffer, "algorithm")) {
        ret = parse_algorithm_key(section, value);
    } else if (!strcasecmp(key_buffer, "threshold")) {
        ret = parse_threshold_key(section, value);
    } else if (!strcasecmp(key_buffer, "hierarchy")) {
        ret = parse_hierarchy_key(section, value);
    } else if (!strcasecmp(key_buffer, "fragmentation")) {
        ret = parse_fragmentation_key(section, value);
    /* Failed to parse the key */
    } else {
        ML_ERROR(("Line %d, unknown key %s",
                  coll_ml_config_yynewlines, key_buffer));
    }

    /* All done */
    free(value);

    return ret;
}

/**************************************************************************/

/*
 * Parse a single file
 */
static int parse_file(char *filename)
{
    int val;
    int ret = OMPI_SUCCESS;
    bool first_section = true, first_coll = true;
    coll_config_t coll_config;

    memset (&coll_config, 0, sizeof (coll_config));
    reset_collective(&coll_config);

    /* Open the file */
    coll_ml_config_yyin = fopen(filename, "r");
    if (NULL == coll_ml_config_yyin) {
        ML_ERROR(("Failed to open config file %s", filename));
        ret = OMPI_ERR_NOT_FOUND;
        goto cleanup;
    }

    /* Do the parsing */
    coll_ml_config_parse_done = false;
    coll_ml_config_yynewlines = 1;
    coll_ml_config_init_buffer(coll_ml_config_yyin);
    while (!coll_ml_config_parse_done) {
        val = coll_ml_config_yylex();
        switch (val) {
        case COLL_ML_CONFIG_PARSE_DONE:
        case COLL_ML_CONFIG_PARSE_NEWLINE:
            break;
        case COLL_ML_CONFIG_PARSE_COLLECTIVE:
            /* dump all the information to the last section that was defined */
            if (!first_coll) {
                ret = save_settings(&coll_config);

                if (OMPI_SUCCESS != ret) {
                    ML_ERROR(("Error in syntax for collective %s", coll_config.coll_name));
                    goto cleanup;
                }
            }

            /* reset the collective config */
            reset_collective(&coll_config);

            first_coll = false;
            first_section = true;

            ret = set_collective_name(&coll_config);
            if (OMPI_SUCCESS != ret) {
                goto cleanup;
            }
            break;
        case COLL_ML_CONFIG_PARSE_SECTION:
            if (ML_UNDEFINED == coll_config.coll_id) {
                ML_ERROR(("Collective section was not defined!"));
                ret = OMPI_ERROR;
                goto cleanup;
            }

            if (!first_section) {
                /* dump all the information to the last section that was defined */
                ret = save_settings(&coll_config);
                if (OMPI_SUCCESS != ret) {
                    ML_ERROR(("Error in syntax for collective %s section %s", coll_config.coll_name,
                              coll_config.section.section_name));
                    goto cleanup;
                }
            }

            first_section = false;

            /* reset all section values */
            reset_section(&coll_config.section);

            /* set the new section name */
            ret = set_section_name(&coll_config.section);
            if (OMPI_SUCCESS != ret) {
                goto cleanup;
            }
            break;
        case COLL_ML_CONFIG_PARSE_SINGLE_WORD:
            if (ML_UNDEFINED == coll_config.coll_id ||
                ML_UNDEFINED == coll_config.section.section_id) {
                ML_ERROR(("Collective section or sub-section was not defined!"));
                ret = OMPI_ERROR;
                goto cleanup;
            } else {
                parse_line(&coll_config.section);
            }
            break;

        default:
            /* anything else is an error */
            ML_ERROR(("Unexpected token!"));
            ret = OMPI_ERROR;
            goto cleanup;
        }
    }

    save_settings(&coll_config);
    fclose(coll_ml_config_yyin);
    coll_ml_config_yylex_destroy ();
    ret = OMPI_SUCCESS;

cleanup:
    reset_collective(&coll_config);
    if (NULL != key_buffer) {
        free(key_buffer);
        key_buffer = NULL;
        key_buffer_len = 0;
    }
    return ret;
}

int mca_coll_ml_config_file_init(void)
{
    return parse_file(mca_coll_ml_component.config_file_name);
}

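For reference, the file that parse_file() consumes is a small INI-style language: a collective name opens a block, a section name opens a message-size sub-section, and each remaining line is one key = value pair handled by parse_line() (keys: algorithm, threshold, hierarchy, fragmentation). A hypothetical fragment is sketched below; the bracket markers, the LARGE section name, and the full_hr hierarchy name are assumptions for illustration, since the accepted tokens are defined by the lexer and by section_name_to_id()/hierarchy_name_to_id() earlier in this file:

    [ALLREDUCE]
    <LARGE>
    algorithm = ML_LARGE_DATA_ALLREDUCE
    hierarchy = full_hr
    fragmentation = enable
    threshold = unlimited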
@ -1,23 +0,0 @@
#ifndef COLL_ML_CONFIG_H_
#define COLL_ML_CONFIG_H_

#include "opal_config.h"
#include <stdio.h>

BEGIN_C_DECLS

#define ML_UNDEFINED -1

struct per_collective_configuration_t {
    int topology_id;
    int threshold;
    int algorithm_id;
    int fragmentation_enabled;
};
typedef struct per_collective_configuration_t per_collective_configuration_t;

void mca_coll_ml_reset_config(per_collective_configuration_t *config);
int mca_coll_ml_config_file_init(void);

END_C_DECLS
#endif /* COLL_ML_CONFIG_H_ */
@ -1,131 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
#include "coll_ml_colls.h"
#include <unistd.h>
#include <sys/uio.h>

/* This routine re-orders and packs user data. The assumption is that
 * there is per-process data, the amount of data is the same for all
 * ranks, and the user data is contiguous.
 */
int mca_coll_ml_pack_reorder_contiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int i, rank;
    void *user_buf, *library_buf;
    size_t bytes_per_proc;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) coll_op->coll_module;
    mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;
    ptrdiff_t ptr_dif;

    /* get the offset into each process's data. The assumption is that
     * we are manipulating the same amount of data for each process.
     */

    /* figure out how much data per-proc to copy */
    bytes_per_proc = coll_op->fragment_data.per_rank_fragment_size;

    /* loop over all the ranks in the communicator */
    for (i = 0; i < ompi_comm_size(ml_module->comm); i++) {

        /* look up the rank of the i'th element in the sorted list */
        rank = topo_info->sort_list[i];

        /* get the pointer to the user data */
        user_buf = (void *) coll_op->full_message.src_user_addr;

        /* compute the offset into the user buffer, skipping the data
         * already processed in previous fragments */
        ptr_dif = rank * coll_op->full_message.n_bytes_per_proc_total +
            coll_op->fragment_data.offset_into_user_buffer_per_proc;
        user_buf = (void *) ((char *) user_buf + ptr_dif);

        /* get the pointer to the ML buffer */
        library_buf = (void *)
            ((char *) coll_op->variable_fn_params.src_desc->data_addr + i * bytes_per_proc);

        /* copy the data */
        memcpy(library_buf, user_buf, bytes_per_proc);
    }

    return OMPI_SUCCESS;
}

/* This routine re-orders and packs user data. The assumption is that
 * there is per-process data, the amount of data is the same for all
 * ranks, and the user data is non-contiguous.
 */
int mca_coll_ml_pack_reorder_noncontiguous_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int i, rank;
    void *user_buf, *library_buf;
    size_t bytes_per_proc;
    ptrdiff_t ptr_dif;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) coll_op->coll_module;
    mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info;

    /* figure out how much data per-proc to copy */
    bytes_per_proc = coll_op->fragment_data.per_rank_fragment_size;

    /* loop over all the ranks in the communicator */
    for (i = 0; i < ompi_comm_size(ml_module->comm); i++) {

        /* look up the rank of the i'th element in the sorted list */
        rank = topo_info->sort_list[i];

        /* get the pointer to the user data */
        user_buf = (void *) coll_op->full_message.src_user_addr;

        /* compute the offset into the user buffer, skipping the data
         * already processed in previous fragments */
        ptr_dif = rank * coll_op->full_message.send_count *
            coll_op->full_message.send_extent +
            coll_op->fragment_data.offset_into_user_buffer_per_proc;
        user_buf = (void *) ((char *) user_buf + ptr_dif);

        /* get the pointer to the ML buffer */
        library_buf = (void *)
            ((char *) coll_op->variable_fn_params.src_desc->data_addr + i * bytes_per_proc);

        /* copy the data */
        memcpy(library_buf, user_buf, bytes_per_proc);
    }

    return OMPI_SUCCESS;
}

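Both routines above are instances of one gather-side packing pattern: walk the communicator ranks in topology-sorted order and copy each rank's slice from its natural offset in the user buffer into consecutive slots of the ML library buffer; only the per-rank offset computation differs. A minimal self-contained sketch of that pattern, using plain C types instead of the OMPI structures (all names here are illustrative):

    #include <string.h>

    /* Pack per-rank slices from src into dst, re-ordered by sort_list.
     * src holds comm_size slices of slice_bytes each, laid out in rank
     * order; dst receives them in sorted-topology order. */
    static void pack_reorder(char *dst, const char *src,
                             const int *sort_list, int comm_size,
                             size_t slice_bytes)
    {
        for (int i = 0; i < comm_size; i++) {
            int rank = sort_list[i];   /* i'th rank in sorted order */
            memcpy(dst + (size_t) i * slice_bytes,
                   src + (size_t) rank * slice_bytes,
                   slice_bytes);
        }
    }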
@ -1,139 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014      The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2014      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include "opal/util/output.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_object.h"
#include "ompi/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/threads/mutex.h"
#include "opal/sys/atomic.h"

#include "ompi/op/op.h"
#include "ompi/constants.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_inlines.h"
#include "ompi/patterns/comm/coll_ops.h"

#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"

#include "ompi/mca/bcol/base/base.h"
#include "coll_ml_custom_utils.h"

/*
 * Local types
 */

struct avail_coll_t {
    opal_list_item_t super;
    int ac_priority;
    mca_coll_base_module_2_1_0_t *ac_module;
};
typedef struct avail_coll_t avail_coll_t;

/*
 * Stuff for the OBJ interface
 */

/*
 * If topo_index == COLL_ML_TOPO_MAX it looks over all possible topologies,
 * otherwise it looks only in the topology that was specified.
 */
int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module,
                                      int topo_index)
{
    int i, rc, hier, *ranks_in_comm,
        is_used = 0,
        comm_size = ompi_comm_size(ml_module->comm);
    int n_hier, tp, max_tp;
    const mca_coll_ml_topology_t *topo_info;

    ranks_in_comm = (int *) malloc(comm_size * sizeof(int));
    if (OPAL_UNLIKELY(NULL == ranks_in_comm)) {
        ML_ERROR(("Memory allocation failed."));
        ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_NO_MEM);
        /* not reached but causes a clang warning to not return here */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i = 0; i < comm_size; ++i) {
        ranks_in_comm[i] = i;
    }

    if (COLL_ML_TOPO_MAX == topo_index) {
        tp = 0;
        max_tp = COLL_ML_TOPO_MAX;
    } else {
        tp = topo_index;
        max_tp = topo_index + 1;
    }

    for (; tp < max_tp; tp++) {
        topo_info = &ml_module->topo_list[tp];
        n_hier = topo_info->n_levels;
        for (hier = 0; hier < n_hier; ++hier) {
            hierarchy_pairs *pair = &topo_info->component_pairs[hier];
            mca_bcol_base_component_t *b_cm = pair->bcol_component;
            if (0 == strcmp(bcol_name,
                            b_cm->bcol_version.mca_component_name)) {
                is_used = 1;
                break;
            }
        }
    }

    /* make sure every rank agrees: reduce the local flags with MPI_MAX */
    rc = comm_allreduce_pml(&is_used, &is_used, 1, MPI_INT,
                            ompi_comm_rank(ml_module->comm), MPI_MAX,
                            comm_size, ranks_in_comm, ml_module->comm);

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        ML_ERROR(("comm_allreduce_pml failed."));
        ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_OP);
    }

    free(ranks_in_comm);

    return is_used;
}

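/* A hedged usage sketch (not from this commit; "basesmuma" merely stands
 * in for a bcol component name): because the local flag is allreduced
 * with MPI_MAX, every rank gets the same answer, so the branch below is
 * taken by all ranks or by none.
 *
 *     if (mca_coll_ml_check_if_bcol_is_used("basesmuma", ml_module,
 *                                           COLL_ML_TOPO_MAX)) {
 *         ... collective-consistent decision ...
 *     }
 */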
/* Unlike the function above, this checks whether the user requested the
 * bcol component at all, not whether a given ml module uses it. */
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
{
    mca_base_component_list_item_t *bcol_comp;

    ML_VERBOSE(10, ("Loop over bcol components"));
    OPAL_LIST_FOREACH(bcol_comp, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) {
        if (0 == strcmp(component_name,
                        ((mca_bcol_base_component_2_0_0_t *)
                         bcol_comp->cli_component)->bcol_version.mca_component_name)) {
            return true;
        }
    }

    /* the component was not requested */
    return false;
}
@ -1,28 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#ifndef MCA_COLL_ML_CUSTOM_UTILS_H
#define MCA_COLL_ML_CUSTOM_UTILS_H

#include "ompi_config.h"

#include "coll_ml.h"

BEGIN_C_DECLS

/* this function is used to check if the bcol name is used in this ml module */
int mca_coll_ml_check_if_bcol_is_used(const char *bcol_name, const mca_coll_ml_module_t *ml_module,
                                      int topo_index);

/* this function is used to check if the bcol component was REQUESTED by the user */
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name);

END_C_DECLS

#endif /* MCA_COLL_ML_CUSTOM_UTILS_H */
@ -1,60 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"
#include "coll_ml.h"
#include "coll_ml_inlines.h"

static inline void mca_coll_ml_fragment_constructor(mca_coll_ml_fragment_t *frag)
{
    frag->fn_args = NULL;
}

static inline void mca_coll_ml_fragment_destructor(mca_coll_ml_fragment_t *frag)
{
    if (frag->fn_args) {
        free(frag->fn_args);
        frag->fn_args = NULL;
    }
}

static inline void mca_coll_ml_descriptor_constructor(mca_coll_ml_descriptor_t *descriptor)
{
    OBJ_CONSTRUCT(&(descriptor->fragment), mca_coll_ml_fragment_t);

    /* this fragment is always associated with this message descriptor */
    descriptor->fragment.full_msg_descriptor = descriptor;
}

static inline void mca_coll_ml_descriptor_destructor(mca_coll_ml_descriptor_t *descriptor)
{
    OBJ_DESTRUCT(&(descriptor->fragment));
}

OBJ_CLASS_INSTANCE(
    mca_coll_ml_fragment_t,
    opal_list_item_t,
    mca_coll_ml_fragment_constructor,
    mca_coll_ml_fragment_destructor);

OBJ_CLASS_INSTANCE(
    mca_coll_ml_descriptor_t,
    ompi_request_t,
    mca_coll_ml_descriptor_constructor,
    mca_coll_ml_descriptor_destructor);

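OBJ_CLASS_INSTANCE registers each type with OPAL's object system, chaining these constructors/destructors onto the parent class's. A brief sketch of the lifetime protocol this enables (the calling code is illustrative, not part of this commit):

    /* construction runs the opal_list_item_t constructor first, then
     * mca_coll_ml_fragment_constructor; the reference count starts at 1 */
    mca_coll_ml_fragment_t *frag = OBJ_NEW(mca_coll_ml_fragment_t);

    OBJ_RETAIN(frag);    /* a second owner: refcount -> 2 */
    OBJ_RELEASE(frag);   /* refcount -> 1 */
    OBJ_RELEASE(frag);   /* refcount -> 0: destructors run child-to-parent
                          * and the memory is freed */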
@ -1,132 +0,0 @@
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */

#ifndef MCA_COLL_ML_FUNCTIONS_H
#define MCA_COLL_ML_FUNCTIONS_H

#include "ompi_config.h"

BEGIN_C_DECLS

#define ML_MEMSYNC -100

enum {
    ML_BARRIER_DEFAULT
};

/* broadcast functions */
enum {
    /* small data algorithm */
    ML_BCAST_SMALL_DATA_KNOWN,
    /* small data - dynamic decision making supported */
    ML_BCAST_SMALL_DATA_UNKNOWN,
    /* sequential algorithm */
    ML_BCAST_SMALL_DATA_SEQUENTIAL,

    ML_BCAST_LARGE_DATA_KNOWN,

    ML_BCAST_LARGE_DATA_UNKNOWN,

    ML_BCAST_LARGE_DATA_SEQUENTIAL,

    /* marker - all routines above this are expected to be used in
     * selection logic that is based on the size of the data */
    ML_N_DATASIZE_BINS,

    /* number of functions - also counts some markers, but ... */
    ML_NUM_BCAST_FUNCTIONS
};

/* scatter functions */
enum {
    /* small data algorithm */
    ML_SCATTER_SMALL_DATA_KNOWN,

    /* marker - all routines above this are expected to be used in
     * selection logic that is based on the size of the data */
    ML_SCATTER_N_DATASIZE_BINS,

    /* small data - dynamic decision making supported */
    ML_SCATTER_SMALL_DATA_UNKNOWN,

    /* sequential algorithm */
    ML_SCATTER_SMALL_DATA_SEQUENTIAL,

    /* number of functions - also counts some markers, but ... */
    ML_NUM_SCATTER_FUNCTIONS
};

/* Allreduce functions */
enum {
    /* small data algorithm */
    ML_SMALL_DATA_ALLREDUCE,

    /* large data algorithm */
    ML_LARGE_DATA_ALLREDUCE,

    /* if some of the bcols do not support all possible types,
     * use these extra algorithms */
    /* small data algorithm */
    ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE,

    /* large data algorithm */
    ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE,

    /* number of functions */
    ML_NUM_ALLREDUCE_FUNCTIONS
};

/* Reduce functions */
enum {
    /* small data algorithm */
    ML_SMALL_DATA_REDUCE,

    /* large data algorithm */
    ML_LARGE_DATA_REDUCE,

    /* number of functions */
    ML_NUM_REDUCE_FUNCTIONS
};

/* Alltoall functions */
enum {
    /* small data algorithm */
    ML_SMALL_DATA_ALLTOALL,
    /* large data algorithm */
    ML_LARGE_DATA_ALLTOALL,
    /* number of functions */
    ML_NUM_ALLTOALL_FUNCTIONS
};

/* Allgather functions */
enum {
    /* small data */
    ML_SMALL_DATA_ALLGATHER,
    /* large data */
    ML_LARGE_DATA_ALLGATHER,
    /* number of functions */
    ML_NUM_ALLGATHER_FUNCTIONS
};

/* gather functions */
enum {
    /* small data */
    ML_SMALL_DATA_GATHER,
    /* large data */
    ML_LARGE_DATA_GATHER,
    /* number of functions */
    ML_NUM_GATHER_FUNCTIONS
};

END_C_DECLS

#endif /* MCA_COLL_ML_FUNCTIONS_H */
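The marker values such as ML_N_DATASIZE_BINS exist so that selection code can treat the enum values listed before the marker as size-binned variants of one collective. A hedged sketch of the kind of selection logic these enums support (the helper name and threshold parameter are illustrative):

    /* illustrative only: choose a bcast variant by message size */
    static int choose_bcast_fn(size_t msg_size, size_t small_threshold)
    {
        return (msg_size <= small_threshold)
            ? ML_BCAST_SMALL_DATA_KNOWN
            : ML_BCAST_LARGE_DATA_KNOWN;
    }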