1
1

Merge pull request #5087 from mkurnosov/base-reduce-scatter-block

coll/base: add recursive doubling algorithm for MPI_Reduce_scatter_block
Этот коммит содержится в:
Howard Pritchard 2018-05-04 11:58:26 -06:00 коммит произвёл GitHub
родитель ba40e2173d 8cf8553abd
Коммит 4e917b7692
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
11 изменённых файлов: 440 добавлений и 4 удалений

Просмотреть файл

@ -43,5 +43,6 @@ libmca_coll_la_SOURCES += \
base/coll_base_reduce.c \
base/coll_base_barrier.c \
base/coll_base_reduce_scatter.c \
base/coll_base_reduce_scatter_block.c \
base/coll_base_exscan.c \
base/coll_base_scan.c

Просмотреть файл

@ -249,6 +249,9 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
/* Reduce_scatter_block */
int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS);
/* Scan */
int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
int ompi_coll_base_scan_intra_linear(SCAN_ARGS);

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 Siberian State University of Telecommunications
* and Information Sciences. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/*
 * ompi_rounddown: returns the largest multiple of `factor` that does not
 * exceed `num` (for non-negative `num` and positive `factor`).
 *
 * Examples: rounddown(10,4) = 8, rounddown(6,3) = 6, rounddown(14,3) = 12
 */
static int ompi_rounddown(int num, int factor)
{
    /* Dropping the remainder is the same as floor(num / factor) * factor. */
    int remainder = num % factor;

    return num - remainder;
}
/*
 * ompi_coll_base_reduce_scatter_block_intra_recursivedoubling
 *
 * Function:  Recursive doubling algorithm for reduce_scatter_block.
 * Accepts:   Same as MPI_Reduce_scatter_block
 * Returns:   MPI_SUCCESS or error code
 *
 * Description: Implements recursive doubling algorithm for MPI_Reduce_scatter_block.
 *              The algorithm preserves order of operations so it can
 *              be used both by commutative and non-commutative operations.
 *
 * Time complexity: \alpha\log(p) + \beta*m(\log(p)-(p-1)/p) + \gamma*m(\log(p)-(p-1)/p),
 *                  where m = rcount * comm_size, p = comm_size
 * Memory requirements (per process): 2 * rcount * comm_size * typesize
 *
 * Notes:
 *  - Two temporary buffers of comm_size * rcount elements are allocated:
 *    tmpbuf holds the running partial result, tmprecv holds the data
 *    received at the current step.
 *  - Block lengths and displacements handed to the indexed datatype are
 *    plain ints, so comm_size * rcount is expected to fit in an int.
 *  - NOTE(review): when comm_size < 2 the function returns without copying
 *    anything into rbuf; presumably single-process communicators are served
 *    by another code path -- confirm against the callers.
 */
int
ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
    const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module)
{
    struct ompi_datatype_t *dtypesend = NULL, *dtyperecv = NULL;
    char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
    ptrdiff_t span, gap, totalcount, extent;
    int blocklens[2], displs[2];
    int err = MPI_SUCCESS;
    int comm_size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:reduce_scatter_block_intra_recursivedoubling: rank %d/%d",
                 rank, comm_size));

    /* Nothing to exchange: empty blocks or no peer to talk to. */
    if (0 == rcount || comm_size < 2) {
        return MPI_SUCCESS;
    }

    /*
     * Widen before multiplying: the product is stored in a ptrdiff_t, so it
     * must not be evaluated (and possibly overflow) in int arithmetic first.
     */
    totalcount = (ptrdiff_t)comm_size * rcount;
    ompi_datatype_type_extent(dtype, &extent);
    span = opal_datatype_span(&dtype->super, totalcount, &gap);

    tmpbuf_raw = malloc(span);
    tmprecv_raw = malloc(span);
    if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup_and_return;
    }
    /* Shift the raw allocations by the gap reported by opal_datatype_span() */
    tmpbuf = tmpbuf_raw - gap;
    tmprecv = tmprecv_raw - gap;

    /* Seed tmpbuf with the local contribution (taken from rbuf if in-place) */
    if (sbuf != MPI_IN_PLACE) {
        err = ompi_datatype_copy_content_same_ddt(dtype, totalcount, tmpbuf, (char *)sbuf);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    } else {
        err = ompi_datatype_copy_content_same_ddt(dtype, totalcount, tmpbuf, rbuf);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    }

    int is_commutative = ompi_op_is_commute(op);

    /* Recursive distance doubling */
    for (int mask = 1; mask < comm_size; mask <<= 1) {
        int remote = rank ^ mask;
        int cur_tree_root = ompi_rounddown(rank, mask);
        int remote_tree_root = ompi_rounddown(remote, mask);

        /*
         * Let m be the size of one block (rcount elements) and p be comm_size,
         * so p*m is the total message size in sbuf.
         * Step 1: processes send and recv (p*m-m) amount of data
         * Step 2: processes send and recv (p*m-2*m) amount of data
         * Step 3: processes send and recv (p*m-4*m) amount of data
         * ...
         * Step ceil(\log_2(p)): send and recv (p*m-m*2^floor{\log_2(p-1)})
         *
         * Send block from tmpbuf: [0..cur_tree_root - 1], [cur_tree_root + mask, p - 1]
         * Recv block into tmprecv: [0..remote_tree_root - 1], [remote_tree_root + mask, p - 1]
         */

        /* Send type */
        blocklens[0] = rcount * cur_tree_root;
        blocklens[1] = (comm_size >= cur_tree_root + mask) ?
                       rcount * (comm_size - cur_tree_root - mask) : 0;
        displs[0] = 0;
        displs[1] = comm_size * rcount - blocklens[1];
        err = ompi_datatype_create_indexed(2, blocklens, displs, dtype, &dtypesend);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        err = ompi_datatype_commit(&dtypesend);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }

        /*
         * Recv type. blocklens/displs keep the receive layout from here on;
         * the reduction below relies on them.
         */
        blocklens[0] = rcount * remote_tree_root;
        blocklens[1] = (comm_size >= remote_tree_root + mask) ?
                       rcount * (comm_size - remote_tree_root - mask) : 0;
        displs[0] = 0;
        displs[1] = comm_size * rcount - blocklens[1];
        err = ompi_datatype_create_indexed(2, blocklens, displs, dtype, &dtyperecv);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        err = ompi_datatype_commit(&dtyperecv);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }

        int is_block_received = 0;
        if (remote < comm_size) {
            err = ompi_coll_base_sendrecv(tmpbuf, 1, dtypesend, remote,
                                          MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          tmprecv, 1, dtyperecv, remote,
                                          MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          comm, MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
            is_block_received = 1;
        }

        /*
         * Non-power-of-two case: if a process did not have a destination
         * process to communicate with, the current result has to be sent
         * to it. A recursive halving search is used to find that process.
         */
        if (remote_tree_root + mask > comm_size) {
            /*
             * Compute the number of processes in current subtree
             * that have all the data
             */
            int nprocs_alldata = comm_size - cur_tree_root - mask;
            for (int rhalving_mask = mask >> 1; rhalving_mask > 0; rhalving_mask >>= 1) {
                remote = rank ^ rhalving_mask;
                int tree_root = ompi_rounddown(rank, rhalving_mask << 1);
                /*
                 * Send only if:
                 * 1) current process has data: (remote > rank) && (rank < tree_root + nprocs_alldata)
                 * 2) remote process does not have data at any step: remote >= tree_root + nprocs_alldata
                 */
                if ((remote > rank) && (rank < tree_root + nprocs_alldata)
                    && (remote >= tree_root + nprocs_alldata)) {
                    err = MCA_PML_CALL(send(tmprecv, 1, dtyperecv, remote,
                                            MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                            MCA_PML_BASE_SEND_STANDARD, comm));
                    if (MPI_SUCCESS != err) { goto cleanup_and_return; }
                } else if ((remote < rank) && (remote < tree_root + nprocs_alldata) &&
                           (rank >= tree_root + nprocs_alldata)) {
                    err = MCA_PML_CALL(recv(tmprecv, 1, dtyperecv, remote,
                                            MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                            comm, MPI_STATUS_IGNORE));
                    if (MPI_SUCCESS != err) { goto cleanup_and_return; }
                    is_block_received = 1;
                }
            }
        }

        if (is_block_received) {
            /* After reduction the result must be in tmpbuf */
            if (is_commutative || (remote_tree_root < cur_tree_root)) {
                ompi_op_reduce(op, tmprecv, tmpbuf, blocklens[0], dtype);
                ompi_op_reduce(op, tmprecv + (ptrdiff_t)displs[1] * extent,
                               tmpbuf + (ptrdiff_t)displs[1] * extent,
                               blocklens[1], dtype);
            } else {
                /*
                 * Operand order is swapped here, so the result lands in
                 * tmprecv and has to be copied back into tmpbuf afterwards.
                 */
                ompi_op_reduce(op, tmpbuf, tmprecv, blocklens[0], dtype);
                ompi_op_reduce(op, tmpbuf + (ptrdiff_t)displs[1] * extent,
                               tmprecv + (ptrdiff_t)displs[1] * extent,
                               blocklens[1], dtype);
                err = ompi_datatype_copy_content_same_ddt(dtyperecv, 1,
                                                          tmpbuf, tmprecv);
                if (MPI_SUCCESS != err) { goto cleanup_and_return; }
            }
        }

        /* The per-step datatypes are rebuilt on the next iteration */
        err = ompi_datatype_destroy(&dtypesend);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        err = ompi_datatype_destroy(&dtyperecv);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    }

    /* Deliver this rank's block of the reduced data */
    err = ompi_datatype_copy_content_same_ddt(dtype, rcount, rbuf,
                                              tmpbuf + (ptrdiff_t)rank * rcount * extent);
    if (MPI_SUCCESS != err) { goto cleanup_and_return; }

cleanup_and_return:
    if (dtypesend) {
        ompi_datatype_destroy(&dtypesend);
    }
    if (dtyperecv) {
        ompi_datatype_destroy(&dtyperecv);
    }
    /* free(NULL) is a no-op, so no guards are needed */
    free(tmpbuf_raw);
    free(tmprecv_raw);
    return err;
}

Просмотреть файл

@ -37,10 +37,11 @@
#define MCA_COLL_BASE_TAG_GATHERV -20
#define MCA_COLL_BASE_TAG_REDUCE -21
#define MCA_COLL_BASE_TAG_REDUCE_SCATTER -22
#define MCA_COLL_BASE_TAG_SCAN -23
#define MCA_COLL_BASE_TAG_SCATTER -24
#define MCA_COLL_BASE_TAG_SCATTERV -25
#define MCA_COLL_BASE_TAG_NONBLOCKING_BASE -26
#define MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK -23
#define MCA_COLL_BASE_TAG_SCAN -24
#define MCA_COLL_BASE_TAG_SCATTER -25
#define MCA_COLL_BASE_TAG_SCATTERV -26
#define MCA_COLL_BASE_TAG_NONBLOCKING_BASE -27
#define MCA_COLL_BASE_TAG_NONBLOCKING_END ((-1 * INT_MAX/2) + 1)
#define MCA_COLL_BASE_TAG_HCOLL_BASE (-1 * INT_MAX/2)
#define MCA_COLL_BASE_TAG_HCOLL_END (-1 * INT_MAX)

Просмотреть файл

@ -41,6 +41,7 @@ sources = \
coll_tuned_bcast_decision.c \
coll_tuned_reduce_scatter_decision.c \
coll_tuned_scatter_decision.c \
coll_tuned_reduce_scatter_block_decision.c \
coll_tuned_exscan_decision.c \
coll_tuned_scan_decision.c

Просмотреть файл

@ -150,6 +150,12 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
/* Reduce_scatter_block */
int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(REDUCESCATTERBLOCK_ARGS);
int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(REDUCESCATTERBLOCK_ARGS);
int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
/* Scatter */
int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);

Просмотреть файл

@ -187,6 +187,7 @@ static int tuned_register(void)
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
ompi_coll_tuned_reduce_scatter_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCESCATTER]);
ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCESCATTERBLOCK]);
ompi_coll_tuned_gather_intra_check_forced_init(&ompi_coll_tuned_forced_params[GATHER]);
ompi_coll_tuned_scatter_intra_check_forced_init(&ompi_coll_tuned_forced_params[SCATTER]);
ompi_coll_tuned_exscan_intra_check_forced_init(&ompi_coll_tuned_forced_params[EXSCAN]);

Просмотреть файл

@ -386,6 +386,58 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
dtype, op, comm, module);
}
/*
 * reduce_scatter_block_intra_dec
 *
 * Function:  - selects reduce_scatter_block algorithm to use
 * Accepts:   - same arguments as MPI_Reduce_scatter_block()
 * Returns:   - MPI_SUCCESS or error code (passed from
 *              the reduce_scatter_block implementation)
 *
 * Selection order:
 *   1. file-based rules for REDUCESCATTERBLOCK, if any are loaded and they
 *      yield a non-zero algorithm for this message size;
 *   2. the user-forced algorithm, if non-zero;
 *   3. the fixed decision function.
 */
int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, void *rbuf,
                                                            int rcount,
                                                            struct ompi_datatype_t *dtype,
                                                            struct ompi_op_t *op,
                                                            struct ompi_communicator_t *comm,
                                                            mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_dec_dynamic"));

    /* check to see if we have some filebased rules */
    if (tuned_module->com_rules[REDUCESCATTERBLOCK]) {
        /* we do, so calc the message size or what ever we need and use
           this for the evaluation */
        int alg, faninout, segsize, ignoreme, size;
        size_t dsize;

        size = ompi_comm_size(comm);
        ompi_datatype_type_size (dtype, &dsize);
        /*
         * Total message size in bytes. The count product is formed in
         * size_t so that rcount * size cannot overflow int first.
         */
        dsize *= (size_t)rcount * (size_t)size;

        alg = ompi_coll_tuned_get_target_method_params(tuned_module->com_rules[REDUCESCATTERBLOCK],
                                                       dsize, &faninout,
                                                       &segsize, &ignoreme);
        if (alg) {
            /* we have found a valid choice from the file based rules for this message size */
            return ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
                                                                       op, comm, module,
                                                                       alg, faninout, segsize);
        } /* found a method */
    } /* end if any com rules to check */

    if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) {
        return ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
                                                                  op, comm, module,
                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].segsize);
    }

    return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed (sbuf, rbuf, rcount,
                                                                 dtype, op, comm, module);
}
/*
* allgather_intra_dec
*

Просмотреть файл

@ -500,6 +500,26 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf
comm, module);
}
/*
 * reduce_scatter_block_intra_dec
 *
 * Function:  - selects the reduce_scatter_block algorithm to use; the fixed
 *              decision always picks the recursive doubling implementation
 * Accepts:   - same arguments as MPI_Reduce_scatter_block()
 * Returns:   - MPI_SUCCESS or error code (passed through from
 *              the selected implementation)
 */
int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void *rbuf,
                                                          int rcount,
                                                          struct ompi_datatype_t *dtype,
                                                          struct ompi_op_t *op,
                                                          struct ompi_communicator_t *comm,
                                                          mca_coll_base_module_t *module)
{
    int rc;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed"));

    rc = ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount,
                                                                     dtype, op, comm, module);
    return rc;
}
/*
* allgather_intra_dec
*

Просмотреть файл

@ -108,6 +108,7 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
tuned_module->super.coll_gatherv = NULL;
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_fixed;
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_fixed;
tuned_module->super.coll_reduce_scatter_block = ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed;
tuned_module->super.coll_scan = NULL;
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_fixed;
tuned_module->super.coll_scatterv = NULL;
@ -240,6 +241,8 @@ tuned_module_enable( mca_coll_base_module_t *module,
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCESCATTER,
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCESCATTERBLOCK,
tuned_module->super.coll_reduce_scatter_block = ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCAN,
tuned_module->super.coll_scan = ompi_coll_tuned_scan_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTER,

Просмотреть файл

@ -0,0 +1,127 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 Siberian State University of Telecommunications
* and Information Sciences. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
/*
 * reduce_scatter_block algorithm variables.
 * Each one is the backing storage for an MCA variable registered in
 * ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init().
 */
/* forced algorithm id; 0 means "ignore" (see the table below) */
static int coll_tuned_reduce_scatter_block_forced_algorithm = 0;
/* segment size in bytes for forced algorithms; 0 means no segmentation */
static int coll_tuned_reduce_scatter_block_segment_size = 0;
/* n-tree fanout; assigned from ompi_coll_tuned_init_tree_fanout at registration */
static int coll_tuned_reduce_scatter_block_tree_fanout;
/*
 * valid values for coll_tuned_reduce_scatter_block_forced_algorithm;
 * the entry with a NULL string terminates the table
 */
static mca_base_var_enum_value_t reduce_scatter_block_algorithms[] = {
{0, "ignore"},
{1, "recursive_doubling"},
{0, NULL}
};
/**
 * The following are used by dynamic and forced rules.
 *
 * Publishes the details of each algorithm and whether it is
 * forced/fixed/locked in. As you add methods/algorithms you must update
 * this and the query/map routines.
 *
 * This routine is called by the component only. It makes sure that the MCA
 * parameters are set to their initial values. Modules do not call this
 * routine; they call the forced_getvalues routine instead.
 *
 * @param mca_param_indices  receives the indices of the registered
 *                           algorithm, segment size and tree fanout variables
 * @return MPI_SUCCESS, or the negative index returned by the registration
 *         of the "reduce_scatter_block_algorithm" variable if that fails
 */
int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    /*
     * Start from NULL: the result of mca_base_var_enum_create() is not
     * checked, so without an initializer a failed creation could leave an
     * indeterminate pointer that is later released.
     */
    mca_base_var_enum_t *new_enum = NULL;
    int cnt;

    /* Count the table entries up to the NULL-string terminator */
    for( cnt = 0; NULL != reduce_scatter_block_algorithms[cnt].string; cnt++ );
    ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK] = cnt;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "reduce_scatter_block_algorithm_count",
                                           "Number of reduce_scatter_block algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK]);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_reduce_scatter_block_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_reduce_scatter_block_algorithms", reduce_scatter_block_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "reduce_scatter_block_algorithm",
                                        "Which reduce reduce_scatter_block algorithm is used. "
                                        "Can be locked down to choice of: 0 ignore, 1 recursive doubling",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_ALL,
                                        &coll_tuned_reduce_scatter_block_forced_algorithm);
    /* Drop our reference only if an enumerator was actually created */
    if (NULL != new_enum) {
        OBJ_RELEASE(new_enum);
    }
    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }

    coll_tuned_reduce_scatter_block_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "reduce_scatter_block_algorithm_segmentsize",
                                        "Segment size in bytes used by default for reduce_scatter_block algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_ALL,
                                        &coll_tuned_reduce_scatter_block_segment_size);

    coll_tuned_reduce_scatter_block_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "reduce_scatter_block_algorithm_tree_fanout",
                                        "Fanout for n-tree used for reduce_scatter_block algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_ALL,
                                        &coll_tuned_reduce_scatter_block_tree_fanout);

    return (MPI_SUCCESS);
}
/*
 * Runs the reduce_scatter_block implementation identified by `algorithm`:
 *   0 - defer to the fixed decision function
 *   1 - recursive doubling
 * Any other value is reported on the tuned output stream and rejected with
 * MPI_ERR_ARG. `faninout` and `segsize` are only reported in the trace
 * output; none of the implementations selected here receives them.
 */
int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *rbuf,
                                                        int rcount,
                                                        struct ompi_datatype_t *dtype,
                                                        struct ompi_op_t *op,
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module,
                                                        int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d",
                 algorithm, faninout, segsize));

    if (0 == algorithm) {
        return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(sbuf, rbuf, rcount,
                                                                    dtype, op, comm, module);
    }
    if (1 == algorithm) {
        return ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount,
                                                                           dtype, op, comm, module);
    }

    /* Unknown algorithm id */
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK]));
    return (MPI_ERR_ARG);
}