Merge pull request #3451 from mkurnosov/reduce-allreduce-rebenseifner
coll: Add Rabenseifner's algorithm for Reduce and Allreduce
This commit is contained in:
Commit 47ebfaa60d
35 ompi/mca/coll/spacc/Makefile.am (Normal file)
@@ -0,0 +1,35 @@
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

sources = \
        coll_spacc.h \
        coll_spacc_component.c \
        coll_spacc_module.c \
        coll_spacc_allreduce.c \
        coll_spacc_reduce.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_ompi_coll_spacc_DSO
component_noinst =
component_install = mca_coll_spacc.la
else
component_noinst = libmca_coll_spacc.la
component_install =
endif

mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_spacc_la_SOURCES = $(sources)
mca_coll_spacc_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_spacc_la_SOURCES = $(sources)
libmca_coll_spacc_la_LDFLAGS = -module -avoid-version
83 ompi/mca/coll/spacc/coll_spacc.h (Normal file)
@@ -0,0 +1,83 @@
/*
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_COLL_SPACC_EXPORT_H
#define MCA_COLL_SPACC_EXPORT_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/coll/coll.h"

BEGIN_C_DECLS

/* Globally exported variables */
extern int ompi_coll_spacc_stream;
extern int ompi_coll_spacc_priority;

/* API functions */

int ompi_coll_spacc_init_query(bool enable_progress_threads,
                               bool enable_mpi_threads);

mca_coll_base_module_t *
ompi_coll_spacc_comm_query(struct ompi_communicator_t *comm, int *priority);

int mca_coll_spacc_allreduce_intra_redscat_allgather(
    const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module);

int mca_coll_spacc_reduce_intra_redscat_gather(
    const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module);

struct mca_coll_spacc_component_t {
    /* Base coll component */
    mca_coll_base_component_2_0_0_t super;

    /* MCA parameter: priority of this component */
    int spacc_priority;
};

/*
 * Convenience typedef
 */
typedef struct mca_coll_spacc_component_t mca_coll_spacc_component_t;

/*
 * Global component instance
 */
OMPI_MODULE_DECLSPEC extern mca_coll_spacc_component_t mca_coll_spacc_component;

struct mca_coll_spacc_module_t {
    mca_coll_base_module_t super;
};
typedef struct mca_coll_spacc_module_t mca_coll_spacc_module_t;
OBJ_CLASS_DECLARATION(mca_coll_spacc_module_t);

END_C_DECLS

#endif /* MCA_COLL_SPACC_EXPORT_H */
354 ompi/mca/coll/spacc/coll_spacc_allreduce.c (Normal file)
@@ -0,0 +1,354 @@
/*
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "coll_spacc.h"

#include "mpi.h"
#include "ompi/constants.h"
#include "opal/util/bit_ops.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"

/*
 * mca_coll_spacc_allreduce_intra_redscat_allgather
 *
 * Function:  Allreduce using Rabenseifner's algorithm.
 * Accepts:   Same arguments as MPI_Allreduce
 * Returns:   MPI_SUCCESS or error code
 *
 * Description: an implementation of Rabenseifner's allreduce algorithm [1, 2].
 *   [1] Rajeev Thakur, Rolf Rabenseifner and William Gropp.
 *       Optimization of Collective Communication Operations in MPICH //
 *       The Int. Journal of High Performance Computing Applications,
 *       Vol. 19, Issue 1, pp. 49--66.
 *   [2] http://www.hlrs.de/mpi/myreduce.html
 *
 * This algorithm is a combination of a reduce-scatter implemented with
 * recursive vector halving and recursive distance doubling, followed by
 * an allgather implemented with recursive doubling [1].
 *
 * Step 1. If the number of processes is not a power of two, reduce it to
 * the nearest lower power of two (p' = 2^{\floor{\log_2 p}})
 * by removing r = p - p' extra processes as follows. In the first 2r processes
 * (ranks 0 to 2r - 1), all the even ranks send the second half of the input
 * vector to their right neighbor (rank + 1), and all the odd ranks send
 * the first half of the input vector to their left neighbor (rank - 1).
 * The even ranks compute the reduction on the first half of the vector and
 * the odd ranks compute the reduction on the second half. The odd ranks then
 * send the result to their left neighbors (the even ranks). As a result,
 * the even ranks among the first 2r processes now contain the reduction with
 * the input vector on their right neighbors (the odd ranks). These odd ranks
 * do not participate in the rest of the algorithm, which leaves behind
 * a power-of-two number of processes. The first r even-ranked processes and
 * the last p - 2r processes are now renumbered from 0 to p' - 1.
 *
 * Step 2. The remaining processes now perform a reduce-scatter by using
 * recursive vector halving and recursive distance doubling. The even-ranked
 * processes send the second half of their buffer to rank + 1 and the odd-ranked
 * processes send the first half of their buffer to rank - 1. All processes
 * then compute the reduction between the local buffer and the received buffer.
 * In the next log_2(p') - 1 steps, the buffers are recursively halved, and the
 * distance is doubled. At the end, each of the p' processes has 1 / p' of the
 * total reduction result.
 *
 * Step 3. An allgather is performed by using recursive vector doubling and
 * distance halving. All exchanges are executed in reverse order relative
 * to recursive doubling in the previous step. If the number of processes is not
 * a power of two, the total result vector must be sent to the r processes
 * that were removed in the first step.
 *
 * Limitations:
 *   count >= 2^{\floor{\log_2 p}}
 *   commutative operations only
 *   intra-communicators only
 *
 * Memory requirements (per process):
 *   count * typesize + 4 * log_2(p) * sizeof(int) = O(count)
 */
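
/*
 * Worked example of the Step 1 renumbering: for p = 10 processes, p' = 8 and
 * r = p - p' = 2, so the first 2r = 4 ranks form the pairs (0,1) and (2,3).
 * The odd ranks 1 and 3 drop out after handing their partial results to their
 * even neighbors, and the survivors are renumbered as
 *
 *   rank:  0 2 4 5 6 7 8 9
 *   vrank: 0 1 2 3 4 5 6 7
 *
 * which is exactly the mapping computed below: vrank = rank / 2 for even
 * ranks below 2r, and vrank = rank - r for ranks >= 2r.
 */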

int mca_coll_spacc_allreduce_intra_redscat_allgather(
    const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module)
{
    int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;

    int comm_size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_spacc_stream,
                 "coll:spacc:allreduce_intra_redscat_allgather: rank %d/%d",
                 rank, comm_size));

    /* Find the nearest power of two less than or equal to comm_size */
    int nsteps = opal_hibit(comm_size, comm->c_cube_dim + 1); /* ilog2(comm_size) */
    int nprocs_pof2 = 1 << nsteps;                            /* flp2(comm_size) */
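
    /* For example, comm_size = 10 gives nsteps = 3 and nprocs_pof2 = 8;
     * opal_hibit() returns the position of the highest set bit, i.e.
     * floor(log2(comm_size)) for positive arguments. */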

    if (count < nprocs_pof2 || !ompi_op_is_commute(op)) {
        OPAL_OUTPUT((ompi_coll_spacc_stream,
                     "coll:spacc:allreduce_intra_redscat_allgather: rank %d/%d count %d switching to base allreduce",
                     rank, comm_size, count));
        return ompi_coll_base_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype,
                                                           op, comm, module);
    }

    int err = MPI_SUCCESS;

    ptrdiff_t lb, extent, dsize, gap = 0;
    ompi_datatype_get_extent(dtype, &lb, &extent);
    dsize = opal_datatype_span(&dtype->super, count, &gap);

    /* Temporary buffer for receiving messages */
    char *tmp_buf = NULL;
    char *tmp_buf_raw = (char *)malloc(dsize);
    if (NULL == tmp_buf_raw)
        return OMPI_ERR_OUT_OF_RESOURCE;
    tmp_buf = tmp_buf_raw - gap;
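    /* opal_datatype_span() reports the memory span needed to hold count
     * elements plus the gap from the allocation start to the first element,
     * so shifting the raw pointer back by gap yields a buffer that can be
     * indexed as usual even for datatypes with a non-trivial lower bound. */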

    if (sbuf != MPI_IN_PLACE) {
        /* Copy sbuf to rbuf */
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
                                                  (char *)sbuf);
    }

    /*
     * Step 1. Reduce the number of processes to the nearest lower power of two
     * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
     * 1. In the first 2r processes (ranks 0 to 2r - 1), all the even ranks send
     *    the second half of the input vector to their right neighbor (rank + 1)
     *    and all the odd ranks send the first half of the input vector to their
     *    left neighbor (rank - 1).
     * 2. All 2r processes compute the reduction on their half.
     * 3. The odd ranks then send the result to their left neighbors
     *    (the even ranks).
     *
     * The even ranks (0 to 2r - 1) now contain the reduction with the input
     * vector on their right neighbors (the odd ranks). The first r even
     * processes and the last p - 2r processes are renumbered from
     * 0 to 2^{\floor{\log_2 p}} - 1.
     */

    int vrank, step, wsize;
    int nprocs_rem = comm_size - nprocs_pof2;

    if (rank < 2 * nprocs_rem) {
        int count_lhalf = count / 2;
        int count_rhalf = count - count_lhalf;

        if (rank % 2 != 0) {
            /*
             * Odd process -- exchange with rank - 1
             * Send the left half of the input vector to the left neighbor,
             * Recv the right half of the input vector from the left neighbor
             */
            err = ompi_coll_base_sendrecv(rbuf, count_lhalf, dtype, rank - 1,
                                          MCA_COLL_BASE_TAG_ALLREDUCE,
                                          (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
                                          count_rhalf, dtype, rank - 1,
                                          MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Reduce on the right half of the buffers (result in rbuf) */
            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
                           (char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                           count_rhalf, dtype);

            /* Send the right half to the left neighbor */
            err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                    count_rhalf, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* This process does not participate in the recursive doubling phase */
            vrank = -1;

        } else {
            /*
             * Even process -- exchange with rank + 1
             * Send the right half of the input vector to the right neighbor,
             * Recv the left half of the input vector from the right neighbor
             */
            err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                          count_rhalf, dtype, rank + 1,
                                          MCA_COLL_BASE_TAG_ALLREDUCE,
                                          tmp_buf, count_lhalf, dtype, rank + 1,
                                          MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Reduce on the left half of the buffers (result in rbuf) */
            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);

            /* Recv the right half from the right neighbor */
            err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                    count_rhalf, dtype, rank + 1,
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            vrank = rank / 2;
        }
    } else { /* rank >= 2 * nprocs_rem */
        vrank = rank - nprocs_rem;
    }

    /*
     * Step 2. Reduce-scatter implemented with recursive vector halving and
     * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}}
     * power-of-two number of processes with new ranks (vrank) and result in rbuf.
     *
     * The even-ranked processes send the right half of their buffer to rank + 1
     * and the odd-ranked processes send the left half of their buffer to
     * rank - 1. All processes then compute the reduction between the local
     * buffer and the received buffer. In the next \log_2(p') - 1 steps, the
     * buffers are recursively halved, and the distance is doubled. At the end,
     * each of the p' processes has 1 / p' of the total reduction result.
     */
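    /*
     * For example, with p' = 4 and count = 8 the window shrinks as follows:
     *   step 0 (mask = 1): exchange 4 elements, reduce 4, wsize: 8 -> 4
     *   step 1 (mask = 2): exchange 2 elements, reduce 2, wsize: 4 -> 2
     * leaving each process with count / p' = 2 elements of the final result.
     */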
    rindex = malloc(sizeof(*rindex) * nsteps);
    sindex = malloc(sizeof(*sindex) * nsteps);
    rcount = malloc(sizeof(*rcount) * nsteps);
    scount = malloc(sizeof(*scount) * nsteps);
    if (NULL == rindex || NULL == sindex || NULL == rcount || NULL == scount) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup_and_return;
    }

    if (vrank != -1) {
        step = 0;
        wsize = count;
        sindex[0] = rindex[0] = 0;

        for (int mask = 1; mask < nprocs_pof2; mask <<= 1) {
            /*
             * On each iteration: rindex[step] = sindex[step] -- beginning of the
             * current window. The length of the current window is stored in wsize.
             */
            int vdest = vrank ^ mask;
            /* Translate the virtual rank vdest to a real rank */
            int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;

            if (rank < dest) {
                /*
                 * Recv into the left half of the current window, send the right
                 * half of the window to the peer (perform reduce on the left
                 * half of the current window)
                 */
                rcount[step] = wsize / 2;
                scount[step] = wsize - rcount[step];
                sindex[step] = rindex[step] + rcount[step];
            } else {
                /*
                 * Recv into the right half of the current window, send the left
                 * half of the window to the peer (perform reduce on the right
                 * half of the current window)
                 */
                scount[step] = wsize / 2;
                rcount[step] = wsize - scount[step];
                rindex[step] = sindex[step] + scount[step];
            }

            /* Send part of the data from rbuf, recv into tmp_buf */
            err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)sindex[step] * extent,
                                          scount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_ALLREDUCE,
                                          (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
                                          rcount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
                           rcount[step], dtype);

            /* Move the current window to the received message (guard the final
             * iteration so we do not write past the nsteps-element index arrays) */
            if (step + 1 < nsteps) {
                rindex[step + 1] = rindex[step];
                sindex[step + 1] = rindex[step];
                wsize = rcount[step];
                step++;
            }
        }
    }
    /*
     * Assertion: each process has 1 / p' of the total reduction result:
     * rcount[nsteps - 1] elements in rbuf[rindex[nsteps - 1], ...].
     */

    /*
     * Step 3. Allgather by the recursive doubling algorithm.
     * Each process has 1 / p' of the total reduction result:
     * rcount[nsteps - 1] elements in rbuf[rindex[nsteps - 1], ...].
     * All exchanges are executed in reverse order relative
     * to recursive doubling (previous step).
     */
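    /*
     * Continuing the p' = 4, count = 8 example: the allgather walks the same
     * windows backwards, so each exchange doubles the owned block:
     *   step 1 (mask = 2): exchange 2 elements, window: 2 -> 4
     *   step 0 (mask = 1): exchange 4 elements, window: 4 -> 8
     * after which every participating process holds all count elements.
     */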

    if (vrank != -1) {
        step = nsteps - 1; /* step = ilog2(p') - 1 */

        for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) {
            int vdest = vrank ^ mask;
            /* Translate the virtual rank vdest to a real rank */
            int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;

            /*
             * Send rcount[step] elements from rbuf[rindex[step]...]
             * Recv scount[step] elements to rbuf[sindex[step]...]
             */
            err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)rindex[step] * extent,
                                          rcount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_ALLREDUCE,
                                          (char *)rbuf + (ptrdiff_t)sindex[step] * extent,
                                          scount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
            step--;
        }
    }

    /*
     * Step 4. Send the total result to the excluded odd ranks.
     */
    if (rank < 2 * nprocs_rem) {
        if (rank % 2 != 0) {
            /* Odd process -- recv the result from rank - 1 */
            err = MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

        } else {
            /* Even process -- send the result to rank + 1 */
            err = MCA_PML_CALL(send(rbuf, count, dtype, rank + 1,
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        }
    }

cleanup_and_return:
    if (NULL != tmp_buf_raw)
        free(tmp_buf_raw);
    if (NULL != rindex)
        free(rindex);
    if (NULL != sindex)
        free(sindex);
    if (NULL != rcount)
        free(rcount);
    if (NULL != scount)
        free(scount);

    return err;
}
104 ompi/mca/coll/spacc/coll_spacc_component.c (Normal file)
@@ -0,0 +1,104 @@
/*
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/coll/coll.h"
#include "coll_spacc.h"

/*
 * Public string showing the coll ompi_spacc component version number
 */
const char *ompi_coll_spacc_component_version_string =
    "Open MPI SPACC collective MCA component version " OMPI_VERSION;

/*
 * Global variables
 */
int ompi_coll_spacc_priority = 5;
int ompi_coll_spacc_stream = -1;

/*
 * Local functions
 */
static int spacc_register(void);
static int spacc_open(void);
static int spacc_close(void);

/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
mca_coll_spacc_component_t mca_coll_spacc_component = {
    /* First, fill in the super */
    {
        /* First, the mca_component_t struct containing meta information
           about the component itself */
        .collm_version = {
            MCA_COLL_BASE_VERSION_2_0_0,

            /* Component name and version */
            .mca_component_name = "spacc",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            /* Component open and close functions */
            .mca_open_component = spacc_open,
            .mca_close_component = spacc_close,
            .mca_register_component_params = spacc_register,
        },
        .collm_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        /* Initialization / querying functions */
        .collm_init_query = ompi_coll_spacc_init_query,
        .collm_comm_query = ompi_coll_spacc_comm_query,
    }
};

static int spacc_register(void)
{
    /* Use a low priority, but allow other components to be lower */
    ompi_coll_spacc_priority = 5;
    (void)mca_base_component_var_register(&mca_coll_spacc_component.super.collm_version,
                                          "priority", "Priority of the spacc coll component",
                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                          OPAL_INFO_LVL_6,
                                          MCA_BASE_VAR_SCOPE_READONLY,
                                          &ompi_coll_spacc_priority);
    return OMPI_SUCCESS;
}
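
/*
 * With the default priority of 5 this component loses the selection to the
 * usual defaults (e.g. tuned), so to exercise these routines one would raise
 * the priority at run time. Assuming the standard MCA parameter naming
 * <framework>_<component>_<param>, something like:
 *
 *   mpirun --mca coll_spacc_priority 100 ./app
 */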

static int spacc_open(void)
{
#if OPAL_ENABLE_DEBUG
    {
        int param;

        param = mca_base_var_find("ompi", "coll", "base", "verbose");
        if (param >= 0) {
            const int *verbose = NULL;
            mca_base_var_get_value(param, &verbose, NULL, NULL);
            if (verbose && verbose[0] > 0) {
                ompi_coll_spacc_stream = opal_output_open(NULL);
            }
        }
    }
#endif /* OPAL_ENABLE_DEBUG */
    OPAL_OUTPUT((ompi_coll_spacc_stream, "coll:spacc:component_open: done"));
    return OMPI_SUCCESS;
}

static int spacc_close(void)
{
    OPAL_OUTPUT((ompi_coll_spacc_stream, "coll:spacc:component_close: done"));
    return OMPI_SUCCESS;
}
97 ompi/mca/coll/spacc/coll_spacc_module.c (Normal file)
@@ -0,0 +1,97 @@
/*
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "coll_spacc.h"

static int spacc_module_enable(mca_coll_base_module_t *module,
                               struct ompi_communicator_t *comm);

/*
 * Initial query function that is invoked during MPI_INIT, allowing
 * this component to disqualify itself if it doesn't support the
 * required level of thread support.
 */
int ompi_coll_spacc_init_query(bool enable_progress_threads,
                               bool enable_mpi_threads)
{
    return OMPI_SUCCESS;
}

/*
 * Invoked when a new communicator has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return.
 */
mca_coll_base_module_t *ompi_coll_spacc_comm_query(
    struct ompi_communicator_t *comm, int *priority)
{
    mca_coll_spacc_module_t *spacc_module;

    OPAL_OUTPUT((ompi_coll_spacc_stream, "coll:spacc:module_spacc query called"));

    if (OMPI_COMM_IS_INTER(comm)) {
        *priority = 0;
        return NULL;
    }

    if (OMPI_COMM_IS_INTRA(comm) && ompi_comm_size(comm) < 2) {
        *priority = 0;
        return NULL;
    }

    spacc_module = OBJ_NEW(mca_coll_spacc_module_t);
    if (NULL == spacc_module)
        return NULL;

    *priority = ompi_coll_spacc_priority;

    spacc_module->super.coll_module_enable = spacc_module_enable;
    spacc_module->super.ft_event = NULL;
    spacc_module->super.coll_allgather = NULL;
    spacc_module->super.coll_allgatherv = NULL;
    spacc_module->super.coll_allreduce = mca_coll_spacc_allreduce_intra_redscat_allgather;
    spacc_module->super.coll_alltoall = NULL;
    spacc_module->super.coll_alltoallv = NULL;
    spacc_module->super.coll_alltoallw = NULL;
    spacc_module->super.coll_barrier = NULL;
    spacc_module->super.coll_bcast = NULL;
    spacc_module->super.coll_exscan = NULL;
    spacc_module->super.coll_gather = NULL;
    spacc_module->super.coll_gatherv = NULL;
    spacc_module->super.coll_reduce = mca_coll_spacc_reduce_intra_redscat_gather;
    spacc_module->super.coll_reduce_scatter_block = NULL;
    spacc_module->super.coll_reduce_scatter = NULL;
    spacc_module->super.coll_scan = NULL;
    spacc_module->super.coll_scatter = NULL;
    spacc_module->super.coll_scatterv = NULL;

    return &(spacc_module->super);
}
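
/*
 * Note that only coll_allreduce and coll_reduce are set; the coll framework
 * is expected to fill the NULL entries from other (lower-priority) modules,
 * so this module only ever handles the two collectives implemented here.
 */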

/*
 * Init module on the communicator
 */
static int spacc_module_enable(mca_coll_base_module_t *module,
                               struct ompi_communicator_t *comm)
{
    OPAL_OUTPUT((ompi_coll_spacc_stream, "coll:spacc:module_enable called."));
    return OMPI_SUCCESS;
}

static void mca_coll_spacc_module_construct(mca_coll_spacc_module_t *module)
{
    /* mca_coll_spacc_module_t *spacc_module = (mca_coll_spacc_module_t *)module; */
}

OBJ_CLASS_INSTANCE(mca_coll_spacc_module_t, mca_coll_base_module_t,
                   mca_coll_spacc_module_construct, NULL);
413 ompi/mca/coll/spacc/coll_spacc_reduce.c (Normal file)
@@ -0,0 +1,413 @@
/*
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "coll_spacc.h"

#include "mpi.h"
#include "ompi/constants.h"
#include "opal/util/bit_ops.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"

/*
 * mca_coll_spacc_reduce_intra_redscat_gather
 *
 * Function:  Reduce using Rabenseifner's algorithm.
 * Accepts:   Same arguments as MPI_Reduce
 * Returns:   MPI_SUCCESS or error code
 *
 * Description: an implementation of Rabenseifner's reduce algorithm [1, 2].
 *   [1] Rajeev Thakur, Rolf Rabenseifner and William Gropp.
 *       Optimization of Collective Communication Operations in MPICH //
 *       The Int. Journal of High Performance Computing Applications,
 *       Vol. 19, Issue 1, pp. 49--66.
 *   [2] http://www.hlrs.de/mpi/myreduce.html
 *
 * This algorithm is a combination of a reduce-scatter implemented with
 * recursive vector halving and recursive distance doubling, followed by
 * a binomial tree gather [1].
 *
 * Step 1. If the number of processes is not a power of two, reduce it to
 * the nearest lower power of two (p' = 2^{\floor{\log_2 p}})
 * by removing r = p - p' extra processes as follows. In the first 2r processes
 * (ranks 0 to 2r - 1), all the even ranks send the second half of the input
 * vector to their right neighbor (rank + 1), and all the odd ranks send
 * the first half of the input vector to their left neighbor (rank - 1).
 * The even ranks compute the reduction on the first half of the vector and
 * the odd ranks compute the reduction on the second half. The odd ranks then
 * send the result to their left neighbors (the even ranks). As a result,
 * the even ranks among the first 2r processes now contain the reduction with
 * the input vector on their right neighbors (the odd ranks). These odd ranks
 * do not participate in the rest of the algorithm, which leaves behind
 * a power-of-two number of processes. The first r even-ranked processes and
 * the last p - 2r processes are now renumbered from 0 to p' - 1.
 *
 * Step 2. The remaining processes now perform a reduce-scatter by using
 * recursive vector halving and recursive distance doubling. The even-ranked
 * processes send the second half of their buffer to rank + 1 and the odd-ranked
 * processes send the first half of their buffer to rank - 1. All processes
 * then compute the reduction between the local buffer and the received buffer.
 * In the next log_2(p') - 1 steps, the buffers are recursively halved, and the
 * distance is doubled. At the end, each of the p' processes has 1 / p' of the
 * total reduction result.
 *
 * Step 3. A binomial tree gather is performed by using recursive vector
 * doubling and distance halving. In the non-power-of-two case, if the root
 * happens to be one of those odd-ranked processes that would normally
 * be removed in the first step, then the role of this process and process 0
 * are interchanged.
 *
 * Limitations:
 *   count >= 2^{\floor{\log_2 p}}
 *   commutative operations only
 *   intra-communicators only
 *
 * Memory requirements (per process):
 *   rank != root: 2 * count * typesize + 4 * log_2(p) * sizeof(int) = O(count)
 *   rank == root: count * typesize + 4 * log_2(p) * sizeof(int) = O(count)
 *
 * Recommendation: root = 0; otherwise additional steps are required at the
 * root process.
 */

int mca_coll_spacc_reduce_intra_redscat_gather(
    const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module)
{
    int comm_size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_spacc_stream,
                 "coll:spacc:reduce_intra_redscat_gather: rank %d/%d, root %d",
                 rank, comm_size, root));

    /* Find the nearest power of two less than or equal to comm_size */
    int nsteps = opal_hibit(comm_size, comm->c_cube_dim + 1); /* ilog2(comm_size) */
    int nprocs_pof2 = 1 << nsteps;                            /* flp2(comm_size) */

    if (count < nprocs_pof2 || !ompi_op_is_commute(op)) {
        OPAL_OUTPUT((ompi_coll_spacc_stream,
                     "coll:spacc:reduce_intra_redscat_gather: rank %d/%d count %d switching to base reduce",
                     rank, comm_size, count));
        return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
                                                        op, root, comm, module);
    }

    int err = MPI_SUCCESS;
    int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;

    ptrdiff_t lb, extent, dsize, gap;
    ompi_datatype_get_extent(dtype, &lb, &extent);
    dsize = opal_datatype_span(&dtype->super, count, &gap);

    /* Temporary buffer for receiving messages */
    char *tmp_buf = NULL;
    char *tmp_buf_raw = (char *)malloc(dsize);
    if (NULL == tmp_buf_raw)
        return OMPI_ERR_OUT_OF_RESOURCE;
    tmp_buf = tmp_buf_raw - gap;

    char *rbuf_raw = NULL;
    if (rank != root) {
        rbuf_raw = (char *)malloc(dsize);
        if (NULL == rbuf_raw) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        rbuf = rbuf_raw - gap;
    }
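    /* MPI_Reduce only requires a valid rbuf at the root, so non-root ranks
     * just allocated a scratch buffer above and use it as their working rbuf
     * for the rest of the algorithm; it is freed in cleanup_and_return. */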

    if ((rank != root) || (sbuf != MPI_IN_PLACE)) {
        /* Copy sbuf to rbuf */
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf,
                                                  (char *)sbuf);
    }

    /*
     * Step 1. Reduce the number of processes to the nearest lower power of two
     * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
     * 1. In the first 2r processes (ranks 0 to 2r - 1), all the even ranks send
     *    the second half of the input vector to their right neighbor (rank + 1)
     *    and all the odd ranks send the first half of the input vector to their
     *    left neighbor (rank - 1).
     * 2. All 2r processes compute the reduction on their half.
     * 3. The odd ranks then send the result to their left neighbors
     *    (the even ranks).
     *
     * The even ranks (0 to 2r - 1) now contain the reduction with the input
     * vector on their right neighbors (the odd ranks). The first r even
     * processes and the last p - 2r processes are renumbered from
     * 0 to 2^{\floor{\log_2 p}} - 1. These odd ranks do not participate in the
     * rest of the algorithm.
     */

    int vrank, step, wsize;
    int nprocs_rem = comm_size - nprocs_pof2;

    if (rank < 2 * nprocs_rem) {
        int count_lhalf = count / 2;
        int count_rhalf = count - count_lhalf;

        if (rank % 2 != 0) {
            /*
             * Odd process -- exchange with rank - 1
             * Send the left half of the input vector to the left neighbor,
             * Recv the right half of the input vector from the left neighbor
             */
            err = ompi_coll_base_sendrecv(rbuf, count_lhalf, dtype, rank - 1,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
                                          count_rhalf, dtype, rank - 1,
                                          MCA_COLL_BASE_TAG_REDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Reduce on the right half of the buffers (result in rbuf) */
            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
                           (char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                           count_rhalf, dtype);

            /* Send the right half to the left neighbor */
            err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                    count_rhalf, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* This process does not participate in the recursive doubling phase */
            vrank = -1;

        } else {
            /*
             * Even process -- exchange with rank + 1
             * Send the right half of the input vector to the right neighbor,
             * Recv the left half of the input vector from the right neighbor
             */
            err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                          count_rhalf, dtype, rank + 1,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          tmp_buf, count_lhalf, dtype, rank + 1,
                                          MCA_COLL_BASE_TAG_REDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Reduce on the left half of the buffers (result in rbuf) */
            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);

            /* Recv the right half from the right neighbor */
            err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
                                    count_rhalf, dtype, rank + 1,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            vrank = rank / 2;
        }
    } else { /* rank >= 2 * nprocs_rem */
        vrank = rank - nprocs_rem;
    }

    /*
     * Step 2. Reduce-scatter implemented with recursive vector halving and
     * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}}
     * power-of-two number of processes with new ranks (vrank) and result in rbuf.
     *
     * The even-ranked processes send the right half of their buffer to rank + 1
     * and the odd-ranked processes send the left half of their buffer to
     * rank - 1. All processes then compute the reduction between the local
     * buffer and the received buffer. In the next \log_2(p') - 1 steps, the
     * buffers are recursively halved, and the distance is doubled. At the end,
     * each of the p' processes has 1 / p' of the total reduction result.
     */

    rindex = malloc(sizeof(*rindex) * nsteps); /* O(\log_2(p)) */
    sindex = malloc(sizeof(*sindex) * nsteps);
    rcount = malloc(sizeof(*rcount) * nsteps);
    scount = malloc(sizeof(*scount) * nsteps);
    if (NULL == rindex || NULL == sindex || NULL == rcount || NULL == scount) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup_and_return;
    }

    if (vrank != -1) {
        step = 0;
        wsize = count;
        sindex[0] = rindex[0] = 0;

        for (int mask = 1; mask < nprocs_pof2; mask <<= 1) {
            /*
             * On each iteration: rindex[step] = sindex[step] -- beginning of the
             * current window. The length of the current window is stored in wsize.
             */
            int vdest = vrank ^ mask;
            /* Translate the virtual rank vdest to a real rank */
            int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;

            if (rank < dest) {
                /*
                 * Recv into the left half of the current window, send the right
                 * half of the window to the peer (perform reduce on the left
                 * half of the current window)
                 */
                rcount[step] = wsize / 2;
                scount[step] = wsize - rcount[step];
                sindex[step] = rindex[step] + rcount[step];
            } else {
                /*
                 * Recv into the right half of the current window, send the left
                 * half of the window to the peer (perform reduce on the right
                 * half of the current window)
                 */
                scount[step] = wsize / 2;
                rcount[step] = wsize - scount[step];
                rindex[step] = sindex[step] + scount[step];
            }

            /* Send part of the data from rbuf, recv into tmp_buf */
            err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)sindex[step] * extent,
                                          scount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
                                          rcount[step], dtype, dest,
                                          MCA_COLL_BASE_TAG_REDUCE, comm,
                                          MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
                           rcount[step], dtype);

            /* Move the current window to the received message (guard the final
             * iteration so we do not write past the nsteps-element index arrays) */
            if (step + 1 < nsteps) {
                rindex[step + 1] = rindex[step];
                sindex[step + 1] = rindex[step];
                wsize = rcount[step];
                step++;
            }
        }
    }
    /*
     * Assertion: each process has 1 / p' of the total reduction result:
     * rcount[nsteps - 1] elements in rbuf[rindex[nsteps - 1], ...].
     */

    /*
     * Set up the root process for the gather operation.
     * Case 1: root < 2r and root is odd -- the root was excluded in step 1;
     *         recv the data from process 0, vroot = 0, vrank = 0
     * Case 2: root < 2r and root is even: vroot = root / 2
     * Case 3: root >= 2r: vroot = root - r
     */
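    /*
     * For example, with p = 10 (r = 2): root = 1 is Case 1 (odd, < 2r), so
     * vroot = 0 and the process with vrank 0 forwards its block to rank 1;
     * root = 2 is Case 2, vroot = 1; root = 7 is Case 3, vroot = 7 - 2 = 5.
     */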

    int vroot = 0;
    if (root < 2 * nprocs_rem) {
        if (root % 2 != 0) {
            vroot = 0;
            if (rank == root) {
                /*
                 * Case 1: root < 2r and root is odd -- the root process was
                 * excluded in step 1 (vrank == -1).
                 * Recv the data from process 0.
                 */
                rindex[0] = 0;
                step = 0, wsize = count;
                for (int mask = 1; mask < nprocs_pof2; mask *= 2) {
                    rcount[step] = wsize / 2;
                    scount[step] = wsize - rcount[step];
                    rindex[step] = 0;
                    sindex[step] = rcount[step];
                    step++;
                    wsize /= 2;
                }

                err = MCA_PML_CALL(recv(rbuf, rcount[nsteps - 1], dtype, 0,
                                        MCA_COLL_BASE_TAG_REDUCE, comm,
                                        MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err) { goto cleanup_and_return; }
                vrank = 0;

            } else if (vrank == 0) {
                /* Send the data to the root */
                err = MCA_PML_CALL(send(rbuf, rcount[nsteps - 1], dtype, root,
                                        MCA_COLL_BASE_TAG_REDUCE,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err) { goto cleanup_and_return; }
                vrank = -1;
            }
        } else {
            /* Case 2: root < 2r and root is even: vroot = root / 2 */
            vroot = root / 2;
        }
    } else {
        /* Case 3: root >= 2r: vroot = root - r */
        vroot = root - nprocs_rem;
    }

    /*
     * Step 3. Gather the result at vroot by the binomial tree algorithm.
     * Each process has 1 / p' of the total reduction result:
     * rcount[nsteps - 1] elements in rbuf[rindex[nsteps - 1], ...].
     * All exchanges are executed in reverse order relative
     * to recursive doubling (previous step).
     */

    if (vrank != -1) {
        int vdest_tree, vroot_tree;
        step = nsteps - 1; /* step = ilog2(p') - 1 */

        for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) {
            int vdest = vrank ^ mask;
            /* Translate the virtual rank vdest to a real rank */
            int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem;
            if ((vdest == 0) && (root < 2 * nprocs_rem) && (root % 2 != 0))
                dest = root;

            vdest_tree = vdest >> step;
            vdest_tree <<= step;
            vroot_tree = vroot >> step;
            vroot_tree <<= step;
            if (vdest_tree == vroot_tree) {
                /* Send data from rbuf and exit */
                err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)rindex[step] * extent,
                                        rcount[step], dtype, dest,
                                        MCA_COLL_BASE_TAG_REDUCE,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err) { goto cleanup_and_return; }
                break;
            } else {
                /* Recv and continue */
                err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)sindex[step] * extent,
                                        scount[step], dtype, dest,
                                        MCA_COLL_BASE_TAG_REDUCE, comm,
                                        MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err) { goto cleanup_and_return; }
            }
            step--;
        }
    }

cleanup_and_return:
    if (NULL != tmp_buf_raw)
        free(tmp_buf_raw);
    if (NULL != rbuf_raw)
        free(rbuf_raw);
    if (NULL != rindex)
        free(rindex);
    if (NULL != sindex)
        free(sindex);
    if (NULL != rcount)
        free(rcount);
    if (NULL != scount)
        free(scount);

    return err;
}
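
For reference, a minimal sketch of how the new component could be exercised end to end. This is a hypothetical standalone test, not part of this commit, and it assumes the component is selected by raising its priority MCA parameter (standard <framework>_<component>_<param> naming):

/* test_spacc.c -- hypothetical smoke test, not part of this commit.
 * Build: mpicc -o test_spacc test_spacc.c
 * Run:   mpirun -n 10 --mca coll_spacc_priority 100 ./test_spacc
 * MPI_SUM is commutative and COUNT >= flp2(nprocs) for modest job sizes,
 * so the redscat_allgather/redscat_gather paths are eligible. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    enum { COUNT = 1024 };
    double in[COUNT], allsum[COUNT], rootsum[COUNT];
    for (int i = 0; i < COUNT; i++)
        in[i] = rank + i;

    /* Allreduce: every rank receives the full reduction */
    MPI_Allreduce(in, allsum, COUNT, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    /* Reduce: only the root (rank 0, the recommended root) receives it */
    MPI_Reduce(in, rootsum, COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (0 == rank)
        printf("allreduce[0] = %g, reduce[0] = %g (expect equal)\n",
               allsum[0], rootsum[0]);

    MPI_Finalize();
    return 0;
}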