2013-10-28 23:06:38 +04:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
2004-01-12 00:26:55 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2014-04-06 22:23:49 +04:00
|
|
|
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2015-06-24 06:59:57 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2004-11-28 23:09:25 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2008-10-07 23:44:51 +04:00
|
|
|
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
2012-04-04 19:11:03 +04:00
|
|
|
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
2013-10-28 23:06:38 +04:00
|
|
|
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2014-11-04 06:16:30 +03:00
|
|
|
* Copyright (c) 2014 Research Organization for Information Science
|
|
|
|
* and Technology (RIST). All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
2015-06-24 06:59:57 +03:00
|
|
|
*
|
2004-11-22 04:38:40 +03:00
|
|
|
* Additional copyrights may follow
|
2015-06-24 06:59:57 +03:00
|
|
|
*
|
2004-01-12 00:26:55 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
#include "ompi_config.h"
|
2004-02-15 00:26:30 +03:00
|
|
|
#include "coll_basic.h"
|
2004-01-12 00:26:55 +03:00
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <errno.h>
|
|
|
|
|
|
|
|
#include "mpi.h"
|
- Check, whether the compiler supports __builtin_clz (count leading
zeroes);
if so, use it for bit-operations like opal_cube_dim and opal_hibit.
Implement two versions of power-of-two.
In case of opal_next_poweroftwo, this reduces the average execution
time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining,
measured rdtsc, with loop over 2^27 values).
Numbers for other functions are similar (but of course heavily depend
on the usage, e.g. opal_hibit() with a start of 4 does not save
much). The bsr instruction on AMD Opteron is also not as fast.
- Replace various places where the next power-of-two is computed.
Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and
Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes.
This commit was SVN r25270.
2011-10-12 02:49:01 +04:00
|
|
|
#include "opal/util/bit_ops.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/constants.h"
|
2005-09-13 00:21:53 +04:00
|
|
|
#include "ompi/mca/coll/coll.h"
|
|
|
|
#include "ompi/mca/coll/base/coll_tags.h"
|
2009-03-04 20:06:51 +03:00
|
|
|
#include "ompi/mca/pml/pml.h"
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
#include "ompi/datatype/ompi_datatype.h"
|
2004-01-12 00:26:55 +03:00
|
|
|
#include "coll_basic.h"
|
2005-09-13 00:21:53 +04:00
|
|
|
#include "ompi/op/op.h"
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* Buffer-size threshold (in bytes) below which the recursive-halving
 * algorithm is used for commutative operations.  The expansion is
 * parenthesized so the macro behaves as a single value inside any
 * larger expression. */
#define COMMUTATIVE_LONG_MSG (8 * 1024 * 1024)
|
2004-01-12 00:26:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* reduce_scatter
|
|
|
|
*
|
|
|
|
* Function: - reduce then scatter
|
|
|
|
* Accepts: - same as MPI_Reduce_scatter()
|
|
|
|
* Returns: - MPI_SUCCESS or error code
|
2007-01-29 22:29:35 +03:00
|
|
|
*
|
|
|
|
* Algorithm:
|
|
|
|
* Cummutative, reasonable sized messages
|
|
|
|
* recursive halving algorithm
|
|
|
|
* Others:
|
2015-06-24 06:59:57 +03:00
|
|
|
* reduce and scatterv (needs to be cleaned
|
2007-01-29 22:29:35 +03:00
|
|
|
* up at some point)
|
|
|
|
*
|
|
|
|
* NOTE: that the recursive halving algorithm should be faster than
|
|
|
|
* the reduce/scatter for all message sizes. However, the memory
|
|
|
|
* usage for the recusive halving is msg_size + 2 * comm_size greater
|
|
|
|
* for the recursive halving, so I've limited where the recursive
|
|
|
|
* halving is used to be nice to the app memory wise. There are much
|
|
|
|
* better algorithms for large messages with cummutative operations,
|
|
|
|
* so this should be investigated further.
|
2004-01-12 00:26:55 +03:00
|
|
|
*/
|
2005-08-10 14:51:42 +04:00
|
|
|
int
|
|
|
|
mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts,
|
2005-08-10 21:53:43 +04:00
|
|
|
struct ompi_datatype_t *dtype,
|
|
|
|
struct ompi_op_t *op,
|
2007-08-19 07:37:49 +04:00
|
|
|
struct ompi_communicator_t *comm,
|
2008-07-29 02:40:57 +04:00
|
|
|
mca_coll_base_module_t *module)
|
2004-01-12 00:26:55 +03:00
|
|
|
{
|
2007-01-29 22:29:35 +03:00
|
|
|
int i, rank, size, count, err = OMPI_SUCCESS;
|
|
|
|
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
|
2005-08-10 14:51:42 +04:00
|
|
|
int *disps = NULL;
|
2007-01-29 22:29:35 +03:00
|
|
|
char *recv_buf = NULL, *recv_buf_free = NULL;
|
|
|
|
char *result_buf = NULL, *result_buf_free = NULL;
|
2005-08-10 14:51:42 +04:00
|
|
|
/* Initialize */
|
|
|
|
rank = ompi_comm_rank(comm);
|
|
|
|
size = ompi_comm_size(comm);
|
2004-06-29 04:02:25 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* Find displacements and the like */
|
|
|
|
disps = (int*) malloc(sizeof(int) * size);
|
|
|
|
if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE;
|
2004-06-29 04:02:25 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
disps[0] = 0;
|
|
|
|
for (i = 0; i < (size - 1); ++i) {
|
|
|
|
disps[i + 1] = disps[i] + rcounts[i];
|
2004-01-12 00:26:55 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
count = disps[size - 1] + rcounts[size - 1];
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* short cut the trivial case */
|
|
|
|
if (0 == count) {
|
|
|
|
free(disps);
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get datatype information */
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
ompi_datatype_get_extent(dtype, &lb, &extent);
|
|
|
|
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
2007-01-29 22:29:35 +03:00
|
|
|
buf_size = true_extent + (count - 1) * extent;
|
2005-08-10 21:53:43 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* Handle MPI_IN_PLACE */
|
|
|
|
if (MPI_IN_PLACE == sbuf) {
|
|
|
|
sbuf = rbuf;
|
|
|
|
}
|
2005-08-10 21:53:43 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) &&
|
2013-10-28 23:06:38 +04:00
|
|
|
(buf_size < COMMUTATIVE_LONG_MSG)) {
|
- Check, whether the compiler supports __builtin_clz (count leading
zeroes);
if so, use it for bit-operations like opal_cube_dim and opal_hibit.
Implement two versions of power-of-two.
In case of opal_next_poweroftwo, this reduces the average execution
time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining,
measured rdtsc, with loop over 2^27 values).
Numbers for other functions are similar (but of course heavily depend
on the usage, e.g. opal_hibit() with a start of 4 does not save
much). The bsr instruction on AMD Opteron is also not as fast.
- Replace various places where the next power-of-two is computed.
Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and
Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes.
This commit was SVN r25270.
2011-10-12 02:49:01 +04:00
|
|
|
int tmp_size, remain = 0, tmp_rank;
|
2005-08-10 21:53:43 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* temporary receive buffer. See coll_basic_reduce.c for details on sizing */
|
|
|
|
recv_buf_free = (char*) malloc(buf_size);
|
2014-11-14 07:22:01 +03:00
|
|
|
recv_buf = recv_buf_free - true_lb;
|
2007-01-29 22:29:35 +03:00
|
|
|
if (NULL == recv_buf_free) {
|
|
|
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
2005-08-10 21:53:43 +04:00
|
|
|
}
|
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* allocate temporary buffer for results */
|
|
|
|
result_buf_free = (char*) malloc(buf_size);
|
2014-11-14 07:22:01 +03:00
|
|
|
result_buf = result_buf_free - true_lb;
|
2007-01-29 22:29:35 +03:00
|
|
|
|
|
|
|
/* copy local buffer into the temporary results */
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
|
2007-01-29 22:29:35 +03:00
|
|
|
if (OMPI_SUCCESS != err) goto cleanup;
|
|
|
|
|
|
|
|
/* figure out power of two mapping: grow until larger than
|
|
|
|
comm size, then go back one, to get the largest power of
|
|
|
|
two less than comm size */
|
- Check, whether the compiler supports __builtin_clz (count leading
zeroes);
if so, use it for bit-operations like opal_cube_dim and opal_hibit.
Implement two versions of power-of-two.
In case of opal_next_poweroftwo, this reduces the average execution
time from 83 cycles to 4 cycles (Intel Nehalem, icc, -O2, inlining,
measured rdtsc, with loop over 2^27 values).
Numbers for other functions are similar (but of course heavily depend
on the usage, e.g. opal_hibit() with a start of 4 does not save
much). The bsr instruction on AMD Opteron is also not as fast.
- Replace various places where the next power-of-two is computed.
Tested on Intel Nehalem Cluster with openib, compilers GNU-4.6.1 and
Intel-12.0.4 using mpi_testsuite -t "Collective" with 128 processes.
This commit was SVN r25270.
2011-10-12 02:49:01 +04:00
|
|
|
tmp_size = opal_next_poweroftwo(size);
|
2007-01-29 22:29:35 +03:00
|
|
|
tmp_size >>= 1;
|
|
|
|
remain = size - tmp_size;
|
|
|
|
|
|
|
|
/* If comm size is not a power of two, have the first "remain"
|
|
|
|
procs with an even rank send to rank + 1, leaving a power of
|
|
|
|
two procs to do the rest of the algorithm */
|
|
|
|
if (rank < 2 * remain) {
|
|
|
|
if ((rank & 1) == 0) {
|
2015-06-24 06:59:57 +03:00
|
|
|
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
|
2007-01-29 22:29:35 +03:00
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
MCA_PML_BASE_SEND_STANDARD,
|
|
|
|
comm));
|
|
|
|
if (OMPI_SUCCESS != err) goto cleanup;
|
|
|
|
|
|
|
|
/* we don't participate from here on out */
|
|
|
|
tmp_rank = -1;
|
|
|
|
} else {
|
|
|
|
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
comm, MPI_STATUS_IGNORE));
|
2012-04-04 19:11:03 +04:00
|
|
|
if (OMPI_SUCCESS != err) goto cleanup;
|
2007-01-29 22:29:35 +03:00
|
|
|
|
|
|
|
/* integrate their results into our temp results */
|
|
|
|
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
|
|
|
|
|
|
|
|
/* adjust rank to be the bottom "remain" ranks */
|
|
|
|
tmp_rank = rank / 2;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* just need to adjust rank to show that the bottom "even
|
|
|
|
remain" ranks dropped out */
|
|
|
|
tmp_rank = rank - remain;
|
2005-08-10 21:53:43 +04:00
|
|
|
}
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* For ranks not kicked out by the above code, perform the
|
|
|
|
recursive halving */
|
|
|
|
if (tmp_rank >= 0) {
|
|
|
|
int *tmp_disps = NULL, *tmp_rcounts = NULL;
|
|
|
|
int mask, send_index, recv_index, last_index;
|
2005-09-15 23:33:54 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* recalculate disps and rcounts to account for the
|
|
|
|
special "remainder" processes that are no longer doing
|
|
|
|
anything */
|
|
|
|
tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
|
|
|
|
if (NULL == tmp_rcounts) {
|
|
|
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
tmp_disps = (int*) malloc(tmp_size * sizeof(int));
|
|
|
|
if (NULL == tmp_disps) {
|
|
|
|
free(tmp_rcounts);
|
|
|
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2005-09-15 23:33:54 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
for (i = 0 ; i < tmp_size ; ++i) {
|
|
|
|
if (i < remain) {
|
|
|
|
/* need to include old neighbor as well */
|
|
|
|
tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
|
|
|
|
} else {
|
|
|
|
tmp_rcounts[i] = rcounts[i + remain];
|
|
|
|
}
|
|
|
|
}
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
tmp_disps[0] = 0;
|
|
|
|
for (i = 0; i < tmp_size - 1; ++i) {
|
|
|
|
tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
|
|
|
|
}
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* do the recursive halving communication. Don't use the
|
|
|
|
dimension information on the communicator because I
|
|
|
|
think the information is invalidated by our "shrinking"
|
|
|
|
of the communicator */
|
|
|
|
mask = tmp_size >> 1;
|
|
|
|
send_index = recv_index = 0;
|
|
|
|
last_index = tmp_size;
|
|
|
|
while (mask > 0) {
|
|
|
|
int tmp_peer, peer, send_count, recv_count;
|
|
|
|
struct ompi_request_t *request;
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
tmp_peer = tmp_rank ^ mask;
|
|
|
|
peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;
|
2004-06-29 04:02:25 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* figure out if we're sending, receiving, or both */
|
|
|
|
send_count = recv_count = 0;
|
|
|
|
if (tmp_rank < tmp_peer) {
|
|
|
|
send_index = recv_index + mask;
|
|
|
|
for (i = send_index ; i < last_index ; ++i) {
|
|
|
|
send_count += tmp_rcounts[i];
|
|
|
|
}
|
|
|
|
for (i = recv_index ; i < send_index ; ++i) {
|
|
|
|
recv_count += tmp_rcounts[i];
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
recv_index = send_index + mask;
|
|
|
|
for (i = send_index ; i < recv_index ; ++i) {
|
|
|
|
send_count += tmp_rcounts[i];
|
|
|
|
}
|
|
|
|
for (i = recv_index ; i < last_index ; ++i) {
|
|
|
|
recv_count += tmp_rcounts[i];
|
|
|
|
}
|
|
|
|
}
|
2004-06-29 04:02:25 +04:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
/* actual data transfer. Send from result_buf,
|
|
|
|
receive into recv_buf */
|
2013-10-28 23:06:38 +04:00
|
|
|
if (recv_count > 0) {
|
2007-01-29 22:29:35 +03:00
|
|
|
err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent,
|
|
|
|
recv_count, dtype, peer,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
comm, &request));
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
free(tmp_rcounts);
|
|
|
|
free(tmp_disps);
|
|
|
|
goto cleanup;
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
}
|
2013-10-28 23:06:38 +04:00
|
|
|
if (send_count > 0) {
|
2007-01-29 22:29:35 +03:00
|
|
|
err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent,
|
2015-06-24 06:59:57 +03:00
|
|
|
send_count, dtype, peer,
|
2007-01-29 22:29:35 +03:00
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
MCA_PML_BASE_SEND_STANDARD,
|
|
|
|
comm));
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
free(tmp_rcounts);
|
|
|
|
free(tmp_disps);
|
|
|
|
goto cleanup;
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
}
|
2013-10-28 23:06:38 +04:00
|
|
|
|
|
|
|
/* if we received something on this step, push it into
|
|
|
|
the results buffer */
|
|
|
|
if (recv_count > 0) {
|
2007-01-29 22:29:35 +03:00
|
|
|
err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
free(tmp_rcounts);
|
|
|
|
free(tmp_disps);
|
|
|
|
goto cleanup;
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
ompi_op_reduce(op,
|
|
|
|
recv_buf + tmp_disps[recv_index] * extent,
|
2007-01-29 22:29:35 +03:00
|
|
|
result_buf + tmp_disps[recv_index] * extent,
|
|
|
|
recv_count, dtype);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* update for next iteration */
|
|
|
|
send_index = recv_index;
|
|
|
|
last_index = recv_index + mask;
|
|
|
|
mask >>= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* copy local results from results buffer into real receive buffer */
|
|
|
|
if (0 != rcounts[rank]) {
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
|
2015-06-24 06:59:57 +03:00
|
|
|
rcounts[rank], dtype,
|
2007-01-29 22:29:35 +03:00
|
|
|
rbuf, rcounts[rank], dtype);
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
free(tmp_rcounts);
|
|
|
|
free(tmp_disps);
|
|
|
|
goto cleanup;
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
free(tmp_rcounts);
|
|
|
|
free(tmp_disps);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now fix up the non-power of two case, by having the odd
|
|
|
|
procs send the even procs the proper results */
|
|
|
|
if (rank < 2 * remain) {
|
|
|
|
if ((rank & 1) == 0) {
|
|
|
|
if (rcounts[rank]) {
|
|
|
|
err = MCA_PML_CALL(recv(rbuf, rcounts[rank], dtype, rank + 1,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
comm, MPI_STATUS_IGNORE));
|
|
|
|
if (OMPI_SUCCESS != err) goto cleanup;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (rcounts[rank - 1]) {
|
|
|
|
err = MCA_PML_CALL(send(result_buf + disps[rank - 1] * extent,
|
|
|
|
rcounts[rank - 1], dtype, rank - 1,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
MCA_PML_BASE_SEND_STANDARD,
|
|
|
|
comm));
|
|
|
|
if (OMPI_SUCCESS != err) goto cleanup;
|
|
|
|
}
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2007-01-29 22:29:35 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
if (0 == rank) {
|
|
|
|
/* temporary receive buffer. See coll_basic_reduce.c for
|
|
|
|
details on sizing */
|
|
|
|
recv_buf_free = (char*) malloc(buf_size);
|
2014-11-14 07:22:01 +03:00
|
|
|
recv_buf = recv_buf_free - true_lb;
|
2007-01-29 22:29:35 +03:00
|
|
|
if (NULL == recv_buf_free) {
|
|
|
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* reduction */
|
|
|
|
err =
|
|
|
|
comm->c_coll.coll_reduce(sbuf, recv_buf, count, dtype, op, 0,
|
2007-10-04 20:29:24 +04:00
|
|
|
comm, comm->c_coll.coll_reduce_module);
|
2007-01-29 22:29:35 +03:00
|
|
|
|
|
|
|
/* scatter */
|
|
|
|
if (MPI_SUCCESS == err) {
|
|
|
|
err = comm->c_coll.coll_scatterv(recv_buf, rcounts, disps, dtype,
|
|
|
|
rbuf, rcounts[rank], dtype, 0,
|
2007-10-04 20:29:24 +04:00
|
|
|
comm, comm->c_coll.coll_scatterv_module);
|
2007-01-29 22:29:35 +03:00
|
|
|
}
|
2005-08-10 14:51:42 +04:00
|
|
|
}
|
2004-01-12 00:26:55 +03:00
|
|
|
|
2007-01-29 22:29:35 +03:00
|
|
|
cleanup:
|
|
|
|
if (NULL != disps) free(disps);
|
|
|
|
if (NULL != recv_buf_free) free(recv_buf_free);
|
|
|
|
if (NULL != result_buf_free) free(result_buf_free);
|
|
|
|
|
2005-08-10 14:51:42 +04:00
|
|
|
return err;
|
2004-06-29 04:02:25 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* reduce_scatter_inter
|
|
|
|
*
|
|
|
|
* Function: - reduce/scatter operation
|
|
|
|
* Accepts: - same arguments as MPI_Reduce_scatter()
|
|
|
|
* Returns: - MPI_SUCCESS or error code
|
|
|
|
*/
|
2005-08-10 14:51:42 +04:00
|
|
|
int
|
|
|
|
mca_coll_basic_reduce_scatter_inter(void *sbuf, void *rbuf, int *rcounts,
|
2005-08-10 21:53:43 +04:00
|
|
|
struct ompi_datatype_t *dtype,
|
|
|
|
struct ompi_op_t *op,
|
2007-08-19 07:37:49 +04:00
|
|
|
struct ompi_communicator_t *comm,
|
2008-07-29 02:40:57 +04:00
|
|
|
mca_coll_base_module_t *module)
|
2004-06-29 04:02:25 +04:00
|
|
|
{
|
2008-10-09 18:35:20 +04:00
|
|
|
int err, i, rank, root = 0, rsize, lsize;
|
2008-10-10 18:35:52 +04:00
|
|
|
int totalcounts;
|
2006-10-18 00:20:58 +04:00
|
|
|
ptrdiff_t lb, extent;
|
2008-10-10 18:35:52 +04:00
|
|
|
char *tmpbuf = NULL, *tmpbuf2 = NULL;
|
2004-08-04 02:12:57 +04:00
|
|
|
ompi_request_t *req;
|
2008-10-09 18:35:20 +04:00
|
|
|
int *disps = NULL;
|
2004-08-04 02:12:57 +04:00
|
|
|
|
2005-08-10 14:51:42 +04:00
|
|
|
rank = ompi_comm_rank(comm);
|
|
|
|
rsize = ompi_comm_remote_size(comm);
|
2008-10-09 18:35:20 +04:00
|
|
|
lsize = ompi_comm_size(comm);
|
2004-08-04 02:12:57 +04:00
|
|
|
|
2008-10-09 18:35:20 +04:00
|
|
|
/* Figure out the total amount of data for the reduction. */
|
|
|
|
for (totalcounts = 0, i = 0; i < lsize; i++) {
|
2005-08-10 21:53:43 +04:00
|
|
|
totalcounts += rcounts[i];
|
2004-08-04 02:12:57 +04:00
|
|
|
}
|
|
|
|
|
2015-06-24 06:59:57 +03:00
|
|
|
/*
|
2008-10-09 18:35:20 +04:00
|
|
|
* The following code basically does an interreduce followed by a
|
|
|
|
* intrascatterv. This is implemented by having the roots of each
|
|
|
|
* group exchange their sbuf. Then, the roots receive the data
|
|
|
|
* from each of the remote ranks and execute the reduce. When
|
|
|
|
* this is complete, they have the reduced data available to them
|
|
|
|
* for doing the scatterv. They do this on the local communicator
|
|
|
|
* associated with the intercommunicator.
|
|
|
|
*
|
|
|
|
* Note: There are other ways to implement MPI_Reduce_scatter on
|
|
|
|
* intercommunicators. For example, one could do a MPI_Reduce locally,
|
|
|
|
* then send the results to the other root which could scatter it.
|
|
|
|
*
|
|
|
|
* Note: It is also worth pointing out that the rcounts argument
|
|
|
|
* represents how the data is going to be scatter locally. Therefore,
|
|
|
|
* its size is the same as the local communicator size.
|
|
|
|
*/
|
2005-08-10 14:51:42 +04:00
|
|
|
if (rank == root) {
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
err = ompi_datatype_get_extent(dtype, &lb, &extent);
|
2005-08-10 21:53:43 +04:00
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
2008-10-09 18:35:20 +04:00
|
|
|
/* Generate displacements for the scatterv part */
|
|
|
|
disps = (int*) malloc(sizeof(int) * lsize);
|
|
|
|
if (NULL == disps) {
|
2014-04-06 22:23:49 +04:00
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
2008-10-09 18:35:20 +04:00
|
|
|
}
|
|
|
|
disps[0] = 0;
|
|
|
|
for (i = 0; i < (lsize - 1); ++i) {
|
|
|
|
disps[i + 1] = disps[i] + rcounts[i];
|
|
|
|
}
|
|
|
|
|
2005-08-10 21:53:43 +04:00
|
|
|
tmpbuf = (char *) malloc(totalcounts * extent);
|
|
|
|
tmpbuf2 = (char *) malloc(totalcounts * extent);
|
|
|
|
if (NULL == tmpbuf || NULL == tmpbuf2) {
|
2014-04-06 22:23:49 +04:00
|
|
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto exit;
|
2005-08-10 21:53:43 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Do a send-recv between the two root procs. to avoid deadlock */
|
|
|
|
err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
MCA_PML_BASE_SEND_STANDARD, comm, &req));
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = MCA_PML_CALL(recv(tmpbuf2, totalcounts, dtype, 0,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
|
|
|
MPI_STATUS_IGNORE));
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
2014-11-04 06:16:30 +03:00
|
|
|
err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
|
2005-08-10 21:53:43 +04:00
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Loop receiving and calling reduction function (C or Fortran)
|
2015-06-24 06:59:57 +03:00
|
|
|
* The result of this reduction operations is then in
|
|
|
|
* tmpbuf2.
|
2005-08-10 21:53:43 +04:00
|
|
|
*/
|
|
|
|
for (i = 1; i < rsize; i++) {
|
|
|
|
err = MCA_PML_CALL(recv(tmpbuf, totalcounts, dtype, i,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
|
|
|
MPI_STATUS_IGNORE));
|
|
|
|
if (MPI_SUCCESS != err) {
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Perform the reduction */
|
|
|
|
ompi_op_reduce(op, tmpbuf, tmpbuf2, totalcounts, dtype);
|
|
|
|
}
|
2005-08-10 14:51:42 +04:00
|
|
|
} else {
|
2005-08-10 21:53:43 +04:00
|
|
|
/* If not root, send data to the root. */
|
|
|
|
err = MCA_PML_CALL(send(sbuf, totalcounts, dtype, root,
|
|
|
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
|
|
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
|
|
|
if (OMPI_SUCCESS != err) {
|
|
|
|
goto exit;
|
|
|
|
}
|
2004-08-04 02:12:57 +04:00
|
|
|
}
|
2005-08-10 14:51:42 +04:00
|
|
|
|
2008-10-09 18:35:20 +04:00
|
|
|
/* Now do a scatterv on the local communicator */
|
|
|
|
err = comm->c_local_comm->c_coll.coll_scatterv(tmpbuf2, rcounts, disps, dtype,
|
|
|
|
rbuf, rcounts[rank], dtype, 0,
|
|
|
|
comm->c_local_comm,
|
|
|
|
comm->c_local_comm->c_coll.coll_scatterv_module);
|
2004-08-04 02:12:57 +04:00
|
|
|
|
2005-08-10 14:51:42 +04:00
|
|
|
exit:
|
|
|
|
if (NULL != tmpbuf) {
|
2005-08-10 21:53:43 +04:00
|
|
|
free(tmpbuf);
|
2004-08-04 02:12:57 +04:00
|
|
|
}
|
|
|
|
|
2005-08-10 14:51:42 +04:00
|
|
|
if (NULL != tmpbuf2) {
|
2005-08-10 21:53:43 +04:00
|
|
|
free(tmpbuf2);
|
2004-08-04 02:12:57 +04:00
|
|
|
}
|
|
|
|
|
2008-10-09 18:35:20 +04:00
|
|
|
if (NULL != disps) {
|
|
|
|
free(disps);
|
|
|
|
}
|
|
|
|
|
2004-08-04 21:08:29 +04:00
|
|
|
return err;
|
2004-01-12 00:26:55 +03:00
|
|
|
}
|