coll: reduce_scatter_block: add butterfly algorithm
Implements a butterfly algorithm for MPI_Reduce_scatter_block. The algorithm can be used with both commutative and non-commutative operations, for power-of-two and non-power-of-two numbers of processes.

Signed-off-by: Mikhail Kurnosov <mkurnosov@gmail.com>
This commit is contained in:
parent 9fff40647d
commit 28d5837dd9
@@ -253,6 +253,7 @@ int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS);
int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS);
int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS);
int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS);

/* Scan */
int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
@@ -40,7 +40,6 @@
#include "coll_base_topo.h"
#include "coll_base_util.h"

/*
 * ompi_reduce_scatter_block_basic_linear
 *
@@ -511,3 +510,408 @@ cleanup_and_return:
    free(tmprecv_raw);
    return err;
}

/*
 * ompi_mirror_perm: Returns the mirror permutation of the nbits low-order
 *                   bits of x [*].
 * [*] Warren Jr., Henry S. Hacker's Delight (2nd ed.), 2013.
 *     Chapter 7. Rearranging Bits and Bytes.
 */
static unsigned int ompi_mirror_perm(unsigned int x, int nbits)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
    x = ((x >> 16) | (x << 16));
    return x >> (sizeof(x) * CHAR_BIT - nbits);
}

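For intuition, an editor's sketch (not part of this commit): a standalone harness around the same bit-reversal that prints the mperm() values quoted in the examples below; for nbits = 2 it yields mperm(0)=0, mperm(1)=2, mperm(2)=1, mperm(3)=3.

#include <limits.h>
#include <stdio.h>

/* Same bit-reversal as ompi_mirror_perm above */
static unsigned int mirror_perm(unsigned int x, int nbits)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
    x = ((x >> 16) | (x << 16));
    return x >> (sizeof(x) * CHAR_BIT - nbits);
}

int main(void)
{
    for (unsigned int i = 0; i < 4; i++)
        printf("mperm(%u) = %u\n", i, mirror_perm(i, 2));   /* prints 0 2 1 3 */
    return 0;
}
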
static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
    const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module);

/*
 * ompi_coll_base_reduce_scatter_block_intra_butterfly
 *
 * Function:  Butterfly algorithm for reduce_scatter_block
 * Accepts:   Same as MPI_Reduce_scatter_block
 * Returns:   MPI_SUCCESS or error code
 *
 * Description: Implements butterfly algorithm for MPI_Reduce_scatter_block [*].
 *              The algorithm can be used with both commutative and non-commutative
 *              operations, for power-of-two and non-power-of-two numbers of processes.
 *
 * [*] J. L. Traff. An Improved Algorithm for (Non-commutative) Reduce-Scatter
 *     with an Application. Proc. of EuroPVM/MPI, 2005, pp. 129-137.
 *
 * Time complexity:
 *   m\lambda + (\alpha + m\beta + m\gamma) +
 *   + 2\log_2(p)\alpha + 2m(1-1/p)\beta + m(1-1/p)\gamma +
 *   + 3(\alpha + m/p\beta) = O(m\lambda + \log_2(p)\alpha + m\beta + m\gamma),
 *   where m = rcount * comm_size, p = comm_size
 * Memory requirements (per process): 2 * rcount * comm_size * typesize
 *
 * Example: comm_size=6, nprocs_pof2=4, nprocs_rem=2, rcount=1, sbuf=[0,1,...,5]
 * Step 1. Reduce the number of processes to 4
 * rank 0: [0|1|2|3|4|5]: send to 1: vrank -1
 * rank 1: [0|1|2|3|4|5]: recv from 0, op: vrank 0: [0|2|4|6|8|10]
 * rank 2: [0|1|2|3|4|5]: send to 3: vrank -1
 * rank 3: [0|1|2|3|4|5]: recv from 2, op: vrank 1: [0|2|4|6|8|10]
 * rank 4: [0|1|2|3|4|5]: vrank 2: [0|1|2|3|4|5]
 * rank 5: [0|1|2|3|4|5]: vrank 3: [0|1|2|3|4|5]
 *
 * Step 2. Butterfly. Buffer of 6 elements is divided into 4 blocks.
 * Round 1 (mask=1, nblocks=2)
 * 0: vrank -1
 * 1: vrank 0 [0 2|4 6|8|10]: exch with 1: send [2,3], recv [0,1]: [0 4|8 12|*|*]
 * 2: vrank -1
 * 3: vrank 1 [0 2|4 6|8|10]: exch with 0: send [0,1], recv [2,3]: [**|**|16|20]
 * 4: vrank 2 [0 1|2 3|4|5] : exch with 3: send [2,3], recv [0,1]: [0 2|4 6|*|*]
 * 5: vrank 3 [0 1|2 3|4|5] : exch with 2: send [0,1], recv [2,3]: [**|**|8|10]
 *
 * Round 2 (mask=2, nblocks=1)
 * 0: vrank -1
 * 1: vrank 0 [0 4|8 12|*|*]: exch with 2: send [1], recv [0]: [0 6|**|*|*]
 * 2: vrank -1
 * 3: vrank 1 [**|**|16|20] : exch with 3: send [3], recv [2]: [**|**|24|*]
 * 4: vrank 2 [0 2|4 6|*|*] : exch with 0: send [0], recv [1]: [**|12 18|*|*]
 * 5: vrank 3 [**|**|8|10]  : exch with 1: send [2], recv [3]: [**|**|*|30]
 *
 * Step 3. Exchange with remote process according to a mirror permutation:
 *         mperm(0)=0, mperm(1)=2, mperm(2)=1, mperm(3)=3
 * 0: vrank -1: recv "0" from process 1
 * 1: vrank 0 [0 6|**|*|*]: send "0" to 0, copy "6" to rbuf (mperm(0)=0)
 * 2: vrank -1: recv result "12" from process 4
 * 3: vrank 1 [**|**|24|*]: send "24" to 4, recv "18" from 4 (mperm(1)=2)
 * 4: vrank 2 [**|12 18|*|*]: send "12" to 2, send "18" to 3, recv "24" from 3
 * 5: vrank 3 [**|**|*|30]: copy "30" to rbuf (mperm(3)=3)
 */
int
ompi_coll_base_reduce_scatter_block_intra_butterfly(
    const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module)
{
    char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
    ptrdiff_t span, gap, totalcount, extent;
    int err = MPI_SUCCESS;
    int comm_size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:reduce_scatter_block_intra_butterfly: rank %d/%d",
                 rank, comm_size));
    if (rcount == 0 || comm_size < 2)
        return MPI_SUCCESS;

    if (!(comm_size & (comm_size - 1))) {
        /* Special case: comm_size is a power of two */
        return ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
                   sbuf, rbuf, rcount, dtype, op, comm, module);
    }

    totalcount = comm_size * rcount;
    ompi_datatype_type_extent(dtype, &extent);
    span = opal_datatype_span(&dtype->super, totalcount, &gap);
    tmpbuf[0] = malloc(span);
    tmpbuf[1] = malloc(span);
    if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup_and_return;
    }
    psend = tmpbuf[0] - gap;
    precv = tmpbuf[1] - gap;

    if (sbuf != MPI_IN_PLACE) {
        err = ompi_datatype_copy_content_same_ddt(dtype, totalcount, psend, (char *)sbuf);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    } else {
        err = ompi_datatype_copy_content_same_ddt(dtype, totalcount, psend, rbuf);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    }

    /*
     * Step 1. Reduce the number of processes to the nearest lower power of two
     * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes.
     * Among the first 2r processes (ranks 0 to 2r - 1), all even ranks send
     * their input vector to their neighbor (rank + 1), and all odd ranks
     * receive it and perform a local reduction.
     * The odd ranks (1 to 2r - 1) then contain the reduction of their own
     * input vector with that of their even neighbor. The first r odd
     * processes and the last p - 2r processes are renumbered from
     * 0 to 2^{\floor{\log_2 p}} - 1. The even ranks do not participate in the
     * rest of the algorithm.
     */

    /* Find the nearest power of two less than or equal to comm_size */
    int nprocs_pof2 = opal_next_poweroftwo(comm_size);
    nprocs_pof2 >>= 1;
    int nprocs_rem = comm_size - nprocs_pof2;
    int log2_size = opal_cube_dim(nprocs_pof2);

    int vrank = -1;
    if (rank < 2 * nprocs_rem) {
        if ((rank % 2) == 0) {
            /* Even process */
            err = MCA_PML_CALL(send(psend, totalcount, dtype, rank + 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
            /* This process does not participate in the rest of the algorithm */
            vrank = -1;
        } else {
            /* Odd process */
            err = MCA_PML_CALL(recv(precv, totalcount, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                    comm, MPI_STATUS_IGNORE));
            if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
            ompi_op_reduce(op, precv, psend, totalcount, dtype);
            /* Adjust rank: this process becomes one of the bottom "remain" vranks */
            vrank = rank / 2;
        }
    } else {
        /* Adjust rank to account for the even "remain" ranks that dropped out */
        vrank = rank - nprocs_rem;
    }

    if (vrank != -1) {
        /*
         * Now the psend vector of rcount * comm_size elements is divided into
         * nprocs_pof2 blocks:
         * block 0 has 2*rcount elems (for processes 0 and 1)
         * block 1 has 2*rcount elems (for processes 2 and 3)
         * ...
         * block r-1 has 2*rcount elems (for processes 2*(r-1) and 2*(r-1)+1)
         * block r has rcount elems (for process r+r)
         * block r+1 has rcount elems (for process r+r+1)
         * ...
         * block nprocs_pof2 - 1 has rcount elems (for process r + nprocs_pof2 - 1)
         */
        int nblocks = nprocs_pof2, send_index = 0, recv_index = 0;
        for (int mask = 1; mask < nprocs_pof2; mask <<= 1) {
            int vpeer = vrank ^ mask;
            int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;

            nblocks /= 2;
            if ((vrank & mask) == 0) {
                /* Send the upper half of reduction buffer, recv the lower half */
                send_index += nblocks;
            } else {
                /* Send the lower half of reduction buffer, recv the upper half */
                recv_index += nblocks;
            }
            int send_count = rcount * ompi_range_sum(send_index,
                                                     send_index + nblocks - 1, nprocs_rem - 1);
            int recv_count = rcount * ompi_range_sum(recv_index,
                                                     recv_index + nblocks - 1, nprocs_rem - 1);
            ptrdiff_t sdispl = rcount * ((send_index <= nprocs_rem - 1) ?
                                         2 * send_index : nprocs_rem + send_index);
            ptrdiff_t rdispl = rcount * ((recv_index <= nprocs_rem - 1) ?
                                         2 * recv_index : nprocs_rem + recv_index);

            err = ompi_coll_base_sendrecv(psend + (ptrdiff_t)sdispl * extent, send_count,
                                          dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          precv + (ptrdiff_t)rdispl * extent, recv_count,
                                          dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          comm, MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }

            if (vrank < vpeer) {
                /* precv = psend <op> precv */
                ompi_op_reduce(op, psend + (ptrdiff_t)rdispl * extent,
                               precv + (ptrdiff_t)rdispl * extent, recv_count, dtype);
                char *p = psend;
                psend = precv;
                precv = p;
            } else {
                /* psend = precv <op> psend */
                ompi_op_reduce(op, precv + (ptrdiff_t)rdispl * extent,
                               psend + (ptrdiff_t)rdispl * extent, recv_count, dtype);
            }
            send_index = recv_index;
        }
        /*
         * psend points to the result: elements [send_index, send_index + recv_count - 1].
         * Exchange results with the remote process according to a mirror permutation.
         */
        int vpeer = ompi_mirror_perm(vrank, log2_size);
        int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;

        if (vpeer < nprocs_rem) {
            /*
             * This process holds two result blocks: one for the excluded
             * (even) process peer - 1 and one for the odd process peer.
             * Send the first block to the excluded process.
             */
            ptrdiff_t sdispl = rcount * ((send_index <= nprocs_rem - 1) ?
                                         2 * send_index : nprocs_rem + send_index);
            err = MCA_PML_CALL(send(psend + (ptrdiff_t)sdispl * extent,
                                    rcount, dtype, peer - 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        }

        /* Send the result to the remote process given by the mirror permutation */
        ptrdiff_t sdispl = rcount * ((send_index <= nprocs_rem - 1) ?
                                     2 * send_index : nprocs_rem + send_index);
        /* If this process holds two blocks, skip to the second one (for the odd peer) */
        if (vpeer < nprocs_rem)
            sdispl += rcount;
        if (vpeer != vrank) {
            err = ompi_coll_base_sendrecv(psend + (ptrdiff_t)sdispl * extent, rcount,
                                          dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          rbuf, rcount, dtype, peer,
                                          MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                          comm, MPI_STATUS_IGNORE, rank);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        } else {
            err = ompi_datatype_copy_content_same_ddt(dtype, rcount, rbuf,
                                                      psend + (ptrdiff_t)sdispl * extent);
            if (MPI_SUCCESS != err) { goto cleanup_and_return; }
        }

    } else {
        /* Excluded process: receive the result */
        int vpeer = ompi_mirror_perm((rank + 1) / 2, log2_size);
        int peer = (vpeer < nprocs_rem) ? vpeer * 2 + 1 : vpeer + nprocs_rem;
        err = MCA_PML_CALL(recv(rbuf, rcount, dtype, peer,
                                MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
    }

cleanup_and_return:
    if (tmpbuf[0])
        free(tmpbuf[0]);
    if (tmpbuf[1])
        free(tmpbuf[1]);
    return err;
}

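To make Step 1 concrete, here is an editor's dry run (not part of this commit) of the rank-to-vrank renumbering for the comm_size = 6 example in the comment above: even ranks among the first 2r drop out, and the survivors are renumbered 0..nprocs_pof2-1.

#include <stdio.h>

int main(void)
{
    const int comm_size = 6, nprocs_pof2 = 4;
    const int nprocs_rem = comm_size - nprocs_pof2;   /* r = 2 */
    for (int rank = 0; rank < comm_size; rank++) {
        int vrank;
        if (rank < 2 * nprocs_rem)
            vrank = (rank % 2 == 0) ? -1 : rank / 2;  /* even ranks drop out */
        else
            vrank = rank - nprocs_rem;
        if (vrank < 0) {
            printf("rank %d -> excluded\n", rank);
            continue;
        }
        /* inverse mapping, as used when choosing a peer to talk to */
        int back = (vrank < nprocs_rem) ? vrank * 2 + 1 : vrank + nprocs_rem;
        printf("rank %d -> vrank %d (vrank %d -> rank %d)\n",
               rank, vrank, vrank, back);
    }
    return 0;
}
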
/*
 * ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2
 *
 * Function:    Butterfly algorithm for reduce_scatter_block
 * Accepts:     Same as MPI_Reduce_scatter_block
 * Returns:     MPI_SUCCESS or error code
 * Limitations: Power-of-two number of processes.
 *
 * Description: Implements butterfly algorithm for MPI_Reduce_scatter_block [*].
 *              The algorithm can be used with both commutative and non-commutative
 *              operations, for a power-of-two number of processes.
 *
 * [*] J. L. Traff. An Improved Algorithm for (Non-commutative) Reduce-Scatter
 *     with an Application. Proc. of EuroPVM/MPI, 2005, pp. 129-137.
 *
 * Time complexity:
 *   m\lambda + 2\log_2(p)\alpha + 2m(1-1/p)\beta + m(1-1/p)\gamma + m/p\lambda =
 *   = O(m\lambda + \log_2(p)\alpha + m\beta + m\gamma),
 *   where m = rcount * comm_size, p = comm_size
 * Memory requirements (per process): 2 * rcount * comm_size * typesize
 *
 * Example: comm_size=4, rcount=1, sbuf=[0,1,2,3]
 * Step 1. Permute the blocks according to a mirror permutation:
 *         mperm(0)=0, mperm(1)=2, mperm(2)=1, mperm(3)=3
 *         sbuf=[0|1|2|3] ==> psend=[0|2|1|3]
 *
 * Step 2. Butterfly
 * Round 1 (mask=1, nblocks=2)
 * 0: [0|2|1|3]: exch with 1: send [2,3], recv [0,1]: [0|4|*|*]
 * 1: [0|2|1|3]: exch with 0: send [0,1], recv [2,3]: [*|*|2|6]
 * 2: [0|2|1|3]: exch with 3: send [2,3], recv [0,1]: [0|4|*|*]
 * 3: [0|2|1|3]: exch with 2: send [0,1], recv [2,3]: [*|*|2|6]
 *
 * Round 2 (mask=2, nblocks=1)
 * 0: [0|4|*|*]: exch with 2: send [1], recv [0]: [0|*|*|*]
 * 1: [*|*|2|6]: exch with 3: send [3], recv [2]: [*|*|4|*]
 * 2: [0|4|*|*]: exch with 0: send [0], recv [1]: [*|8|*|*]
 * 3: [*|*|2|6]: exch with 1: send [2], recv [3]: [*|*|*|12]
 *
 * Step 3. Copy the result to rbuf
 */
static int
ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
    const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype,
    struct ompi_op_t *op, struct ompi_communicator_t *comm,
    mca_coll_base_module_t *module)
{
    char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
    ptrdiff_t span, gap, totalcount, extent;
    int err = MPI_SUCCESS;
    int comm_size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    if (rcount == 0 || comm_size < 2)
        return MPI_SUCCESS;

    totalcount = comm_size * rcount;
    ompi_datatype_type_extent(dtype, &extent);
    span = opal_datatype_span(&dtype->super, totalcount, &gap);
    tmpbuf[0] = malloc(span);
    tmpbuf[1] = malloc(span);
    if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup_and_return;
    }
    psend = tmpbuf[0] - gap;
    precv = tmpbuf[1] - gap;

    /* Permute the blocks according to a mirror permutation */
    int log2_comm_size = opal_cube_dim(comm_size);
    char *pdata = (sbuf != MPI_IN_PLACE) ? (char *)sbuf : rbuf;
    for (int i = 0; i < comm_size; i++) {
        char *src = pdata + (ptrdiff_t)i * extent * rcount;
        char *dst = psend + (ptrdiff_t)ompi_mirror_perm(i, log2_comm_size) * extent * rcount;
        err = ompi_datatype_copy_content_same_ddt(dtype, rcount, dst, src);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }
    }

    int nblocks = totalcount, send_index = 0, recv_index = 0;
    for (int mask = 1; mask < comm_size; mask <<= 1) {
        int peer = rank ^ mask;
        nblocks /= 2;

        if ((rank & mask) == 0) {
            /* Send the upper half of reduction buffer, recv the lower half */
            send_index += nblocks;
        } else {
            /* Send the lower half of reduction buffer, recv the upper half */
            recv_index += nblocks;
        }
        err = ompi_coll_base_sendrecv(psend + (ptrdiff_t)send_index * extent,
                                      nblocks, dtype, peer,
                                      MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                      precv + (ptrdiff_t)recv_index * extent,
                                      nblocks, dtype, peer,
                                      MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { goto cleanup_and_return; }

        if (rank < peer) {
            /* precv = psend <op> precv */
            ompi_op_reduce(op, psend + (ptrdiff_t)recv_index * extent,
                           precv + (ptrdiff_t)recv_index * extent, nblocks, dtype);
            char *p = psend;
            psend = precv;
            precv = p;
        } else {
            /* psend = precv <op> psend */
            ompi_op_reduce(op, precv + (ptrdiff_t)recv_index * extent,
                           psend + (ptrdiff_t)recv_index * extent, nblocks, dtype);
        }
        send_index = recv_index;
    }
    /* Copy the result to rbuf */
    err = ompi_datatype_copy_content_same_ddt(dtype, rcount, rbuf,
                                              psend + (ptrdiff_t)recv_index * extent);
    if (MPI_SUCCESS != err) { goto cleanup_and_return; }

cleanup_and_return:
    if (tmpbuf[0])
        free(tmpbuf[0]);
    if (tmpbuf[1])
        free(tmpbuf[1]);
    return err;
}

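The interplay between the initial mirror permutation and the send_index/recv_index bookkeeping is the crux of this variant. Below is an editor's dry run (not part of the commit; rcount = 1, so blocks and elements coincide) that traces the loop for comm_size = 4 and confirms that the slot each rank finally holds contains exactly its own block, i.e. mperm(final slot) == rank.

#include <stdio.h>

/* Bit-reversal of the nbits low-order bits, as in ompi_mirror_perm */
static unsigned int mirror_perm(unsigned int x, int nbits)
{
    unsigned int r = 0;
    for (int i = 0; i < nbits; i++)
        r |= ((x >> i) & 1u) << (nbits - 1 - i);
    return r;
}

int main(void)
{
    const int p = 4, log2p = 2;
    for (int rank = 0; rank < p; rank++) {
        int nblocks = p, send_index = 0, recv_index = 0;
        for (int mask = 1; mask < p; mask <<= 1) {
            nblocks /= 2;
            if ((rank & mask) == 0)
                send_index += nblocks;   /* send upper half, keep lower */
            else
                recv_index += nblocks;   /* send lower half, keep upper */
            send_index = recv_index;
        }
        printf("rank %d keeps slot %d, which holds block %u\n",
               rank, recv_index, mirror_perm(recv_index, log2p));
    }
    return 0;
}
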
@@ -36,6 +36,7 @@ static mca_base_var_enum_value_t reduce_scatter_block_algorithms[] = {
    {1, "basic_linear"},
    {2, "recursive_doubling"},
    {3, "recursive_halving"},
    {4, "butterfly"},
    {0, NULL}
};
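With the value registered above, and assuming the tuned component's usual forced-selection mechanism, the new algorithm can be requested at run time with something like:

mpirun --mca coll_tuned_use_dynamic_rules 1 \
       --mca coll_tuned_reduce_scatter_block_algorithm 4 ./app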
@@ -75,7 +76,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_for
     mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                     "reduce_scatter_block_algorithm",
                                     "Which reduce_scatter_block algorithm is used. "
-                                    "Can be locked down to choice of: 0 ignore, 1 basic_linear, 2 recursive_doubling",
+                                    "Can be locked down to choice of: 0 ignore, 1 basic_linear, 2 recursive_doubling, "
+                                    "3 recursive_halving, 4 butterfly",
                                     MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                     OPAL_INFO_LVL_5,
                                     MCA_BASE_VAR_SCOPE_ALL,
@@ -128,6 +130,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
                                                                   dtype, op, comm, module);
    case (3): return ompi_coll_base_reduce_scatter_block_intra_recursivehalving(sbuf, rbuf, rcount,
                                                                                dtype, op, comm, module);
    case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm,
                                                                         module);
    } /* switch */
    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTERBLOCK]));
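For completeness, a minimal caller (an editor's sketch, not part of this commit): each rank contributes comm_size blocks of rcount elements, and rank i receives the elementwise sum of everyone's block i.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int rcount = 2;                       /* block size per destination */
    int *sbuf = malloc(size * rcount * sizeof(int));
    int *rbuf = malloc(rcount * sizeof(int));
    for (int i = 0; i < size * rcount; i++)
        sbuf[i] = i;                            /* same vector on every rank */

    MPI_Reduce_scatter_block(sbuf, rbuf, rcount, MPI_INT, MPI_SUM,
                             MPI_COMM_WORLD);

    /* expect rbuf[j] == size * (rank * rcount + j) */
    printf("rank %d: rbuf = [%d, %d]\n", rank, rbuf[0], rbuf[1]);

    free(sbuf);
    free(rbuf);
    MPI_Finalize();
    return 0;
}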